In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import lit

# Câu 1:

In [None]:
# Obtain the notebook-wide SparkSession (created on first use, reused afterwards).
session_builder = SparkSession.builder.appName("PopularMovies")
spark = session_builder.getOrCreate()

In [None]:
def parseInput(line):
    """Parse one comma-separated text record into a Row.

    `line` is an object whose `.value` attribute holds the raw text
    "userID,itemID,action". The first two fields are converted to int and
    the third to float; any further fields are ignored.
    """
    parts = line.value.split(',')
    user_id = int(parts[0])
    item_id = int(parts[1])
    action_value = float(parts[2])
    return Row(userID=user_id, itemID=item_id, action=action_value)

# Read the interaction file as a single-column text DataFrame and expose it as an RDD.
raw_text_df = spark.read.text("/content/tuongtac.data")
lines = raw_text_df.rdd

# Parse every record into a Row(userID, itemID, action) and cache the result,
# since the parsed RDD is reused by later cells.
parsed_rows = lines.map(parseInput)
ratingsRDD = parsed_rows.cache()
ratingsRDD.take(10)

[Row(userID=204, itemID=230, action=16.0),
 Row(userID=317, itemID=140, action=12.0),
 Row(userID=922, itemID=63, action=13.0),
 Row(userID=223, itemID=74, action=7.0),
 Row(userID=575, itemID=397, action=18.0),
 Row(userID=261, itemID=203, action=7.0),
 Row(userID=278, itemID=387, action=14.0),
 Row(userID=989, itemID=52, action=4.0),
 Row(userID=720, itemID=89, action=7.0),
 Row(userID=921, itemID=80, action=5.0)]

In [None]:
# Build a DataFrame from the parsed Row RDD and preview its first 10 rows.
itemDataset = spark.createDataFrame(ratingsRDD)
itemDataset.take(10)

[Row(userID=204, itemID=230, action=16.0),
 Row(userID=317, itemID=140, action=12.0),
 Row(userID=922, itemID=63, action=13.0),
 Row(userID=223, itemID=74, action=7.0),
 Row(userID=575, itemID=397, action=18.0),
 Row(userID=261, itemID=203, action=7.0),
 Row(userID=278, itemID=387, action=14.0),
 Row(userID=989, itemID=52, action=4.0),
 Row(userID=720, itemID=89, action=7.0),
 Row(userID=921, itemID=80, action=5.0)]

In [None]:
# 1.4
# Total `action` per user; the aggregated column is named `sum(action)`.
sumAction = itemDataset.groupBy('userID').sum('action')
sumAction.take(10)

[Row(userID=26, sum(action)=0.0),
 Row(userID=964, sum(action)=17.0),
 Row(userID=29, sum(action)=6.0),
 Row(userID=474, sum(action)=7.0),
 Row(userID=65, sum(action)=28.0),
 Row(userID=541, sum(action)=10.0),
 Row(userID=558, sum(action)=16.0),
 Row(userID=418, sum(action)=16.0),
 Row(userID=222, sum(action)=18.0),
 Row(userID=270, sum(action)=17.0)]

In [None]:
# 1.5
# Mean `action` per user; the aggregated column is named `avg(action)`.
avgAction = itemDataset.groupBy('userID').avg('action')
avgAction.take(10)

[Row(userID=26, avg(action)=0.0),
 Row(userID=964, avg(action)=17.0),
 Row(userID=29, avg(action)=6.0),
 Row(userID=474, avg(action)=7.0),
 Row(userID=65, avg(action)=9.333333333333334),
 Row(userID=541, avg(action)=10.0),
 Row(userID=558, avg(action)=16.0),
 Row(userID=418, avg(action)=8.0),
 Row(userID=222, avg(action)=18.0),
 Row(userID=270, avg(action)=17.0)]

In [None]:
# 1.6
# Combine the per-user totals and averages on userID so that each row carries
# both `sum(action)` and `avg(action)`.
joinTable = sumAction.join(avgAction, 'userID')
joinTable.take(10)

[Row(userID=26, sum(action)=0.0, avg(action)=0.0),
 Row(userID=964, sum(action)=17.0, avg(action)=17.0),
 Row(userID=29, sum(action)=6.0, avg(action)=6.0),
 Row(userID=474, sum(action)=7.0, avg(action)=7.0),
 Row(userID=65, sum(action)=28.0, avg(action)=9.333333333333334),
 Row(userID=541, sum(action)=10.0, avg(action)=10.0),
 Row(userID=558, sum(action)=16.0, avg(action)=16.0),
 Row(userID=418, sum(action)=16.0, avg(action)=8.0),
 Row(userID=222, sum(action)=18.0, avg(action)=18.0),
 Row(userID=270, sum(action)=17.0, avg(action)=17.0)]

In [None]:
# ALS
# Create an ALS collaborative filtering model from the complete data set
# 1.7
datas = spark.createDataFrame(ratingsRDD)

# Fix the seed so the randomly initialised factors -- and therefore the
# predictions produced by the fitted model -- are the same on every
# Restart & Run All instead of varying from one kernel session to the next.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userID",
    itemCol="itemID",
    ratingCol="action",
    seed=42,
)
model = als.fit(datas)

In [None]:
# 1.8: Print every (item, action) pair recorded for user 720.
print("\nRatings for user ID 720:")
userActions = datas.filter("userID = 720")
for record in userActions.collect():
    item_id, action_value = record['itemID'], record['action']
    print("ItemID: ", item_id, ", Action:", action_value)


Ratings for user ID 720:
ItemID:  89 , Action: 7.0
ItemID:  370 , Action: 4.0
ItemID:  296 , Action: 18.0


In [None]:
# DataFrame.show() prints the table and returns None, so bind the filtered
# DataFrame first and display it in a separate step; `userRatings` then stays
# usable instead of being None.
userRatings = datas.filter("userID = 720")
userRatings.show()

+------+------+------+
|userID|itemID|action|
+------+------+------+
|   720|    89|   7.0|
|   720|   370|   4.0|
|   720|   296|  18.0|
+------+------+------+



In [None]:
# 1.9
# Count the interaction rows per item and keep only items that occur more than twice.
ratingCounts = datas.groupBy("itemID").count().filter("count > 2")
ratingCounts.take(10)

[Row(itemID=29, count=3),
 Row(itemID=26, count=4),
 Row(itemID=474, count=3),
 Row(itemID=65, count=4),
 Row(itemID=418, count=5),
 Row(itemID=222, count=3),
 Row(itemID=287, count=6),
 Row(itemID=112, count=4),
 Row(itemID=167, count=3),
 Row(itemID=385, count=4)]

In [None]:
# 1.10.
# Keep just the itemID of each retained item and attach a constant userID
# column set to 0, giving (itemID, userID=0) candidate rows.
popularMovies = ratingCounts.select("itemID").withColumn('userID', lit(0))
popularMovies.take(10)

[Row(itemID=29, userID=0),
 Row(itemID=26, userID=0),
 Row(itemID=474, userID=0),
 Row(itemID=65, userID=0),
 Row(itemID=418, userID=0),
 Row(itemID=222, userID=0),
 Row(itemID=287, userID=0),
 Row(itemID=112, userID=0),
 Row(itemID=167, userID=0),
 Row(itemID=385, userID=0)]

In [None]:
# Run the fitted model on the candidate rows for userID = 0; transform() adds
# a `prediction` column to each (itemID, userID) row.
# NOTE(review): take(10) shows the first 10 rows as returned -- nothing here
# sorts by `prediction`, so these are not the 10 highest-scored items.
recommendations = model.transform(popularMovies)
recommendations.take(10)

[Row(itemID=29, userID=0, prediction=6.444790840148926),
 Row(itemID=26, userID=0, prediction=-1.6820874214172363),
 Row(itemID=474, userID=0, prediction=2.8173065185546875),
 Row(itemID=65, userID=0, prediction=6.120847702026367),
 Row(itemID=418, userID=0, prediction=14.271123886108398),
 Row(itemID=222, userID=0, prediction=3.6594159603118896),
 Row(itemID=287, userID=0, prediction=-10.544295310974121),
 Row(itemID=112, userID=0, prediction=5.220533847808838),
 Row(itemID=167, userID=0, prediction=-3.1557960510253906),
 Row(itemID=385, userID=0, prediction=-12.831786155700684)]

# Câu 2:

In [None]:
from pyspark import SparkConf, SparkContext

conf = SparkConf()

# A SparkContext already exists once a SparkSession has been created in this
# kernel, and constructing a second one raises
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate() returns the running context when there is one and only builds
# a new context from `conf` otherwise, so this cell works on a full re-run.
sc = SparkContext.getOrCreate(conf)

In [None]:
# Load the interaction file as an RDD of text lines.
# Note: this rebinds `lines`, which earlier held the RDD from spark.read.text.
lines = sc.textFile("/content/tuongtac.data")

In [None]:
import numpy as np
# Length of every per-user profile vector (one slot per item).
n_movies = 1000


def parser(line):
    """Convert one "userID,itemID,action" text record into (userID, vector).

    The vector is a list of `n_movies` integers that is zero everywhere
    except at index ``itemID - 1``, which holds the integer action value.
    Fields after the third are ignored.

    NOTE(review): an itemID of 0 yields index -1 and therefore writes to the
    LAST slot of the vector instead of failing; an itemID greater than
    `n_movies` raises IndexError. Confirm that the item IDs in the data are
    1-based and do not exceed `n_movies`.
    """
    fields = line.split(',')
    slot = int(fields[1]) - 1
    profile = [0 for _ in range(n_movies)]
    profile[slot] = int(fields[2])
    user_id = int(fields[0])
    return user_id, profile

In [None]:
# Map each text line to a (userID, vector) pair via parser.
user_profiles = lines.map(parser)

In [None]:
def reducer(u1, u2):
    """Return the element-wise sum of two equal-length profile vectors.

    Both arguments may be plain lists or NumPy arrays. The result is a plain
    Python list of native numbers: `ndarray.tolist()` is used instead of
    `list(ndarray)` because the latter produces NumPy scalar elements, which
    are heavier to serialise and print as `np.int64(...)` on NumPy 2.
    """
    # asarray avoids a copy when an argument is already an ndarray.
    return (np.asarray(u1) + np.asarray(u2)).tolist()

In [None]:
# Merge all vectors that share a userID into a single profile using reducer,
# then show the type of the resulting object.
user_p = user_profiles.reduceByKey(reducer)
print(type(user_p))

<class 'pyspark.rdd.PipelinedRDD'>


In [None]:
# Bring every (userID, profile) pair to the driver and print each one in full.
result = user_p.collect()
for entry in result:
    user_id, profile = entry
    print(f"User ID: {user_id}, Profile: {profile}")

User ID: 204, Profile: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10

In [None]:
# Stack the profile vectors (user IDs are dropped) into a 2-D array, one row
# per collected profile, and report its shape.
profile_vectors = [vector for _user, vector in user_p.collect()]
X = np.array(profile_vectors)
print(X.shape)

(613, 1000)


In [None]:
# Inspect the profile vector stored in the third row of X.
X[2]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [None]:
# Small constant added to the denominator so an all-zero vector does not cause
# a division by zero.
epsilon = 1e-8


def sim(v1, v2):
    """Cosine similarity of two vectors, with `epsilon` added to the denominator."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / (norm_product + epsilon)


# Reference profile: row 111 of X.
vo = X[111]

# Similarity of the reference profile to every row of X (including itself).
d = np.array([sim(vo, row) for row in X])

# Row indices ordered by ascending similarity; the last five are the closest.
idx = np.argsort(d)
d[idx[-5:]]

array([0.0993377 , 0.34570536, 0.71358095, 0.93834312, 1.        ])

In [None]:
# Collect items the reference user (vo) has not interacted with but that one
# of the four most similar other rows scored above 3.
# idx[-5:-1] skips the very last entry of idx, which is the row with the
# highest similarity (the reference row itself when its similarity is 1).
recommended = []
for j in idx[-5:-1]:
    most_similar_user = X[j]
    for i, r in enumerate(most_similar_user):
        if vo[i] == 0 and r > 3:
            # parser stores item k at vector index k - 1, so convert the
            # 0-based index back to the item ID before reporting it.
            # NOTE(review): if the data contains item ID 0, parser wraps it to
            # the last index, which would be reported here as len(vo).
            item_id = i + 1
            # Several neighbours may share an item; list each item once.
            if item_id not in recommended:
                recommended.append(item_id)

print(recommended)

[400, 224]
