In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [74]:
ratings = pd.read_csv("data/ratings.csv")
movies = pd.read_csv("data/movies.csv")

# Movies

In [77]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# Ratings

In [80]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Tiền xử lý dữ liệu

In [83]:
ratings = ratings.drop("timestamp", axis = 1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [85]:
ratings.shape

(100836, 3)

## Loại bỏ những bộ phim được số lượng người dùng đánh giá <10

In [88]:
movie_counts = ratings['movieId'].value_counts()
movie_counts

movieId
356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: count, Length: 9724, dtype: int64

In [90]:
movie_counts = ratings['movieId'].value_counts()
valid_movies = movie_counts[movie_counts >= 10].index
valid_movies

Index([  356,   318,   296,   593,  2571,   260,   480,   110,   589,   527,
       ...
         728,  8947,  7137, 79057,  2142,  1096,  1541,  4103,   818,  2111],
      dtype='int64', name='movieId', length=2269)

In [92]:
ratings = ratings[ratings['movieId'].isin(valid_movies)]
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100818,610,159093,3.0
100829,610,164179,5.0
100830,610,166528,4.0
100833,610,168250,5.0


## loại bỏ những user đánh giá ít hơn 50 bộ phim

In [95]:
user_counts = ratings['userId'].value_counts()
user_counts

userId
414    1634
599    1368
68     1085
474    1077
448    1017
       ... 
320      15
598      14
324      13
578      11
175       7
Name: count, Length: 610, dtype: int64

In [97]:
valid_users = user_counts[user_counts >= 50]
valid_users

userId
414    1634
599    1368
68     1085
474    1077
448    1017
       ... 
170      50
303      50
262      50
124      50
583      50
Name: count, Length: 367, dtype: int64

In [99]:
ratings = ratings[ratings['userId'].isin(valid_users)]
ratings

Unnamed: 0,userId,movieId,rating
7112,50,1,3.0
7113,50,32,3.0
7114,50,111,4.0
7115,50,165,3.0
7116,50,296,4.0
...,...,...,...
95063,599,166528,3.0
95066,599,168250,3.0
95067,599,168252,3.5
95087,599,174055,4.0


# Xây dựng ma trận và chuẩn hóa ma trận

In [102]:
movie_user_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
movie_user_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,3.0,,,,,,,,,,...,,,,,,2.5,3.0,,,
51,,4.5,4.0,,,4.0,,,,,...,,,,,,,,,,
52,,,,,,,,,,,...,,,,,,,,,,
53,,,,,,,,,,,...,,,,,,,,,,
54,3.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,,,,,,,,,,,...,,,,,,,,,,
563,,2.5,,,,,,,,,...,,,,,,,,,,
572,4.0,,,,,,,,,,...,,,,,,,,,,
574,,,,,,,,,,,...,,,,,,,,,,


In [104]:
movie_user_matrix.shape

(188, 2265)

In [106]:
movie_mean = movie_user_matrix.mean(axis=0)
movie_mean

movieId
1         3.828125
2         3.363636
3         3.125000
5         3.117647
6         3.833333
            ...   
174055    3.571429
176371    3.500000
177765    3.142857
179819    3.200000
187593    3.700000
Length: 2265, dtype: float64

In [108]:
normalized_utility_matrix_NaN = movie_user_matrix.sub(movie_mean, axis=1)
normalized_utility_matrix_NaN

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,-0.828125,,,,,,,,,,...,,,,,,-1.071429,-0.5,,,
51,,1.136364,0.875,,,0.527778,,,,,...,,,,,,,,,,
52,,,,,,,,,,,...,,,,,,,,,,
53,,,,,,,,,,,...,,,,,,,,,,
54,-0.828125,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,,,,,,,,,,,...,,,,,,,,,,
563,,-0.863636,,,,,,,,,...,,,,,,,,,,
572,0.171875,,,,,,,,,,...,,,,,,,,,,
574,,,,,,,,,,,...,,,,,,,,,,


In [110]:
normalized_utility_matrix = normalized_utility_matrix_NaN.fillna(0)
normalized_utility_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,-0.828125,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,-1.071429,-0.5,0.0,0.0,0.0
51,0.000000,1.136364,0.875,0.0,0.000000,0.527778,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
52,0.000000,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
53,0.000000,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
54,-0.828125,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,0.000000,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
563,0.000000,-0.863636,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
572,0.171875,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
574,0.000000,0.000000,0.000,0.0,0.000000,0.000000,0.000,0.000000,0.000000,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


# Tính độ tương đồng giữa các movie

## cosine_similarity

In [114]:
movie_similarity_matrix = cosine_similarity(normalized_utility_matrix.T)
movie_similarity_matrix = pd.DataFrame(movie_similarity_matrix, index=normalized_utility_matrix.columns, columns=normalized_utility_matrix.columns)
print("\nMa trận tương đồng giữa các phim:\n")
movie_similarity_matrix


Ma trận tương đồng giữa các phim:



movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.117245,0.126847,0.071642,-0.086312,0.164430,0.204015,-0.055644,0.086883,-0.019198,...,0.214094,-0.009735,0.059480,0.138477,0.071269,0.219286,0.035487,0.207199,0.222777,0.040650
2,0.117245,1.000000,0.238685,0.265092,0.078097,0.199709,0.101280,0.005392,0.097740,0.126791,...,0.225189,-0.008639,0.026394,0.125370,0.206148,0.087417,-0.011664,0.108980,0.183939,0.010842
3,0.126847,0.238685,1.000000,0.148002,0.323483,0.273151,0.213859,-0.100185,0.136812,0.004831,...,0.000000,0.062543,-0.070395,0.261468,0.140215,0.034134,0.074666,0.000000,0.068210,0.000000
5,0.071642,0.265092,0.148002,1.000000,0.043507,0.456090,0.000000,0.017563,-0.062851,0.088690,...,0.049120,-0.164029,0.125280,0.000000,0.064694,-0.137168,-0.132879,0.000000,-0.121391,0.000000
6,-0.086312,0.078097,0.323483,0.043507,1.000000,-0.043389,-0.129662,0.139946,-0.203621,-0.125468,...,0.000000,-0.008999,0.054986,-0.109971,0.028395,-0.042143,-0.043741,0.000000,-0.054304,0.012907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.219286,0.087417,0.034134,-0.137168,-0.042143,-0.104209,-0.103275,-0.042460,0.105346,-0.052597,...,0.501939,0.077651,-0.364966,-0.054745,-0.028270,1.000000,0.409686,0.291346,0.709317,0.000000
176371,0.035487,-0.011664,0.074666,-0.132879,-0.043741,-0.062406,0.000000,-0.040066,0.142397,0.000000,...,0.000000,0.231455,-0.294628,0.117851,0.030429,0.409686,1.000000,0.217213,0.184466,0.562505
177765,0.207199,0.108980,0.000000,0.000000,0.000000,0.000000,0.000000,-0.021303,0.097121,0.000000,...,0.373906,0.329073,0.000000,0.207450,0.000000,0.291346,0.217213,1.000000,0.539400,0.305289
179819,0.222777,0.183939,0.068210,-0.121391,-0.054304,-0.057010,0.000000,-0.037274,0.130087,0.000000,...,0.683394,0.248041,-0.161494,-0.024845,0.006415,0.709317,0.184466,0.539400,1.000000,-0.016524


# Cập nhật bảng các bảng

In [117]:
def update_normalized_utility_matrix(ratings):
    movie_user_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    movie_mean = movie_user_matrix.mean(axis=0)
    normalized_utility_matrix = movie_user_matrix.sub(movie_mean, axis=1).fillna(0)
    return normalized_utility_matrix, movie_mean


In [119]:
def update_similarity(normalized_utility_matrix):
    movie_similarity = cosine_similarity(normalized_utility_matrix.T)
    movie_similarity = pd.DataFrame(movie_similarity, index=normalized_utility_matrix.columns, columns=normalized_utility_matrix.columns)
    return movie_similarity


# Xây dựng dự đoán

In [122]:
def pre(userId, movieId, k):
    if movieId not in movie_similarity_matrix.index:
        return None
    # tìm những movie đã được rating bởi userId
    rated_movies = normalized_utility_matrix.loc[userId]
    movie_ids = rated_movies[rated_movies != 0].index
    if movie_ids.empty:
        return None
    # lấy k giá trị tương đồng.
    sim_movies = movie_similarity_matrix.loc[movieId, movie_ids] 
    k_sim_movies = sim_movies.sort_values(ascending=False).head(k)
    
    k_normal_ratings_movieIds = normalized_utility_matrix.loc[userId, k_sim_movies.index]

    similarity_sum = 0 
    weighted_sum = 0 
    for idx in k_sim_movies.index:
        similarity_sum += abs(k_sim_movies[idx])
        weighted_sum += k_sim_movies[idx] * k_normal_ratings_movieIds[idx]
        
    if similarity_sum == 0:
        return None
    predicted_rating = weighted_sum / similarity_sum
    predicted_rating += movie_mean[movieId]
    
    predicted_rating = min(5, max(1, predicted_rating))
    return predicted_rating
    
predict = pre(102, 2, 10)
predict

3.3671116068713745

In [124]:
# recommend
def recommend_from_prec(userId, k):
    rated_movies = normalized_utility_matrix.loc[userId]
    rated_movies = rated_movies[rated_movies != 0].index  
    all_movies = normalized_utility_matrix.columns
    predicted_ratings = {}
    for movieId in all_movies:
        if movieId in rated_movies:
            continue
        rating = pre(userId, movieId, k)
        if rating is not None:
            predicted_ratings[movieId] = rating

    # Sắp xếp theo rating giảm dần
    top_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
    recommend_df = pd.DataFrame(top_recommendations, columns=['movieId', 'predicted_rating'])
    recommend_df = recommend_df.merge(movies[['movieId', 'title']], on='movieId')
    recommend_series = pd.Series(data=recommend_df['predicted_rating'].values, index=recommend_df['title'])
    return top_recommendations
    
recommend = recommend_from_prec(102, 10)[: 10]
recommend

[(1136, 4.905368200104073),
 (4499, 4.900181389678435),
 (1208, 4.889648774806541),
 (720, 4.863504339203047),
 (1927, 4.851099490757929),
 (1212, 4.846135811406548),
 (116897, 4.786086262671265),
 (1276, 4.78595216913144),
 (2329, 4.7831730585047625),
 (3275, 4.7691973616426955)]

# Đánh giá mô hình 

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

normalized_utility_matrix_train, movie_mean_train = update_normalized_utility_matrix(train_data)
movie_similarity_matrix_train = update_similarity(normalized_utility_matrix_train)

normalized_utility_matrix= normalized_utility_matrix_train
movie_mean= movie_mean_train
movie_similarity_matrix= movie_similarity_matrix_train

In [127]:
def predict_all(test_data, k):
    y_true = []
    y_pred = []

    for _, row in test_data.iterrows():
        userId = row['userId']
        movieId = row['movieId']
        true_rating = row['rating']
        try:
            pred = pre(userId, movieId, k)
            if pred is not None and not np.isnan(pred):
                y_true.append(true_rating)
                y_pred.append(pred)
        except:
            continue

    return y_true, y_pred

y_true, y_pred = predict_all(test_data, k=10)

rmse = sqrt(mean_squared_error(y_true, y_pred))
print("RMSE:", rmse)


RMSE: 0.8554230077626882


## Precision@K và Recall@K

In [131]:
def get_liked_movies(userId, test_data, threshold):
    user_test_data = test_data[test_data['userId'] == userId]
    liked_movies = user_test_data[user_test_data['rating'] >= threshold]['movieId'].tolist()
    return liked_movies

In [132]:
def precision_at_k(userId, k, test_data, threshold):
    liked_movies = set(get_liked_movies(userId, test_data, threshold))
    recommended_movies = set([movieId for movieId, _ in recommend_from_prec(userId, k)[:k]])
    
    if not recommended_movies:
        return None

    precision = len(recommended_movies & liked_movies) / k
    return precision

In [133]:
def recall_at_k(userId, k, test_data, threshold):
    liked_movies = set(get_liked_movies(userId, test_data, threshold))
    
    if not liked_movies:
        return None
    
    recommended_movies = set([movieId for movieId, _ in recommend_from_prec(userId, k)[:k]])
    
    recall = len(recommended_movies & liked_movies) / len(liked_movies)
    return recall

In [134]:
def evaluate_model(k, test_data, threshold):
    precisions = []
    recalls = []
    
    for userId in test_data['userId'].unique():
        precision = precision_at_k(userId, k, test_data, threshold)
        recall = recall_at_k(userId, k, test_data, threshold)
        
        if precision is not None:
            precisions.append(precision)
        if recall is not None:
            recalls.append(recall)
    
    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0
    
    return avg_precision, avg_recall

In [135]:
k = 10
threshold = 3.5
avg_precision, avg_recall = evaluate_model(k, test_data, threshold)
print(f'Average Precision@{k}: {avg_precision:.4f}')
print(f'Average Recall@{k}: {avg_recall:.4f}')

Average Precision@10: 0.0144
Average Recall@10: 0.0069
