In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [2]:
# Load the ratings and the movie metadata tables from CSV files in the working directory.
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Movies

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# Ratings

In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Tiền xử lý dữ liệu

In [5]:
# Drop the timestamp column; nothing later in the notebook reads it.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
ratings = ratings.drop(columns="timestamp", errors="ignore")
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
ratings.shape

(100836, 3)

## Loại bỏ những bộ phim có ít hơn 10 lượt đánh giá

In [7]:
# Number of ratings each movie received (value_counts lists the most-rated first).
movie_counts = ratings['movieId'].value_counts()
movie_counts

movieId
356       329
318       317
296       307
593       279
2571      278
         ... 
160341      1
160527      1
160836      1
163937      1
135534      1
Name: count, Length: 9724, dtype: int64

In [8]:

# Ids of the movies that have at least 10 ratings.
valid_movies = movie_counts[movie_counts >= 10].index
valid_movies

Index([   356,    318,    296,    593,   2571,    260,    480,    110,    589,
          527,
       ...
          258,   1290,   5621,    918,   2380,   4167,  50794,   4255,   1147,
       120466],
      dtype='int64', name='movieId', length=2269)

In [9]:
# Keep only the ratings of those sufficiently-rated movies.
ratings = ratings[ratings['movieId'].isin(valid_movies)]
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100818,610,159093,3.0
100829,610,164179,5.0
100830,610,166528,4.0
100833,610,168250,5.0


## Loại bỏ những user đánh giá ít hơn 50 bộ phim

In [10]:
# Number of ratings per user, counted on the ratings that survived the movie filter.
user_counts = ratings['userId'].value_counts()
user_counts

userId
414    1634
599    1368
68     1085
474    1077
448    1017
       ... 
499      15
598      14
324      13
578      11
175       7
Name: count, Length: 610, dtype: int64

In [11]:
# Ids of the users with at least 50 remaining ratings.
valid_users = user_counts[user_counts >= 50].index
valid_users

Index([414, 599,  68, 474, 448, 274, 380, 288, 249, 610,
       ...
       583, 591, 571,  36, 512, 262, 170, 457, 303,  88],
      dtype='int64', name='userId', length=367)

In [12]:
# Keep only the ratings made by those active users.
ratings = ratings[ratings['userId'].isin(valid_users)]
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100818,610,159093,3.0
100829,610,164179,5.0
100830,610,166528,4.0
100833,610,168250,5.0


# Xây dựng ma trận và chuẩn hóa ma trận

In [13]:
# Build the utility matrix: one row per userId, one column per movieId, each cell
# holding the rating (NaN where the user has not rated the movie).
# Despite the variable name, rows are users and columns are movies.
# pivot raises ValueError if a (userId, movieId) pair appears more than once.
movie_user_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
movie_user_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,5.0,4.0,4.0,,3.0,4.0,,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,4.0,3.5,,,,,,,,,...,,,,,,,,,,
606,2.5,,,,,2.5,,,2.5,,...,,,,,,,,,,
607,4.0,,,,,,,,3.0,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,4.0,,,...,,,,,,,,,,


In [14]:
movie_user_matrix.shape

(367, 2269)

In [15]:
# Mean rating of each movie (column-wise; NaN cells are skipped by mean()).
movie_mean = movie_user_matrix.mean(axis=0)
movie_mean

movieId
1         3.877193
2         3.441176
3         3.261905
5         2.971429
6         3.928571
            ...   
174055    3.545455
176371    4.000000
177765    3.500000
179819    3.045455
187593    4.000000
Length: 2269, dtype: float64

In [16]:
# Centre every rating by subtracting its movie's mean; unrated cells stay NaN.
normalized_utility_matrix_NaN = movie_user_matrix.sub(movie_mean, axis=1)
normalized_utility_matrix_NaN

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.122807,,0.738095,,0.071429,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
6,,0.558824,1.738095,2.028571,0.071429,0.837838,,-0.453704,0.346774,,...,,,,,,,,,,
7,0.622807,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.122807,0.058824,,,,,,,,,...,,,,,,,,,,
606,-1.377193,,,,,-0.662162,,,-1.153226,,...,,,,,,,,,,
607,0.122807,,,,,,,,-0.653226,,...,,,,,,,,,,
608,-1.377193,-1.441176,-1.261905,,,,,0.546296,,,...,,,,,,,,,,


In [17]:
# Replace the NaN (unrated) cells with 0 so the matrix can be fed to cosine_similarity.
# Caveat: a rating exactly equal to its movie's mean is also stored as 0, so the
# later `!= 0` checks cannot tell it apart from "not rated".
normalized_utility_matrix = normalized_utility_matrix_NaN.fillna(0)
normalized_utility_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.122807,0.000000,0.738095,0.000000,0.071429,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.558824,1.738095,2.028571,0.071429,0.837838,0.0,-0.453704,0.346774,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
7,0.622807,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.122807,0.058824,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
606,-1.377193,0.000000,0.000000,0.000000,0.000000,-0.662162,0.0,0.000000,-1.153226,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
607,0.122807,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-0.653226,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
608,-1.377193,-1.441176,-1.261905,0.000000,0.000000,0.000000,0.0,0.546296,0.000000,0.0,...,0.0,0.0000,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0


# Tính độ tương đồng giữa các movie

## cosine_similarity

In [18]:
# Item-item cosine similarity: the matrix is transposed so that each row handed
# to cosine_similarity is one movie's vector of centred ratings across users.
movie_similarity_matrix = cosine_similarity(normalized_utility_matrix.T)
# Re-attach the movieId labels on both axes so similarities can be looked up by id.
movie_similarity_matrix = pd.DataFrame(movie_similarity_matrix, index=normalized_utility_matrix.columns, columns=normalized_utility_matrix.columns)
print("\nMa trận tương đồng giữa các phim:\n")
movie_similarity_matrix


Ma trận tương đồng giữa các phim:



movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.167275,0.130949,0.084027,0.018362,0.056997,0.047789,-0.009744,0.062203,0.015188,...,0.089254,0.046515,0.069315,-0.010038,0.111425,0.155751,0.066175,0.158377,0.047386,0.032165
2,0.167275,1.000000,0.216603,0.147101,0.068316,0.077306,0.054114,0.026197,0.083102,0.098712,...,0.094227,0.060997,0.006910,0.023345,0.115684,0.052861,-0.013204,0.088050,0.027162,-0.041760
3,0.130949,0.216603,1.000000,0.261988,0.210166,0.199515,0.237909,-0.029575,0.104594,0.115187,...,0.000000,0.040850,0.009233,0.037576,0.066147,0.011031,0.117797,0.000000,0.029319,0.000000
5,0.084027,0.147101,0.261988,1.000000,0.059832,0.282232,0.106066,0.052459,0.105601,0.036904,...,0.022867,-0.014435,-0.014941,-0.026973,0.027201,-0.070276,-0.031268,0.000000,-0.047447,0.000000
6,0.018362,0.068316,0.210166,0.059832,1.000000,0.000198,-0.021505,0.198709,0.006525,0.035438,...,0.000000,0.033279,-0.009969,0.056050,0.071839,-0.044447,-0.057372,0.000000,-0.096951,-0.075775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.155751,0.052861,0.011031,-0.070276,-0.044447,-0.024935,-0.063367,-0.021796,0.058215,-0.022056,...,0.217457,-0.022856,0.086302,0.000876,-0.016366,1.000000,0.381279,0.333301,0.424290,0.231677
176371,0.066175,-0.013204,0.117797,-0.031268,-0.057372,0.045088,0.090221,-0.044156,0.078777,0.031403,...,-0.528017,-0.218765,-0.102396,-0.110153,-0.046603,0.381279,1.000000,-0.255883,-0.133791,0.074125
177765,0.158377,0.088050,0.000000,0.000000,0.000000,0.000000,0.000000,-0.006524,0.059173,0.000000,...,0.625305,0.398973,0.341809,0.299171,0.070826,0.333301,-0.255883,1.000000,0.498594,0.144841
179819,0.047386,0.027162,0.029319,-0.047447,-0.096951,-0.007397,0.000000,-0.105857,0.043733,-0.181946,...,0.495078,0.191915,0.215467,0.161237,0.079556,0.424290,-0.133791,0.498594,1.000000,0.346226


# Cập nhật các bảng

In [19]:
def update_normalized_utility_matrix(ratings):
    """Build the mean-centred user x movie matrix from a long ratings table.

    Parameters
    ----------
    ratings : DataFrame with ``userId``, ``movieId`` and ``rating`` columns,
        at most one row per (userId, movieId) pair.

    Returns
    -------
    (centred, column_means)
        ``centred`` has one row per user and one column per movie; each rating
        has its movie's mean subtracted and unrated cells are filled with 0.
        ``column_means`` is the per-movie mean rating, indexed by movieId.
    """
    wide = ratings.pivot(values='rating', index='userId', columns='movieId')
    column_means = wide.mean()
    # DataFrame - Series aligns the Series on the columns, i.e. one mean per movie.
    centred = (wide - column_means).fillna(0)
    return centred, column_means


In [20]:
def update_similarity(normalized_utility_matrix):
    """Return the movie-to-movie cosine similarity as a labelled DataFrame.

    The input has users on the rows and movies on the columns; it is transposed
    so that each movie is one vector. Both axes of the result carry the input's
    column labels (the movieIds).
    """
    movie_ids = normalized_utility_matrix.columns
    return pd.DataFrame(
        cosine_similarity(normalized_utility_matrix.T),
        index=movie_ids,
        columns=movie_ids,
    )


# Xây dựng dự đoán

In [21]:
def pre(userId, movieId, k, min_rating=1, max_rating=5):
    """Predict the rating ``userId`` would give ``movieId`` (item-based kNN).

    Reads the notebook-level ``normalized_utility_matrix`` (users x movies,
    mean-centred, 0 = unrated), ``movie_similarity_matrix`` (movies x movies)
    and ``movie_mean`` (per-movie mean rating).

    Parameters
    ----------
    userId : label of a row in ``normalized_utility_matrix``.
    movieId : label of the movie to predict.
    k : number of most-similar rated movies to use; coerced to int so that a
        float coming from a UI slider works.
    min_rating, max_rating : bounds the prediction is clamped to. The defaults
        keep the previous hard-coded 1..5 range.
        NOTE(review): the data shows half-star ratings (2.5, 4.5); if the scale
        starts at 0.5, pass ``min_rating=0.5`` - confirm against the dataset.

    Returns
    -------
    The clamped predicted rating, or ``None`` when no prediction can be made:
    unknown movie, unknown user, user with no rated movies, or all selected
    similarities equal to zero.
    """
    if movieId not in movie_similarity_matrix.index:
        return None
    # An unknown user (e.g. absent from the training split) used to raise
    # KeyError here; treat it like every other "cannot predict" case.
    if userId not in normalized_utility_matrix.index:
        return None

    # Movies already rated by the user: unrated cells were filled with 0.
    user_deviations = normalized_utility_matrix.loc[userId]
    rated_ids = user_deviations[user_deviations != 0].index
    if rated_ids.empty:
        return None

    # The k rated movies most similar to the target movie.
    neighbours = (
        movie_similarity_matrix.loc[movieId, rated_ids]
        .sort_values(ascending=False)
        .head(int(k))
    )
    neighbour_deviations = user_deviations.loc[neighbours.index]

    similarity_sum = neighbours.abs().sum()
    if similarity_sum == 0:
        return None
    weighted_sum = (neighbours * neighbour_deviations).sum()

    # Weighted average of the user's deviations, shifted back by the movie mean.
    predicted_rating = movie_mean[movieId] + weighted_sum / similarity_sum
    return min(max_rating, max(min_rating, predicted_rating))
    
# Example: predicted rating of user 102 for movie 2, using up to 10 neighbour movies.
predict = pre(102, 2, 10)
predict

np.float64(3.6454155386309077)

In [22]:
# recommend
def recommend_from_prec(userId, k):
    """Rank every movie the user has not rated by its predicted rating.

    Uses the notebook-level ``normalized_utility_matrix`` and ``movies`` tables
    and calls ``pre`` once per candidate movie with neighbourhood size ``k``.

    Returns a Series of predicted ratings indexed by movie title, sorted from
    the highest prediction to the lowest. Movies for which ``pre`` returns
    ``None`` are left out.
    """
    user_row = normalized_utility_matrix.loc[userId]
    # A non-zero centred rating marks a movie the user has already rated.
    already_rated = user_row[user_row != 0].index

    scored = {}
    for candidate in normalized_utility_matrix.columns:
        if candidate not in already_rated:
            score = pre(userId, candidate, k)
            if score is not None:
                scored[candidate] = score

    # Sort by predicted rating, highest first.
    ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
    table = pd.DataFrame(ranked, columns=['movieId', 'predicted_rating'])
    # Attach the titles; the merge keeps the ranking order of the left table.
    table = table.merge(movies[['movieId', 'title']], on='movieId')
    return pd.Series(data=table['predicted_rating'].values, index=table['title'])
    
# Example: the 10 best recommendations for user 4, with k=10 neighbours per prediction.
recommend = recommend_from_prec(4, 10)[:10]
recommend

title
Wallace & Gromit: The Best of Aardman Animation (1996)                                                  5.000000
Secrets & Lies (1996)                                                                                   4.899698
All About Eve (1950)                                                                                    4.875327
Hustler, The (1961)                                                                                     4.821757
In the Heat of the Night (1967)                                                                         4.802012
Seven Pounds (2008)                                                                                     4.798988
Streetcar Named Desire, A (1951)                                                                        4.779041
Bicycle Thieves (a.k.a. The Bicycle Thief) (a.k.a. The Bicycle Thieves) (Ladri di biciclette) (1948)    4.760399
Iron Giant, The (1999)                                                                    

# Xây dựng hệ thống

In [23]:
import gradio as gr
import pandas as pd

# User ids offered in the UI dropdown: the row labels of the utility matrix as
# it is bound when this cell runs (a later cell rebinds that name).
valid_user_ids = sorted(list(normalized_utility_matrix.index)) 

def get_recommendations(user_id: int, k: int, n: int):
    """Gradio callback: return the top ``n`` recommendations for ``user_id``.

    Parameters
    ----------
    user_id : user to recommend for (a row label of the utility matrix).
    k : neighbourhood size forwarded to ``recommend_from_prec``.
    n : number of rows to return.

    ``k`` and ``n`` are coerced to ``int`` because slider widgets may deliver
    floats, and ``Series.head`` rejects a float count.

    Returns a DataFrame with a ``Title`` column and a ``Score`` column holding
    the predicted rating formatted to two decimals.
    """
    k, n = int(k), int(n)
    recs_series = recommend_from_prec(user_id, k).head(n)
    df = recs_series.reset_index()
    df.columns = ["Title", "Score"]
    df["Score"] = df["Score"].map(lambda x: f"{x:.2f}")
    return df

# Gradio UI: a user dropdown and two sliders feed get_recommendations, whose
# result is shown in a two-column table.
# Note: `user_id`, `k`, `n`, `btn` and `output` become notebook-level names
# bound to Gradio components (a `with` block does not create a new scope).
with gr.Blocks() as demo:
    gr.Markdown("## Movie Recommender 🎬")
    with gr.Row():
        user_id = gr.Dropdown(choices=valid_user_ids, label="User ID", value=valid_user_ids[0])
        # Neighbourhood size passed to the predictor.
        k = gr.Slider(1, 50, value=10, step=1, label="Số hàng xóm (k)")
        # Number of recommendations to display.
        n = gr.Slider(1, 50, value=10, step=1, label="Số phim đề xuất")
    btn = gr.Button("Lấy đề xuất")
    output = gr.DataFrame(headers=["Title", "Score"], label="Top Recommendations")
    # Run the callback with the three inputs when the button is clicked.
    btn.click(fn=get_recommendations, inputs=[user_id, k, n], outputs=output)

if __name__ == "__main__":
    demo.launch()

* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.


# Đánh giá mô hình 

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Hold out 20% of the individual ratings for evaluation (fixed seed for reproducibility).
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Rebuild the centred utility matrix, the movie means and the similarity matrix
# from the training ratings only.
normalized_utility_matrix_train, movie_mean_train = update_normalized_utility_matrix(train_data)
movie_similarity_matrix_train = update_similarity(normalized_utility_matrix_train)

# pre() and recommend_from_prec() read these notebook-level names, so they are
# rebound to the training versions here.
# NOTE(review): from this point on every caller (including the Gradio app
# defined earlier) sees the train-only matrices; re-running earlier cells
# restores the full-data versions.
normalized_utility_matrix= normalized_utility_matrix_train
movie_mean= movie_mean_train
movie_similarity_matrix= movie_similarity_matrix_train

## RMSE

In [25]:
def predict_all(test_data, k):
    """Predict every held-out rating and pair it with the true value.

    Parameters
    ----------
    test_data : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    k : neighbourhood size forwarded to ``pre``.

    Returns
    -------
    (y_true, y_pred) : two equally long lists. A row is skipped when ``pre``
    returns ``None`` or NaN, or raises ``KeyError`` (a user or movie missing
    from the matrices ``pre`` reads). Any other exception now propagates; the
    previous bare ``except`` also swallowed KeyboardInterrupt and real bugs.
    """
    y_true = []
    y_pred = []

    for row in test_data.itertuples(index=False):
        try:
            pred = pre(row.userId, row.movieId, k)
        except KeyError:
            # No data for this user/movie in the training matrices.
            continue
        if pred is not None and not np.isnan(pred):
            y_true.append(row.rating)
            y_pred.append(pred)

    return y_true, y_pred

# Predict the held-out ratings with k=10 neighbours; rows without a usable
# prediction are left out by predict_all.
y_true, y_pred = predict_all(test_data, k=10)

# Root-mean-squared error between the true and the predicted ratings.
rmse = sqrt(mean_squared_error(y_true, y_pred))
print("RMSE:", rmse)


RMSE: 0.8360517822021303


## Precision@K và Recall@K

In [26]:
def get_liked_movies(userId, test_data, threshold):
    """Return the movieIds that ``userId`` rated at or above ``threshold``.

    Only rows of ``test_data`` are considered; the ids are returned as a list
    in the order in which they appear in ``test_data``.
    """
    is_liked = (test_data['userId'] == userId) & (test_data['rating'] >= threshold)
    return test_data.loc[is_liked, 'movieId'].tolist()

In [27]:
def precision_at_k(userId, k, test_data, threshold):
    """Precision@k of the recommendations for one user.

    Parameters
    ----------
    userId : user to evaluate.
    k : used both as the neighbourhood size passed to ``recommend_from_prec``
        and as the length of the recommendation list.
    test_data : held-out ratings (``userId``, ``movieId``, ``rating``).
    threshold : minimum held-out rating for a movie to count as liked.

    Returns
    -------
    Number of top-k recommended titles the user liked, divided by ``k``;
    ``None`` when there are no recommendations.

    ``recommend_from_prec`` only exposes titles, and a title can belong to
    several movieIds. Mapping titles back to ids before truncating (as was done
    previously) duplicated rows, which pushed real recommendations out of the
    top k and added ids that were never recommended. The liked movieIds are
    therefore mapped to titles instead (movieId -> title is unambiguous) and
    the comparison is done on titles.
    """
    liked_movies = set(get_liked_movies(userId, test_data, threshold))

    # The recommendations are sorted best-first; keep the k best titles.
    top_titles = set(recommend_from_prec(userId, k).head(k).index)
    if not top_titles:
        return None

    liked_titles = set(movies.loc[movies['movieId'].isin(list(liked_movies)), 'title'])
    precision = len(top_titles & liked_titles) / k
    return precision

In [28]:
def recall_at_k(userId, k, test_data, threshold):
    """Recall@k of the recommendations for one user.

    Parameters
    ----------
    userId : user to evaluate.
    k : used both as the neighbourhood size passed to ``recommend_from_prec``
        and as the length of the recommendation list.
    test_data : held-out ratings (``userId``, ``movieId``, ``rating``).
    threshold : minimum held-out rating for a movie to count as liked.

    Returns
    -------
    Number of top-k recommended titles the user liked, divided by the number
    of liked movies; ``None`` when the user has no liked movie in ``test_data``.

    ``recommend_from_prec`` only exposes titles, and a title can belong to
    several movieIds. Mapping titles back to ids before truncating (as was done
    previously) duplicated rows, which pushed real recommendations out of the
    top k. The liked movieIds are therefore mapped to titles instead
    (movieId -> title is unambiguous) and the comparison is done on titles.
    """
    liked_movies = set(get_liked_movies(userId, test_data, threshold))
    if not liked_movies:
        return None

    # The recommendations are sorted best-first; keep the k best titles.
    top_titles = set(recommend_from_prec(userId, k).head(k).index)
    liked_titles = set(movies.loc[movies['movieId'].isin(list(liked_movies)), 'title'])

    recall = len(top_titles & liked_titles) / len(liked_movies)
    return recall

In [29]:
def evaluate_model(k, test_data, threshold):
    """Average Precision@k and Recall@k over the users present in ``test_data``.

    For each distinct ``userId`` both metrics are computed; a metric that comes
    back as ``None`` for a user is excluded from that metric's average. An
    average over no users is reported as 0.

    Returns ``(avg_precision, avg_recall)``.
    """
    def _average(values):
        return sum(values) / len(values) if values else 0

    per_user = [
        (
            precision_at_k(uid, k, test_data, threshold),
            recall_at_k(uid, k, test_data, threshold),
        )
        for uid in test_data['userId'].unique()
    ]
    precisions = [p for p, _ in per_user if p is not None]
    recalls = [r for _, r in per_user if r is not None]

    return _average(precisions), _average(recalls)

In [30]:
# Evaluation settings: k is passed to evaluate_model, where it serves as both
# the neighbourhood size and the length of the recommendation list; a held-out
# rating of at least `threshold` counts as "liked".
k = 10
threshold = 3.5
avg_precision, avg_recall = evaluate_model(k, test_data, threshold)
print(f'Average Precision@{k}: {avg_precision:.4f}')
print(f'Average Recall@{k}: {avg_recall:.4f}')

Average Precision@10: 0.0302
Average Recall@10: 0.0168
