# Collaborative Filtering
[Tài liệu 1](https://www.kaggle.com/code/mehmetisik/user-based-collaborative-filtering/notebook)

[Tài liệu 2](https://www.kaggle.com/code/fatihakkaya/user-based-recommendation-system)

# 0. Preparation of Data Set

In [27]:
import requests
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from scipy import stats
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [28]:
# @markdown ---
# @markdown ### Enter a file path:
file_ratings_path = "data/movies/ratings.csv" # @param {type:"string"}
file_movies_path = "data/movies/movies.csv" # @param {type:"string"}
# @markdown ---

# 1. Đọc và phân chia dữ liệu

In [29]:
def missing_values_analysis(data):
    na_columns = [col for col in data.columns if data[col].isnull().sum() > 0]
    n_miss = data[na_columns].isnull().sum().sort_values(ascending=True)
    ratio = (data[na_columns].isnull().sum() / data.shape[0] * 100).sort_values(ascending=True)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df)
    return missing_df

In [30]:
def check_df(data, row_num=5, col_num=10):
    print("*************** Dataset Shape ***************")
    print("No. of Rows:", data.shape[0], "\nNo. of Columns:", data.shape[1])
    print("*************** Dataset Information ***************")
    print(data.info())
    print("*************** Types of Columns ***************")
    print(data.dtypes)
    print(f"*************** First {row_num} Rows ***************")
    print(data.iloc[:row_num,:col_num])
    print(f"*************** Last {row_num} Rows ***************")
    print(data.iloc[-row_num:,:col_num])
    print("*************** Summary Statistics of The Dataset ***************")
    print(data.describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T)
    print("*************** Dataset Missing Values Analysis ***************")
    print(missing_values_analysis(data))

In [31]:
movie = pd.read_csv(file_movies_path)
movie = movie[["movieId", "title", "genres"]]
rating = pd.read_csv(file_ratings_path)
df = movie.merge(rating, how="left", on="movieId")
df_sorted = df.sort_values(by="movieId", ascending=True)
print(df_sorted)

        movieId                                              title  \
0             1                                   Toy Story (1995)   
157           1                                   Toy Story (1995)   
158           1                                   Toy Story (1995)   
159           1                                   Toy Story (1995)   
160           1                                   Toy Story (1995)   
...         ...                                                ...   
100059   162672                                Mohenjo Daro (2016)   
100060   163056                               Shin Godzilla (2016)   
100061   163949  The Beatles: Eight Days a Week - The Touring Y...   
100062   164977                           The Gay Desperado (1936)   
100063   164979                              Women of '69, Unboxed   

                                             genres  userId  rating  \
0       Adventure|Animation|Children|Comedy|Fantasy     0.0     0.0   
157     Adventure

In [32]:
check_df(df)

*************** Dataset Shape ***************
No. of Rows: 100064 
No. of Columns: 6
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100064 entries, 0 to 100063
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100064 non-null  int64  
 1   title      100064 non-null  object 
 2   genres     100064 non-null  object 
 3   userId     100005 non-null  float64
 4   rating     100005 non-null  float64
 5   timestamp  100005 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 4.6+ MB
None
*************** Types of Columns ***************
movieId        int64
title         object
genres        object
userId       float64
rating       float64
timestamp    float64
dtype: object
*************** First 5 Rows ***************
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animati

In [33]:
print(df.columns)

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')


In [34]:
def create_user_movie_df(dataframe):
    # comment
    comment_counts = pd.DataFrame(dataframe["title"].value_counts())
    print(comment_counts.head(20))

    # rare movies
    rare_movies = comment_counts[comment_counts["count"] <= 1000].index
    print(rare_movies)

    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df

user_movie_df = create_user_movie_df(df)

                                                    count
title                                                    
Forrest Gump (1994)                                   341
Pulp Fiction (1994)                                   324
Shawshank Redemption, The (1994)                      311
Silence of the Lambs, The (1991)                      304
Star Wars: Episode IV - A New Hope (1977)             291
Jurassic Park (1993)                                  274
Matrix, The (1999)                                    259
Toy Story (1995)                                      248
Schindler's List (1993)                               244
Terminator 2: Judgment Day (1991)                     237
Star Wars: Episode V - The Empire Strikes Back ...    234
Braveheart (1995)                                     228
Back to the Future (1985)                             226
Fargo (1996)                                          224
American Beauty (1999)                                220
Raiders of the

In [35]:
user_movie_df.head()

title
userId


# 2. Code recommender movie

In [36]:
def user_based_recommender(random_user, user_movie_df, ratio=60, cor_th=0.65, score=3.5):
    random_user_df = user_movie_df[user_movie_df.index == random_user]
    # Chọn phim mà người dùng đã xem
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist() # select watched and rated movie by random_user
    print(f"Số lượng bộ phim mà người dung ID: [{random_user}] đã xem là: {len(movies_watched)}")

    # ===== Truy cập dữ liệu và id của những người dùng khác đang xem cùng một bộ phim ===== #
    movies_watched_df = user_movie_df[movies_watched]
    user_movie_count = movies_watched_df.T.notnull().sum()
    user_movie_count = user_movie_count.reset_index()
    user_movie_count.columns = ["userId", "movie_count"]
    perc = len(movies_watched) * ratio / 100
    # ===== Chọn những người dùng khác đã xem cùng một bộ phim ===== #
    users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"] #select other users who watched same movies
    # print(users_same_movies)


    # ===== Xác định người dùng có hành vi giống nhất với người dùng để được đề xuất ===== #
    # Để làm điều này, chúng tôi sẽ thực hiện 3 bước:
    #   Chúng tôi sẽ thu thập dữ liệu của Người dùng của chúng tôi và những người dùng khác.
    #   Chúng ta sẽ tạo mối tương quan df.
    #   Chúng tôi sẽ tìm những người dùng tương tự nhất (Top Users)

    final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                          random_user_df[movies_watched]])
    # print(final_df.head())
    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()
    # print(corr_df.head())

    top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= cor_th)][
        ["user_id_2", "corr"]].reset_index(drop=True) # select similar users have correlation over cor_th on random_user
    top_users = top_users.sort_values(by='corr', ascending=False)
    top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
    rating = pd.read_csv(file_ratings_path)
    top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
    # ===== Tính điểm đề xuất trung bình có trọng số ===== #
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating'] # calculate rating*corr score
    # print(top_users_ratings.head())
    # print(top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"}).head())

    # ===== Thuật toán đề xuất film ===== #
    recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index()
    # print(recommendation_df)
    # print(recommendation_df[recommendation_df["weighted_rating"] > 3.5])
    movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > score].sort_values("weighted_rating", ascending=False)
    movie = pd.read_csv(file_movies_path)
    return movies_to_be_recommend.merge(movie[["movieId", "title"]])

In [37]:
def ShowAnswer(UserID = 0, cor_th = 0.70, score = 4):
  print("=============== Thông tin ===============")
  print("                                         ")
  print(f"Users ID: \t {UserID}")
  print(f"Score : \t {score}")
  print(f"cor_th: \t {cor_th}")
  print("                                         ")
  print("=========== Phim được đề xuất ===========")
  print("                                         ")
  user_based_recommender(UserID, user_movie_df, cor_th=cor_th, score=score)

In [42]:
# @markdown ---
# @markdown ### Nhập dữ liệu:
random_user = 1231 # @param {type:"number"}
score = 4 # @param {type:"slider", min:1, max:5, step:0.5}
cor_th = 0.9 # @param {type:"slider", min:0.5, max:1, step:0.1}
if random_user == 0:
  random_user = int(pd.Series(user_movie_df.index).sample(1).values)
# @markdown ---

ShowAnswer(UserID=1, cor_th=cor_th, score=score)

                                         
Users ID: 	 1
Score : 	 4
cor_th: 	 0.9
                                         
                                         
Số lượng bộ phim mà người dung ID: [1] đã xem là: 0
