In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
file_ratings_path = "data/movies/ratings.csv"
rating = pd.read_csv(file_ratings_path)
df=rating

In [3]:
def missing_values_analysis(data):
    na_columns = [col for col in data.columns if data[col].isnull().sum() > 0]
    n_miss = data[na_columns].isnull().sum().sort_values(ascending=True)
    ratio = (data[na_columns].isnull().sum() / data.shape[0] * 100).sort_values(ascending=True)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df)
    return missing_df

In [4]:
def check_df(data, row_num=5, col_num=10):
    print("*************** Dataset Shape ***************")
    print("No. of Rows:", data.shape[0], "\nNo. of Columns:", data.shape[1])
    print("*************** Dataset Information ***************")
    print(data.info())
    print("*************** Types of Columns ***************")
    print(data.dtypes)
    print(f"*************** First {row_num} Rows ***************")
    print(data.iloc[:row_num,:col_num])
    print(f"*************** Last {row_num} Rows ***************")
    print(data.iloc[-row_num:,:col_num])
    print("*************** Summary Statistics of The Dataset ***************")
    print(data.describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T)
    print("*************** Dataset Missing Values Analysis ***************")
    print(missing_values_analysis(data))


In [5]:
print(df)

         userId  movieId  rating               tstamp
0           206     4803     4.0  2003-04-07 13:52:01
1          5073    72731     4.0  2020-02-19 16:07:53
2          4739    91653     4.0  2020-12-28 15:35:58
3           535     3005     3.0  2008-12-26 05:38:11
4           465     4776     3.0  2008-08-13 20:22:36
...         ...      ...     ...                  ...
3908652    2099    77328     4.5  2017-02-18 23:29:18
3908653    2024   148652     3.5  2019-03-24 00:29:28
3908654    3751    60684     1.0  2019-04-06 22:25:38
3908655      17     2694     3.0  2007-11-09 16:11:26
3908656    1684     3948     3.5  2017-02-03 18:19:11

[3908657 rows x 4 columns]


In [6]:
check_df(df)

*************** Dataset Shape ***************
No. of Rows: 3908657 
No. of Columns: 4
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3908657 entries, 0 to 3908656
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
 3   tstamp   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 119.3+ MB
None
*************** Types of Columns ***************
userId       int64
movieId      int64
rating     float64
tstamp      object
dtype: object
*************** First 5 Rows ***************
   userId  movieId  rating               tstamp
0     206     4803     4.0  2003-04-07 13:52:01
1    5073    72731     4.0  2020-02-19 16:07:53
2    4739    91653     4.0  2020-12-28 15:35:58
3     535     3005     3.0  2008-12-26 05:38:11
4     465     4776     3.0  2008-08-13 20:22:36
*************** Last 5 Rows ***************
         userId  movieId  rating 

             count          mean           std  min     10%     25%      50%  \
userId   3908657.0   2434.060851   1950.445219  1.0   215.0   727.0   1973.0   
movieId  3908657.0  62535.529829  67051.572745  1.0  1221.0  3478.0  45447.0   
rating   3908657.0      3.419320      1.022044  0.5     2.0     3.0      3.5   

             70%       80%       90%       95%       99%       max  
userId    3393.0    4417.0    5590.0    6138.0    6609.0    6724.0  
movieId  93982.0  119145.0  172881.0  195305.0  225776.0  270592.0  
rating       4.0       4.0       4.5       5.0       5.0       5.0  
*************** Dataset Missing Values Analysis ***************
Empty DataFrame
Columns: [Total Missing Values, Ratio]
Index: []


In [7]:
def create_user_movie_df(dataframe):
    # comment
    comment_counts = pd.DataFrame(dataframe["movieId"].value_counts())
    print(comment_counts)

    # rare movies
    rare_movies = comment_counts[comment_counts["movieId"] <= 1000].index
    print(rare_movies)
    
    common_movies = df[~df["movieId"].isin(rare_movies)]
    # common_movies = df
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["movieId"], values="rating")
    return user_movie_df

user_movie_df = create_user_movie_df(rating)

        movieId
79132      5506
2571       5347
2959       5101
58559      4975
318        4945
...         ...
261523        1
254488        1
267804        1
108683        1
215387        1

[68044 rows x 1 columns]
Int64Index([  2023,   1333, 177763,  41997,  27904,   1690, 116897,   1982,
             71838,  60126,
            ...
            122025, 246818, 166978, 125435, 192607, 261523, 254488, 267804,
            108683, 215387],
           dtype='int64', length=67170)


In [8]:
user_movie_df.head()

movieId,1,2,6,10,16,19,32,34,39,47,...,206857,207313,208205,208703,209965,210861,217465,218537,225173,254726
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,,,,4.0,5.0,,...,,4.5,3.5,,,,,,,
2,3.0,,,,,,,4.0,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,5.0,,3.0,,3.5,,,5.0,4.0,,...,,4.0,3.5,,,,,,,
5,3.0,3.0,3.0,3.0,2.0,,4.0,,,2.0,...,,,,,,,,,,


In [19]:
def user_based_recommender(random_user, user_movie_df, ratio=50, cor_th=0.65, score=3.5):
    random_user_df = user_movie_df[user_movie_df.index == random_user]
    # Chọn phim mà người dùng đã xem
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
    print(f"Số lượng bộ phim mà người dùng ID: [{random_user}] đã xem là: {len(movies_watched)}")

    # ===== Truy cập dữ liệu và id của những người dùng khác đang xem cùng một bộ phim ===== #
    movies_watched_df = user_movie_df[movies_watched]
    user_movie_count = movies_watched_df.T.notnull().sum()
    user_movie_count = user_movie_count.reset_index()
    user_movie_count.columns = ["userId", "movie_count"]
    perc = len(movies_watched) * ratio / 100
    print(user_movie_count)
    # ===== Chọn những người dùng khác đã xem cùng một bộ phim ===== #
    users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
    print(users_same_movies)

    # ===== Xác định người dùng có hành vi giống nhất với người dùng để được đề xuất ===== #
    # Để làm điều này, chúng ta sẽ thực hiện 3 bước:
    #   Chúng tôi sẽ thu thập dữ liệu của Người dùng của chúng tôi và những người dùng khác.
    #   Chúng ta sẽ tạo mối tương quan df.
    #   Chúng tôi sẽ tìm những người dùng tương tự nhất (Top Users)

    final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                          random_user_df[movies_watched]])
    print(final_df)
    print(final_df.T)
    print(final_df.T.corr())
    print(final_df.T.corr().unstack())
    # print(final_df.T.corr().unstack().sort_values().drop_duplicates())
    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    # print(corr_df)
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()
    # print(corr_df.head)
    print(corr_df)
    # print(corr_df[(corr_df["user_id_1" ] == random_user) & (corr_df["corr"] >= cor_th)])
    
    top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= cor_th)][
        ["user_id_2", "corr"]].reset_index(drop=True) # select similar users have correlation over cor_th on random_user
    top_users = top_users.sort_values(by='corr', ascending=True)
    # print(top_users)
    top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
    print(top_users)
    rating = pd.read_csv(file_ratings_path)
    top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
    print(top_users_ratings)
    # ===== Tính điểm đề xuất trung bình có trọng số ===== #
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
    # print(top_users_ratings.head())
    # print(top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"}).head())
    
    # ===== Thuật toán đề xuất film ===== #
    recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index()
    # print(recommendation_df)
    # print(recommendation_df[recommendation_df["weighted_rating"] > 3.5])
    movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > score].sort_values("weighted_rating", ascending=False)

    return movies_to_be_recommend

In [20]:
def ShowAnswer(UserID = 0, cor_th = 0.70, score = 4):
  print("=============== Thông tin ===============")
  print("                                         ")
  print(f"Users ID: \t {UserID}")
  print(f"Score : \t {score}")
  print(f"cor_th: \t {cor_th}")
  print("                                         ")
  print("=========== Phim được đề xuất ===========")
  print("                                         ")
  print(user_based_recommender(UserID, user_movie_df, cor_th=cor_th, score=score))


In [21]:
# @markdown ---
# @markdown ### Nhập dữ liệu:
random_user = 123  # @param {type:"number"}
score = 3 # @param {type:"slider", min:1, m67ax:5, step:0.5}
cor_th = 0.5 # @param {type:"slider", min:0.5, max:1, step:0.1}
if random_user == 0:
  random_user = int(pd.Series(user_movie_df.index).sample(1).values)
# @markdown ---

ShowAnswer(UserID=random_user, cor_th=cor_th, score=score)

                                         
Users ID: 	 123
Score : 	 3
cor_th: 	 0.5
                                         
                                         
Số lượng bộ phim mà người dùng ID: [123] đã xem là: 512
      userId  movie_count
0          1          123
1          2          113
2          3           89
3          4          167
4          5          260
...      ...          ...
6717    6720           56
6718    6721           47
6719    6722           47
6720    6723           94
6721    6724          115

[6722 rows x 2 columns]
4          5
5          6
6          7
8          9
9         10
        ... 
6652    6655
6663    6666
6692    6695
6693    6696
6706    6709
Name: userId, Length: 1242, dtype: int64
movieId  6       16      19      32      34      39      47      50      \
userId                                                                    
5           3.0     2.0     NaN     4.0     NaN     NaN     2.0     4.5   
6           4.5     4.0     Na