In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import requests
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Movie metadata table (tmdbId, title, genre, overview, ..., movieId, imdbId).
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced or trust.
with open('movies.pkl', 'rb') as movies_file:
    result = pickle.load(movies_file)

In [3]:
result.count()

tmdbId          5851
title           5851
genre           5851
overview        5851
release_date    5851
vote_average    5851
vote_count      5851
movieId         5851
imdbId          5851
dtype: int64

In [4]:
result.isnull().sum()

tmdbId          0
title           0
genre           0
overview        0
release_date    0
vote_average    0
vote_count      0
movieId         0
imdbId          0
dtype: int64

In [5]:
result.head(10)

Unnamed: 0,tmdbId,title,genre,overview,release_date,vote_average,vote_count,movieId,imdbId
0,278,The Shawshank Redemption,"Drama, Crime",Framed in the 1940s for the double murder of h...,1994-09-23,8.7,21862,318.0,111161.0
1,238,The Godfather,"Drama, Crime","Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,8.7,16280,858.0,68646.0
2,424,Schindler's List,"Drama, History, War",The true story of how businessman Oskar Schind...,1993-12-15,8.6,12959,527.0,108052.0
3,240,The Godfather: Part II,"Drama, Crime",In the continuing saga of the Corleone crime f...,1974-12-20,8.6,9811,1221.0,71562.0
4,129,Spirited Away,"Animation, Family, Fantasy","A young girl, Chihiro, becomes trapped in a st...",2001-07-20,8.5,13093,5618.0,245429.0
5,372058,Your Name.,"Romance, Animation, Drama",High schoolers Mitsuha and Taki are complete s...,2016-08-26,8.5,8895,163134.0,5311514.0
6,389,12 Angry Men,Drama,The defense and the prosecution have rested an...,1957-04-10,8.5,6533,1203.0,50083.0
7,497,The Green Mile,"Fantasy, Drama, Crime",A supernatural tale set on death row in a Sout...,1999-12-10,8.5,14162,3147.0,120689.0
8,155,The Dark Knight,"Drama, Action, Crime, Thriller",Batman raises the stakes in his war on crime. ...,2008-07-14,8.5,27925,58559.0,468569.0
9,429,"The Good, the Bad and the Ugly",Western,While the Civil War rages between the Union an...,1966-12-23,8.5,6747,1201.0,60196.0


In [6]:
# Precomputed similarity matrix saved by an earlier step.
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced or trust.
with open('similarity.pkl', 'rb') as similarity_file:
    similarity_rs = pickle.load(similarity_file)

In [7]:
result["tmdbId"]

0          278
1          238
2          424
3          240
4          129
         ...  
5846    297596
5847     10196
5848    331446
5849     13995
5850      2312
Name: tmdbId, Length: 5851, dtype: int64

In [8]:
similarity_rs

array([[1.        , 0.01991024, 0.00970764, ..., 0.        , 0.0605234 ,
        0.03270394],
       [0.01991024, 1.        , 0.0106358 , ..., 0.01291344, 0.01052524,
        0.01948909],
       [0.00970764, 0.0106358 , 1.        , ..., 0.01246508, 0.08142744,
        0.00457762],
       ...,
       [0.        , 0.01291344, 0.01246508, ..., 1.        , 0.02338756,
        0.02043552],
       [0.0605234 , 0.01052524, 0.08142744, ..., 0.02338756, 1.        ,
        0.00487906],
       [0.03270394, 0.01948909, 0.00457762, ..., 0.02043552, 0.00487906,
        1.        ]])

In [9]:
# Raw user ratings, one row per (userId, movieId) with a rating and a timestamp.
# Presumably the MovieLens "latest-small" ratings file, judging by the path - confirm the source.
rating_data = pd.read_csv('dataset/ml-latest-small/ratings.csv')

In [10]:
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [11]:
rating_data[rating_data["userId"]==2]

Unnamed: 0,userId,movieId,rating,timestamp
232,2,318,3.0,1445714835
233,2,333,4.0,1445715029
234,2,1704,4.5,1445715228
235,2,3578,4.0,1445714885
236,2,6874,4.0,1445714952
237,2,8798,3.5,1445714960
238,2,46970,4.0,1445715013
239,2,48516,4.0,1445715064
240,2,58559,4.5,1445715141
241,2,60756,5.0,1445714980


In [12]:
rating_data[rating_data["userId"]==5]

Unnamed: 0,userId,movieId,rating,timestamp
516,5,1,4.0,847434962
517,5,21,4.0,847435238
518,5,34,4.0,847434881
519,5,36,4.0,847435292
520,5,39,3.0,847434961
521,5,50,4.0,847434881
522,5,58,5.0,847435238
523,5,110,4.0,847434880
524,5,150,3.0,847434748
525,5,153,3.0,847434802


In [13]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [14]:
len(rating_data.movieId.unique())

9724

In [15]:
len(rating_data.userId.unique())

610

In [16]:
# Keep only the id/title columns needed to attach ratings; .copy() detaches the slice from `result`.
result_for_rating = result[["tmdbId","movieId","title"]].copy()

In [17]:
result_for_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5851 entries, 0 to 5850
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   tmdbId   5851 non-null   int64  
 1   movieId  5851 non-null   float64
 2   title    5851 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 137.3+ KB


In [18]:
result_for_rating.head(10)

Unnamed: 0,tmdbId,movieId,title
0,278,318.0,The Shawshank Redemption
1,238,858.0,The Godfather
2,424,527.0,Schindler's List
3,240,1221.0,The Godfather: Part II
4,129,5618.0,Spirited Away
5,372058,163134.0,Your Name.
6,389,1203.0,12 Angry Men
7,497,3147.0,The Green Mile
8,155,58559.0,The Dark Knight
9,429,1201.0,"The Good, the Bad and the Ugly"


In [19]:
# Attach every rating to its movie via movieId. A left join keeps movies that nobody rated;
# those rows get NaN in userId/rating/timestamp.
movies_data_for_rating = pd.merge(result_for_rating, rating_data, on="movieId", how="left")

In [20]:
movies_data_for_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91959 entries, 0 to 91958
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tmdbId     91959 non-null  int64  
 1   movieId    91959 non-null  float64
 2   title      91959 non-null  object 
 3   userId     91954 non-null  float64
 4   rating     91954 non-null  float64
 5   timestamp  91954 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 4.2+ MB


In [21]:
movies_data_for_rating.isnull().sum()

tmdbId       0
movieId      0
title        0
userId       5
rating       5
timestamp    5
dtype: int64

In [22]:
# Discard the movies that came out of the left join without any rating (userId is NaN).
movies_data_for_rating = movies_data_for_rating.dropna(subset=['userId'])

In [23]:
# Renumber the rows 0..n-1 after the dropna; the previous labels are moved into a new "index" column.
movies_data_for_rating = movies_data_for_rating.reset_index()

In [24]:
# Remove the old-label column created by reset_index() together with the timestamp column.
movies_data_for_rating = movies_data_for_rating.drop(columns=["index","timestamp"])

In [25]:
movies_data_for_rating['userId']

0          2.0
1          5.0
2          6.0
3          8.0
4         11.0
         ...  
91949     50.0
91950    448.0
91951    382.0
91952    599.0
91953    232.0
Name: userId, Length: 91954, dtype: float64

In [26]:
movies_data_for_rating.head()

Unnamed: 0,tmdbId,movieId,title,userId,rating
0,278,318.0,The Shawshank Redemption,2.0,3.0
1,278,318.0,The Shawshank Redemption,5.0,3.0
2,278,318.0,The Shawshank Redemption,6.0,5.0
3,278,318.0,The Shawshank Redemption,8.0,5.0
4,278,318.0,The Shawshank Redemption,11.0,4.0


In [27]:
movies_data_for_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91954 entries, 0 to 91953
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   tmdbId   91954 non-null  int64  
 1   movieId  91954 non-null  float64
 2   title    91954 non-null  object 
 3   userId   91954 non-null  float64
 4   rating   91954 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 3.5+ MB


In [28]:
len(movies_data_for_rating.title.unique())

5685

In [29]:
len(movies_data_for_rating.userId.unique())

610

In [30]:
movies_data_for_rating

Unnamed: 0,tmdbId,movieId,title,userId,rating
0,278,318.0,The Shawshank Redemption,2.0,3.0
1,278,318.0,The Shawshank Redemption,5.0,3.0
2,278,318.0,The Shawshank Redemption,6.0,5.0
3,278,318.0,The Shawshank Redemption,8.0,5.0
4,278,318.0,The Shawshank Redemption,11.0,4.0
...,...,...,...,...,...
91949,331446,136305.0,Sharknado 3: Oh Hell No!,50.0,1.0
91950,331446,136305.0,Sharknado 3: Oh Hell No!,448.0,1.0
91951,13995,26764.0,Captain America,382.0,0.5
91952,13995,26764.0,Captain America,599.0,2.0


In [31]:
# Persist the movie/rating table. The context manager flushes and closes the file
# before the next cell reads it back.
with open('movies_ratings.pkl', 'wb') as ratings_file:
    pickle.dump(movies_data_for_rating, ratings_file)

In [32]:
# Reload the movie/rating table (columns: tmdbId, movieId, title, userId, rating).
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced or trust.
with open('movies_ratings.pkl', 'rb') as ratings_file:
    ratings = pickle.load(ratings_file)

In [33]:
result.head()

Unnamed: 0,tmdbId,title,genre,overview,release_date,vote_average,vote_count,movieId,imdbId
0,278,The Shawshank Redemption,"Drama, Crime",Framed in the 1940s for the double murder of h...,1994-09-23,8.7,21862,318.0,111161.0
1,238,The Godfather,"Drama, Crime","Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,8.7,16280,858.0,68646.0
2,424,Schindler's List,"Drama, History, War",The true story of how businessman Oskar Schind...,1993-12-15,8.6,12959,527.0,108052.0
3,240,The Godfather: Part II,"Drama, Crime",In the continuing saga of the Corleone crime f...,1974-12-20,8.6,9811,1221.0,71562.0
4,129,Spirited Away,"Animation, Family, Fantasy","A young girl, Chihiro, becomes trapped in a st...",2001-07-20,8.5,13093,5618.0,245429.0


In [34]:
result.isnull().sum()


tmdbId          0
title           0
genre           0
overview        0
release_date    0
vote_average    0
vote_count      0
movieId         0
imdbId          0
dtype: int64

In [35]:
result[result["tmdbId"]==278].index[0]

0

In [36]:
ratings

Unnamed: 0,tmdbId,movieId,title,userId,rating
0,278,318.0,The Shawshank Redemption,2.0,3.0
1,278,318.0,The Shawshank Redemption,5.0,3.0
2,278,318.0,The Shawshank Redemption,6.0,5.0
3,278,318.0,The Shawshank Redemption,8.0,5.0
4,278,318.0,The Shawshank Redemption,11.0,4.0
...,...,...,...,...,...
91949,331446,136305.0,Sharknado 3: Oh Hell No!,50.0,1.0
91950,331446,136305.0,Sharknado 3: Oh Hell No!,448.0,1.0
91951,13995,26764.0,Captain America,382.0,0.5
91952,13995,26764.0,Captain America,599.0,2.0


In [37]:
ratings[ratings["rating"]==5]

Unnamed: 0,tmdbId,movieId,title,userId,rating
2,278,318.0,The Shawshank Redemption,6.0,5.0
3,278,318.0,The Shawshank Redemption,8.0,5.0
6,278,318.0,The Shawshank Redemption,15.0,5.0
8,278,318.0,The Shawshank Redemption,17.0,5.0
9,278,318.0,The Shawshank Redemption,18.0,5.0
...,...,...,...,...,...
91561,9405,1497.0,Double Team,594.0,5.0
91755,3177,27704.0,Battle Royale II: Requiem,419.0,5.0
91847,9306,880.0,The Island of Dr. Moreau,53.0,5.0
91858,9306,880.0,The Island of Dr. Moreau,313.0,5.0


In [38]:
# Dimensions for the item x user matrix. Ids are used as 1-based positions there,
# so the largest id (not the number of distinct ids) gives each dimension.
n_users = int(ratings["userId"].max())
items = ratings["tmdbId"].unique()
n_items = int(ratings["tmdbId"].max())

In [39]:
# UU: user-user collaborative filtering.
users = ratings["userId"]
ratings_copy = ratings.copy()

# userId is stored as float after the merge; cast once so it can be used as an index.
user_ids = users.astype(int)

# Mean rating of every user in a single pass; mu[k] belongs to userId k + 1.
# Ids in 1..n_users without any rating get a mean of 0 (avoids empty-slice NaN values).
user_means = ratings["rating"].groupby(user_ids).mean()
mu = user_means.reindex(np.arange(1, n_users + 1), fill_value=0.0).to_numpy()

# Normalise: subtract each user's own mean from that user's ratings.
ratings_copy["rating"] = ratings["rating"] - mu[user_ids.to_numpy() - 1]

# Sparse item x user matrix of normalised ratings: row = tmdbId - 1, column = userId - 1.
item_rows = ratings_copy["tmdbId"].to_numpy().astype(int) - 1
user_cols = user_ids.to_numpy() - 1
ratings_matrix = sparse.coo_matrix(
    (ratings_copy["rating"].to_numpy(), (item_rows, user_cols)),
    shape=(n_items, n_users),
)
ratings_matrix = ratings_matrix.tocsr()

# Cosine similarity between user columns -> (n_users, n_users) array.
ratings_similar_uuCB = cosine_similarity(ratings_matrix.T, ratings_matrix.T)
ratings_similar_uuCB

array([[ 1.        ,  0.00260842,  0.02400436, ...,  0.0831835 ,
        -0.02811377,  0.01056375],
       [ 0.00260842,  1.        ,  0.        , ..., -0.00442231,
        -0.0691926 ,  0.03445089],
       [ 0.02400436,  0.        ,  1.        , ..., -0.01542553,
         0.        ,  0.01742464],
       ...,
       [ 0.0831835 , -0.00442231, -0.01542553, ...,  1.        ,
         0.05049763,  0.05662199],
       [-0.02811377, -0.0691926 ,  0.        , ...,  0.05049763,
         1.        , -0.018313  ],
       [ 0.01056375,  0.03445089,  0.01742464, ...,  0.05662199,
        -0.018313  ,  1.        ]])

In [40]:
print(ratings_matrix)

  (4, 43)	0.6585365853658538
  (4, 65)	-0.051118210862619584
  (4, 67)	-1.2242990654205608
  (4, 94)	-0.5796178343949041
  (4, 102)	1.0826558265582658
  (4, 108)	-0.25
  (4, 178)	0.1846153846153844
  (4, 181)	1.4976525821596245
  (4, 255)	1.0617647058823527
  (4, 294)	-0.24390243902439046
  (4, 306)	0.22684085510688856
  (4, 317)	-0.7055476529160738
  (4, 379)	0.30706287683031874
  (4, 413)	-0.4615915363385463
  (4, 482)	0.37358757062146886
  (4, 488)	-1.0240770465489568
  (4, 500)	1.7619047619047619
  (4, 520)	0.3783783783783785
  (4, 598)	0.25195822454308114
  (4, 605)	0.34416299559471364
  (5, 50)	-1.787461773700306
  (5, 216)	0.17799999999999994
  (5, 243)	-1.7241379310344827
  (5, 413)	-1.4615915363385463
  (10, 0)	0.6261261261261257
  :	:
  (429732, 566)	-1.2315068493150685
  (431529, 110)	0.19965277777777768
  (431818, 183)	-1.6367924528301887
  (433309, 110)	0.19965277777777768
  (433309, 248)	-0.21736997055937213
  (437556, 513)	-0.4525862068965516
  (444704, 110)	0.6996527777

In [41]:
sorted(list(enumerate(ratings_similar_uuCB[0])), reverse=True, key=lambda vector:vector[1])

[(0, 1.0000000000000027),
 (300, 0.12761127874159686),
 (476, 0.10964862302896043),
 (596, 0.10847966225040778),
 (413, 0.10680001118903001),
 (56, 0.10652061454809936),
 (534, 0.10421285069935955),
 (368, 0.10115099794108372),
 (134, 0.09973932151304764),
 (205, 0.09930084395717076),
 (589, 0.09818647413455946),
 (18, 0.0971105470175024),
 (417, 0.09630406621934036),
 (225, 0.09508354880921253),
 (119, 0.09442747385449293),
 (74, 0.09416760240070074),
 (197, 0.09327491572994903),
 (159, 0.09288490802463811),
 (576, 0.09253747045504483),
 (468, 0.09028564252137239),
 (357, 0.09015720869398682),
 (311, 0.08926737106143888),
 (265, 0.08912882628769442),
 (44, 0.08811694541238918),
 (483, 0.08716170713661138),
 (43, 0.0864339291700706),
 (170, 0.08513721910499578),
 (592, 0.08446063459080788),
 (71, 0.0843468967693467),
 (296, 0.08408572066163723),
 (607, 0.0831834962100113),
 (449, 0.08261563277568762),
 (198, 0.0824148108399364),
 (482, 0.0820986217945365),
 (551, 0.08025889018361822),


In [42]:
# Persist the user-user similarity matrix and the normalised rating matrix.
# Context managers guarantee both files are flushed and closed.
with open('similarity_uuCB_ratings.pkl', 'wb') as similarity_file:
    pickle.dump(ratings_similar_uuCB, similarity_file)
with open('ratings_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(ratings_matrix, matrix_file)

In [43]:
def __pred(u, i, normalized=1):
    """Predict the rating of user ``u`` for movie ``i`` from the 2 most similar raters.

    Reads the notebook globals ``ratings``, ``ratings_similar_uuCB``,
    ``ratings_matrix`` (rows are ``tmdbId - 1``, columns are ``userId - 1``)
    and ``mu`` (``mu[k]`` is the mean rating of user ``k + 1``).

    Parameters
    ----------
    u : int
        1-based userId.
    i : int
        tmdbId of the movie.
    normalized : truthy, default 1
        When truthy, return the mean-centred prediction; otherwise add the
        mean rating of ``u`` back to obtain a value on the rating scale.

    Returns
    -------
    float
        Similarity-weighted average of the neighbours' normalised ratings,
        optionally shifted by the user's mean.
    """
    rated_rows = np.where(ratings["tmdbId"] == i)[0]
    # userId is stored as float; numpy/scipy indexing needs integers.
    users_rated_i = ratings["userId"].iloc[rated_rows].to_numpy().astype(int)
    sim = ratings_similar_uuCB[u - 1, users_rated_i - 1]
    # Positions (within users_rated_i) of the two most similar raters.
    nearest = np.argsort(sim)[-2:]
    nearest_s = sim[nearest]
    # Normalised ratings those neighbours gave to movie i (1 x k sparse row).
    r = ratings_matrix[i - 1, users_rated_i[nearest] - 1]
    # The small epsilon avoids a division by zero when all similarities are 0.
    estimate = (r @ nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)
    if normalized:
        return estimate
    return estimate + mu[u - 1]

def recommend_top(u):
    """Score every movie user ``u`` has not rated and rank them best-first.

    Reads the notebook globals ``ratings`` and ``items`` and calls ``__pred``.

    Parameters
    ----------
    u : int
        userId to recommend for.

    Returns
    -------
    list of dict
        One ``{'id': tmdbId, 'similar': predicted_rating}`` entry per unrated
        movie, sorted by ``'similar'`` in descending order.
    """
    # Build a set of the rated ids: `x in Series` would test the index labels,
    # not the values, and a set makes each lookup O(1).
    rated_by_u = set(ratings.loc[ratings["userId"] == u, "tmdbId"])

    list_items = [
        {'id': i, 'similar': __pred(u, i)}
        for i in items
        if i not in rated_by_u
    ]

    return sorted(list_items, key=lambda entry: entry['similar'], reverse=True)

In [44]:
def fetch_poster(movie_id):
    """Return the URL of the TMDB poster image (w500 size) for ``movie_id``.

    Parameters
    ----------
    movie_id : int
        TMDB movie id.

    Returns
    -------
    str or None
        Full image URL, or ``None`` when TMDB has no poster for the movie.

    Raises
    ------
    requests.HTTPError
        If TMDB answers with a non-success status code.
    requests.RequestException
        On connection problems or when the request times out.
    """
    # FIXME(security): the literal key below is committed with the notebook. Set
    # TMDB_API_KEY in the environment instead; the fallback is kept only so that
    # existing runs keep working and should be revoked and removed.
    api_key = os.environ.get("TMDB_API_KEY", "19b98f78a51a3924b33b555437599b0c")
    url = "https://api.themoviedb.org/3/movie/{}".format(movie_id)
    # A timeout keeps one stalled request from hanging the whole recommendation loop.
    response = requests.get(
        url,
        params={"api_key": api_key, "language": "en-US"},
        timeout=10,
    )
    response.raise_for_status()
    poster_path = response.json().get('poster_path')
    if not poster_path:
        # TMDB returns null for movies without a poster.
        return None
    full_path = "https://image.tmdb.org/t/p/w500/" + poster_path
    return full_path
def pred(u, i, normalized=1, k=2):
    """Predict the rating user ``u`` would give movie ``i`` (user-user CF).

    Reads the notebook globals ``ratings``, ``ratings_similar_uuCB``,
    ``ratings_matrix`` (rows are ``tmdbId - 1``, columns are ``userId - 1``)
    and ``mu`` (``mu[n]`` is the mean rating of user ``n + 1``).

    Parameters
    ----------
    u : int
        1-based userId.
    i : int
        tmdbId of the movie.
    normalized : any, default 1
        Accepted for call compatibility but ignored: the user's mean rating is
        always added back, so the result is on the original rating scale.
    k : int, default 2
        Number of most-similar raters of ``i`` to average over.

    Returns
    -------
    float
        Similarity-weighted average of the neighbours' mean-centred ratings
        plus the mean rating of ``u``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would silently select every rater instead of none.
        raise ValueError("k must be at least 1")
    rated_rows = np.where(ratings["tmdbId"] == i)[0]
    # userId is stored as float; numpy/scipy indexing needs integers.
    users_rated_i = ratings["userId"].iloc[rated_rows].to_numpy().astype(int)
    sim = ratings_similar_uuCB[u - 1, users_rated_i - 1]
    # Positions (within users_rated_i) of the k most similar raters.
    nearest = np.argsort(sim)[-k:]
    nearest_s = sim[nearest]
    # Mean-centred ratings those neighbours gave to movie i (1 x k sparse row).
    r = ratings_matrix[i - 1, users_rated_i[nearest] - 1]
    # The small epsilon avoids a division by zero when all similarities are 0.
    # mu is 0-based while userIds start at 1, hence mu[u - 1].
    return (r @ nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8) + mu[u - 1]

def recommend_top(u, max_candidates=24):
    """Rank the first unrated movies for user ``u`` by predicted rating.

    Reads the notebook global ``ratings`` and calls ``pred`` and
    ``fetch_poster``. Only the first ``max_candidates`` movies (in ``ratings``
    order) that ``u`` has not rated are scored, which also bounds the number of
    poster requests. The result is therefore a ranking of that candidate pool,
    not of the whole catalogue.

    Parameters
    ----------
    u : int or value convertible with ``int``
        userId to recommend for.
    max_candidates : int, default 24
        Maximum number of unrated movies to score and return.

    Returns
    -------
    list of dict
        Entries ``{'id', 'name', 'poster', 'similar'}`` sorted by ``'similar'``
        (the predicted rating) in descending order.
    """
    u = int(u)
    # One (tmdbId, title) pair per movie, in order of first appearance. Taking the
    # ids and the titles from two separate .unique() calls misaligns them as soon
    # as two different movies share a title.
    catalogue = ratings[["tmdbId", "title"]].drop_duplicates(subset="tmdbId")
    # Set of rated ids for O(1) membership tests.
    rated_by_u = set(ratings.loc[ratings["userId"] == u, "tmdbId"])

    list_items = []
    for i, n in zip(catalogue["tmdbId"].to_numpy(), catalogue["title"].to_numpy()):
        if len(list_items) >= max_candidates:
            break
        if i in rated_by_u:
            continue
        # Score first so a failing prediction does not cost a network request.
        rating = pred(u, i)
        list_items.append({
            'id': i,
            'name': n,
            'poster': fetch_poster(i),
            'similar': rating,
        })

    sorted_items = sorted(list_items, key=lambda entry: entry['similar'], reverse=True)
    return sorted_items

# Smoke check: build the ranked recommendation list for user 2.
test = recommend_top(2)
# Title of the highest-scored entry.
test[0]['name']


'Grave of the Fireflies'