In [35]:
# Importing all libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Download rating, movies file
!gdown 1HOFWUAMFlYbd-gk1B2IyV2-hXDZI7gKR
!gdown 1Q9UJtrN_v_dS-garl5gQ1I_SotGhye_1


Downloading...
From: https://drive.google.com/uc?id=1HOFWUAMFlYbd-gk1B2IyV2-hXDZI7gKR
To: /Users/megha/ratings.csv
100%|██████████████████████████████████████| 2.48M/2.48M [00:00<00:00, 6.98MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Q9UJtrN_v_dS-garl5gQ1I_SotGhye_1
To: /Users/megha/movies.csv
100%|████████████████████████████████████████| 516k/516k [00:00<00:00, 3.74MB/s]


In [37]:
# Read Rating csv file
df_rating = pd.read_csv("/Users/megha/ratings.csv")

In [38]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [39]:
df_rating.shape

(105339, 4)

In [40]:
df_rating.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


rating lies from 0.5 - 5

In [41]:
df_rating.isna().sum() # no any Null value present

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [42]:
df_rating["userId"].nunique() # total 668 user present

668

In [43]:
df_rating["movieId"].nunique() # total 10325 moview present

10325

In [44]:
# pivoting the table - creating a interaction matrix(user-item) matrix
user_item_matrix = df_rating.pivot(index = "userId", columns="movieId",values="rating").fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Reading movies csv file
df_movies = pd.read_csv("/Users/megha/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
df_movies.shape

(10329, 3)

In [13]:
# Mapping movie id to movie name
user_item_matrix.columns =  [df_movies.loc[df_movies.movieId==i ,'title'].values[0] for i in user_item_matrix.columns]
user_item_matrix

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Circle (2015),The Measure of a Man (2015),Bone Tomahawk (2015),Elämältä kaiken sain ( ),Creed (2015),Cosmic Scrat-tastrophe (2015),Le Grand Restaurant (1966),A Very Murray Christmas (2015),The Big Short (2015),Marco Polo: One Hundred Eyes (2015)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Check how many 0 value are present
user_item_matrix.isin([0]).sum().sum() # so many 0's are present

np.int64(6791761)

In [15]:
# calculate % how many values are not 0 in whole matrix
non_zero_value = (user_item_matrix !=0).sum().sum()/(user_item_matrix.size)
non_zero_value*100

np.float64(1.5272940801206305)

Only 1.52% data are not null/ not 0 -> we can say this a Sparse Matrix

# Collaborative Filtering


# Calculate a User-User Similarity


In [16]:
user_user = cosine_similarity(user_item_matrix)
user_user_df = pd.DataFrame(user_user, index=user_item_matrix.index, columns = user_item_matrix.index)
user_user_df

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.101113,0.210044,0.128766,0.057896,0.077130,0.358090,0.097434,0.239189,0.026663,...,0.291162,0.144741,0.106583,0.091049,0.236805,0.154519,0.245071,0.238660,0.278217,0.153479
2,0.101113,1.000000,0.115559,0.034610,0.032705,0.028305,0.062914,0.471918,0.194232,0.000000,...,0.068325,0.000000,0.477330,0.146887,0.163553,0.061737,0.050948,0.051423,0.035907,0.064816
3,0.210044,0.115559,1.000000,0.058208,0.044426,0.012816,0.084522,0.066620,0.459703,0.068454,...,0.152078,0.301021,0.081626,0.098949,0.310234,0.079452,0.092821,0.080940,0.158943,0.109648
4,0.128766,0.034610,0.058208,1.000000,0.019298,0.005781,0.059089,0.024420,0.050572,0.000000,...,0.055860,0.024329,0.040467,0.108881,0.076241,0.014011,0.042643,0.174275,0.061677,0.157794
5,0.057896,0.032705,0.044426,0.019298,1.000000,0.053378,0.080822,0.041536,0.023168,0.011915,...,0.058450,0.007315,0.024708,0.038163,0.053085,0.048993,0.055431,0.026053,0.086667,0.068275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.154519,0.061737,0.079452,0.014011,0.048993,0.011697,0.207565,0.070784,0.098666,0.037486,...,0.156459,0.064727,0.059112,0.082097,0.093945,1.000000,0.101132,0.068558,0.091651,0.102090
665,0.245071,0.050948,0.092821,0.042643,0.055431,0.131016,0.178275,0.033969,0.116292,0.026110,...,0.204038,0.061159,0.020011,0.052398,0.125183,0.101132,1.000000,0.096167,0.211776,0.172559
666,0.238660,0.051423,0.080940,0.174275,0.026053,0.068479,0.174009,0.039911,0.084109,0.049958,...,0.138398,0.038339,0.037231,0.121721,0.079120,0.068558,0.096167,1.000000,0.123252,0.158368
667,0.278217,0.035907,0.158943,0.061677,0.086667,0.031558,0.195029,0.119225,0.169818,0.030780,...,0.268394,0.115746,0.044327,0.090725,0.215705,0.091651,0.211776,0.123252,1.000000,0.110955


# Calculate a Item-Item Similarity


In [17]:
item_item = cosine_similarity(user_item_matrix.T)
item_item_df = pd.DataFrame(item_item, index =user_item_matrix.columns, columns=user_item_matrix.columns )
item_item_df

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Circle (2015),The Measure of a Man (2015),Bone Tomahawk (2015),Elämältä kaiken sain ( ),Creed (2015),Cosmic Scrat-tastrophe (2015),Le Grand Restaurant (1966),A Very Murray Christmas (2015),The Big Short (2015),Marco Polo: One Hundred Eyes (2015)
Toy Story (1995),1.000000,0.383068,0.337453,0.134724,0.360289,0.392823,0.347485,0.092432,0.123093,0.368602,...,0.065762,0.000000,0.075491,0.000000,0.052609,0.0,0.0,0.000000,0.060148,0.000000
Jumanji (1995),0.383068,1.000000,0.199207,0.123377,0.216562,0.302732,0.201137,0.056784,0.103498,0.422472,...,0.090899,0.090899,0.052834,0.090899,0.127258,0.0,0.0,0.090899,0.099767,0.090899
Grumpier Old Men (1995),0.337453,0.199207,1.000000,0.173366,0.514108,0.312031,0.338144,0.173191,0.228211,0.159173,...,0.000000,0.000000,0.046041,0.000000,0.000000,0.0,0.0,0.000000,0.047422,0.000000
Waiting to Exhale (1995),0.134724,0.123377,0.173366,1.000000,0.127038,0.173900,0.098651,0.190777,0.028099,0.166986,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
Father of the Bride Part II (1995),0.360289,0.216562,0.514108,0.127038,1.000000,0.228020,0.418153,0.188991,0.133090,0.174580,...,0.000000,0.000000,0.054951,0.000000,0.000000,0.0,0.0,0.000000,0.056599,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cosmic Scrat-tastrophe (2015),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1.0,0.000000,0.000000,0.000000
Le Grand Restaurant (1966),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1.0,0.000000,0.000000,0.000000
A Very Murray Christmas (2015),0.000000,0.090899,0.000000,0.000000,0.000000,0.112388,0.135673,0.000000,0.000000,0.105453,...,0.000000,1.000000,0.000000,1.000000,0.600000,0.0,0.0,1.000000,0.000000,1.000000
The Big Short (2015),0.060148,0.099767,0.047422,0.000000,0.056599,0.127091,0.152133,0.000000,0.160659,0.149645,...,0.000000,0.000000,0.781000,0.000000,0.000000,0.0,0.0,0.000000,1.000000,0.000000


In [18]:
# checking where similarity is close to 1 [greater than 0.5]
item_item_df[item_item_df > 0.5]

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Circle (2015),The Measure of a Man (2015),Bone Tomahawk (2015),Elämältä kaiken sain ( ),Creed (2015),Cosmic Scrat-tastrophe (2015),Le Grand Restaurant (1966),A Very Murray Christmas (2015),The Big Short (2015),Marco Polo: One Hundred Eyes (2015)
Toy Story (1995),1.0,,,,,,,,,,...,,,,,,,,,,
Jumanji (1995),,1.0,,,,,,,,,...,,,,,,,,,,
Grumpier Old Men (1995),,,1.000000,,0.514108,,,,,,...,,,,,,,,,,
Waiting to Exhale (1995),,,,1.0,,,,,,,...,,,,,,,,,,
Father of the Bride Part II (1995),,,0.514108,,1.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cosmic Scrat-tastrophe (2015),,,,,,,,,,,...,,,,,,1.0,1.0,,,
Le Grand Restaurant (1966),,,,,,,,,,,...,,,,,,1.0,1.0,,,
A Very Murray Christmas (2015),,,,,,,,,,,...,,1.0,,1.0,0.6,,,1.0,,1.0
The Big Short (2015),,,,,,,,,,,...,,,0.781,,,,,,1.0,


From above table , movie - Father of the Bride Part II (1995)	, Grumpier Old Men (1995)	 both are comedy movies so there similraity is > 0.5 (0.514108)

In [19]:
def user_based_predict_rating(user, item):
    # Find rated user - who give the rating to movies
    rated_user = user_item_matrix[user_item_matrix[item] > 0 ].index
    # Find weighted avg
    num = sum(user_user_df.loc[user, other] * user_item_matrix.loc[other, item] for other in rated_user)
    denom = sum(abs(user_user_df.loc[user, other]) for other in rated_user)
    return np.round(num/denom)
    

In [20]:
user_based_predict_rating(3, "Tom and Huck (1995)") # predicted rating is 4 

np.float64(4.0)

In [21]:
user_based_predict_rating(668, "Toy Story (1995)") # predicted rating is 4 

np.float64(4.0)

In [22]:
user_item_matrix.loc[668, "Toy Story (1995)"] # Actual is 3

np.float64(3.0)

In [23]:
def item_based_predict_rating(user, item):
    rated_item = user_item_matrix.loc[user][user_item_matrix.loc[user]>0].index 
    # calculate weightes avg
    num = sum(item_item_df.loc[item , other] * user_item_matrix.loc[user, other] for other in rated_item)
    denom = sum(abs(item_item_df.loc[item , other]) for other in rated_item)
    return np.round(num/denom)
    
    

In [24]:
item_based_predict_rating(1, "Tom and Huck (1995)") # predicted rating is 3

np.float64(3.0)

# Content Based Filtering

In [25]:
# Splitting movie_genre to sepaarate Column
df_movies["genres"] = df_movies["genres"].str.split("|")

In [26]:
mov = df_movies.explode("genres")
mov

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [27]:
# Create a separate column for Genre using pivot
mov_new = mov.pivot(index='movieId',columns='genres',values='title') 
mov_new

genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,Toy Story (1995),Toy Story (1995),Toy Story (1995),Toy Story (1995),,,,Toy Story (1995),,,,,,,,,,
2,,,Jumanji (1995),,Jumanji (1995),,,,,Jumanji (1995),,,,,,,,,,
3,,,,,,Grumpier Old Men (1995),,,,,,,,,,Grumpier Old Men (1995),,,,
4,,,,,,Waiting to Exhale (1995),,,Waiting to Exhale (1995),,,,,,,Waiting to Exhale (1995),,,,
5,,,,,,Father of the Bride Part II (1995),,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146684,,,,Cosmic Scrat-tastrophe (2015),Cosmic Scrat-tastrophe (2015),Cosmic Scrat-tastrophe (2015),,,,,,,,,,,,,,
146878,,,,,,Le Grand Restaurant (1966),,,,,,,,,,,,,,
148238,,,,,,A Very Murray Christmas (2015),,,,,,,,,,,,,,
148626,,,,,,,,,The Big Short (2015),,,,,,,,,,,


In [28]:
m = ~mov_new.isna()
m = m.astype(int)
m.head()

genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# Rmoving (no genres listed) from columns
m = m.iloc[:, 1:]
m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


above dataframe can be used for movie feature

In [30]:
# Hamming distance
def hamming_distance(m1, m2):
    return sum(abs(m1-m2))

In [31]:
hamming_distance(m.loc[2], m.loc[4]) # distance is 6 - higher diff

6

In [32]:
hamming_distance(m.loc[2], m.loc[2]) # with same movie, distance is 0

0

In [33]:
# calcualate rating_matrix based on distance
rank = []
for i in m.index:
    for j in m.index:
        if i == j :
            continue  # no need to calcualte distance for same moview - its always 0
        rank.append([i, j, hamming_distance(m.loc[i], m.loc[j])])

rank_matrix = pd.DataFrame(rank, columns = ["query_id","candidate_id","distance"])
rank_matrix.head()

Unnamed: 0,query_id,candidate_id,distance
0,1,2,2
1,1,3,5
2,1,4,6
3,1,5,4
4,1,6,8


Shorted the distance - more similarity

In [34]:
# add query / candidate movie title in rank matrix
# Creating a feature for movies
rank_matrix_n = rank_matrix.merge(df_movies, left_on="query_id", right_on="movieId").rename(columns={"title": "query_movie_title"}).reset_index().drop(["movieId", "genres"], axis=1)
rank_matrix_n = rank_matrix_n.merge(df_movies, left_on="candidate_id", right_on="movieId").rename(columns={"title": "candidate_movie_title"}).reset_index().drop(["movieId", "genres"], axis=1)
rank_matrix_n.head()




Unnamed: 0,level_0,index,query_id,candidate_id,distance,query_movie_title,candidate_movie_title
0,0,0,1,2,2,Toy Story (1995),Jumanji (1995)
1,1,1,1,3,5,Toy Story (1995),Grumpier Old Men (1995)
2,2,2,1,4,6,Toy Story (1995),Waiting to Exhale (1995)
3,3,3,1,5,4,Toy Story (1995),Father of the Bride Part II (1995)
4,4,4,1,6,8,Toy Story (1995),Heat (1995)


In [43]:
rank_matrix_n.drop(["level_0","index"], axis=1, inplace=True)
rank_matrix_n.head()

Unnamed: 0,query_id,candidate_id,distance,query_movie_title,candidate_movie_title
0,1,2,2,Toy Story (1995),Jumanji (1995)
1,1,3,5,Toy Story (1995),Grumpier Old Men (1995)
2,1,4,6,Toy Story (1995),Waiting to Exhale (1995)
3,1,5,4,Toy Story (1995),Father of the Bride Part II (1995)
4,1,6,8,Toy Story (1995),Heat (1995)


Movie feature is created

In [61]:
# Now find a movie if user have watched Robots (2005)
rank_matrix_n.loc[rank_matrix_n["query_movie_title"] == "Robots (2005)"].sort_values("distance")


Unnamed: 0,query_id,candidate_id,distance,query_movie_title,candidate_movie_title
67357486,32031,78499,1,Robots (2005),Toy Story 3 (2010)
67349468,32031,673,1,Robots (2005),Space Jam (1996)
67359095,32031,130520,1,Robots (2005),Home (2015)
67355978,32031,47124,1,Robots (2005),"Ant Bully, The (2006)"
67357493,32031,78637,1,Robots (2005),Shrek Forever After (a.k.a. Shrek: The Final C...
...,...,...,...,...,...
67350506,32031,2058,12,Robots (2005),"Negotiator, The (1998)"
67353022,32031,5388,12,Robots (2005),Insomnia (2002)
67356685,32031,60688,12,Robots (2005),Conspiracy (2008)
67357272,32031,73511,12,Robots (2005),Horsemen (2009)


if user is interested in Robots (2005) -> can also watch Toy Story 3 (2010), Space Jam (1996) etc

# Regression


In [44]:
!gdown 1b7_yRRBs3s3atp1WQHN2GU577vxY8u_h

Downloading...
From: https://drive.google.com/uc?id=1b7_yRRBs3s3atp1WQHN2GU577vxY8u_h
To: /Users/megha/users.csv
100%|██████████████████████████████████████| 16.8k/16.8k [00:00<00:00, 20.1MB/s]


In [45]:
# Reading a user.csv file
df_user = pd.read_csv("/Users/megha/users.csv")
df_user.head()

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478
3,4,23,2.095284
4,5,35,1.75986


In [53]:
from datetime import datetime
df_rating["hour"] = df_rating["timestamp"].apply(lambda x : datetime.fromtimestamp(x).hour)
df_rating

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6
...,...,...,...,...,...
105334,668,142488,4.0,1451535844,9
105335,668,142507,3.5,1451535889,9
105336,668,143385,4.0,1446388585,20
105337,668,144976,2.5,1448656898,2


In [68]:
k = df_rating.groupby("userId")["rating"].mean().reset_index()
k1 = df_rating.groupby("userId")["hour"].mean().reset_index()
df_user_n = df_user.merge(k, on="userId")
df_user_n = df_user_n.merge(k1, on="userId")
df_user_n.set_index("userId", inplace=True)
df_user_n

Unnamed: 0_level_0,age,time_spent_per_day,rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,3.976315,3.628319,5.628319
2,24,1.891303,3.896552,21.000000
3,20,4.521478,3.794521,14.465753
4,23,2.095284,4.161290,8.000000
5,35,1.759860,3.183824,0.411765
...,...,...,...,...
664,22,5.288101,4.067568,17.972973
665,20,5.220446,3.211454,17.105727
666,19,3.262313,3.342222,13.511111
667,17,3.674356,3.790541,15.081081


above one is feature for user

In [73]:
# standarize above datframe as all features are not on same scale
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_user_ss = sc.fit_transform(df_user_n)
df_user_ss

array([[-1.47029242,  0.34107301, -0.07696514, -0.88711396],
       [-0.13561556, -1.07994746,  0.5116626 ,  1.49086125],
       [-0.80295399,  0.71262375,  0.28775894,  0.48002347],
       ...,
       [-0.9697886 , -0.14554853, -0.70479307,  0.33234179],
       [-1.30345781,  0.13527572,  0.27902496,  0.5752137 ],
       [-0.13561556, -0.51603068, -1.90082486,  0.01190815]],
      shape=(668, 4))

In [84]:
df_user_ss = pd.DataFrame(df_user_ss, columns = df_user_n.columns, index=df_user_n.index)
df_user_ss

Unnamed: 0_level_0,age,time_spent_per_day,rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.076965,-0.887114
2,-0.135616,-1.079947,0.511663,1.490861
3,-0.802954,0.712624,0.287759,0.480023
4,-0.302450,-0.940926,1.092622,-0.520219
5,1.699565,-1.169532,-1.052393,-1.694107
...,...,...,...,...
664,-0.469285,1.235109,0.886951,1.022585
665,-0.802954,1.188999,-0.991760,0.888423
666,-0.969789,-0.145549,-0.704793,0.332342
667,-1.303458,0.135276,0.279025,0.575214


In [112]:
# create a dataframe
data = df_rating[["userId","movieId","rating"]].copy().reset_index(drop=True)
data = data.merge(df_user_ss, on="userId", how = "right")
data = data.merge(m, on="movieId", how = "right")
data.head()

Unnamed: 0,userId,movieId,rating_x,age,time_spent_per_day,rating_y,hour,Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,2.0,1,5.0,-0.135616,-1.079947,0.511663,1.490861,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5.0,1,4.0,1.699565,-1.169532,-1.052393,-1.694107,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,8.0,1,5.0,0.364888,0.298545,0.292345,1.336163,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,11.0,1,4.0,-1.303458,0.513712,-0.301997,0.562671,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,14.0,1,4.0,-0.30245,1.251552,-0.410805,0.562671,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [113]:
data = data.drop(["userId", "movieId"], axis = 1)
data.dropna(inplace=True)
y = data.pop("rating_x")
data.head()

Unnamed: 0,age,time_spent_per_day,rating_y,hour,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,-0.135616,-1.079947,0.511663,1.490861,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.699565,-1.169532,-1.052393,-1.694107,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.364888,0.298545,0.292345,1.336163,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.303458,0.513712,-0.301997,0.562671,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.30245,1.251552,-0.410805,0.562671,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
data.rename(columns={"rating_y": "avg_rating"}, inplace=True)
data.head() # X Feature

Unnamed: 0,age,time_spent_per_day,avg_rating,hour,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,-0.135616,-1.079947,0.511663,1.490861,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.699565,-1.169532,-1.052393,-1.694107,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.364888,0.298545,0.292345,1.336163,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.303458,0.513712,-0.301997,0.562671,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.30245,1.251552,-0.410805,0.562671,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
y.head() # y-> feature

0    5.0
1    4.0
2    5.0
3    4.0
4    4.0
Name: rating_x, dtype: float64

In [120]:
# Train a regression model 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

train_x, test_x, train_y, test_y = train_test_split(data, y, test_size=0.2, random_state=42)
model = GradientBoostingRegressor(max_depth=4, n_estimators=500)
model.fit(train_x, train_y)
y_pred = model.predict(test_x)
print(f"RMSE : {mse(test_y, y_pred) ** 0.5}")

RMSE : 0.8979411247866143


In [125]:
test_y.iloc[100]

np.float64(4.0)

In [126]:
y_pred[100]

np.float64(3.461221661943119)

In above case actual and predicted rating are too close.

# Matrix Factorization

In [67]:
def matrix_factorization(A, B , C, K, it= 100, lr = 0.01):
    C = C.T
    A_values = A.values # Changing to numy array
    for st in range(it):
        for i in range(len(A_values)):
            for j in range(len(A_values[0])):
                if A_values[i, j] == 0 :
                    continue # only looking for values where its greater than 0 
                A_hat_ij = np.dot(B[i, :], C[: , j])
                error = (A_hat_ij - A_values[i, j])
                for k1 in range(K):
                    b_old = B[i][k1]
                    B[i][k1] = B[i][k1] - lr * (2 * error * C[k1][j])
                    C[k1][j] = C[k1][j] - lr * (2 * error *  b_old )
    return B, C.T
    

In [68]:
# A = B.CT
# A = user_item_matrix (n  m)
# n = 668, m = 10325
# B ,C = Intialize random 

n, m = user_item_matrix.shape
K = 2 # embedding / Features

# Intialize matrix B, C with random variable
B = np.random.normal(size=(n, K))
C = np.random.normal(size=(m, K))


B_, C_ = matrix_factorization(user_item_matrix, B , C, K)

In [71]:
# PRedicted rating getting a 4.14 , actual is 5 which is close by

np.dot(B_[1, :], C_[0,:])


np.float64(4.142109856919997)

In [56]:
# Train using inbuild package

from cmfrec import CMF
rm = df_rating[["userId","movieId","rating"]].copy()
rm.columns = ['UserId', 'ItemId', 'Rating']  # Lib requires specific column names

model = CMF(k = 2, lambda_ = 0.1, verbose = False, user_bias= False, item_bias=False)
model.fit(rm)

Collective matrix factorization model
(explicit-feedback variant)


In [57]:
model.A_

array([[ 0.61810845, -1.2968159 ],
       [ 1.0773845 ,  0.44746554],
       [ 0.7495233 , -0.2906971 ],
       ...,
       [ 0.72334623, -1.0197821 ],
       [ 0.6413663 , -0.22247137],
       [-0.25338945, -1.4885309 ]], shape=(668, 2), dtype=float32)

In [58]:
model.B_

array([[ 0.7574347 , -0.06085441],
       [-0.23201276,  1.1658955 ],
       [ 0.6025151 , -0.2405855 ],
       ...,
       [ 0.10675105,  0.6455458 ],
       [-0.05072207, -0.30672663],
       [ 0.001769  ,  0.01069749]], shape=(10325, 2), dtype=float32)

In [63]:
# PRedicted value
np.dot(model.A_[1, :], model.B_[0,: ]) + model.glob_mean_

np.float32(4.305669)

In [65]:
# Actual value
user_item_matrix.iloc[1,0] 

np.float64(5.0)

4.305669 close to 5 based on above observation