In [20]:
!pip install pickle5



In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle5 as pickle

In [22]:
item_train=pd.read_csv('content_item_train.csv',header=None)
user_train=pd.read_csv('content_user_train.csv',header=None)
y_train=pd.read_csv('content_y_train.csv',header=None)

item_train=item_train.drop([0], axis=1)
user_train=user_train.drop([0,1,2], axis=1)
# print(y_train)

In [23]:
num_user_features = user_train.shape[1]
num_item_features = item_train.shape[1]

# We scale the users and items by standard scaler of scikit learn and scale y or ratings also but with min max scaler

item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.to_numpy().reshape(-1, 1))
y_train = scalerTarget.transform(y_train.to_numpy().reshape(-1, 1))
# ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

# As we have set same random_state then all of them will be similarly shuffled

item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)

In [24]:
# We use here keras functional API which lends us more flexibility to make layers with non-linear topology and multiple i/p and o/p

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256,activation='relu'),
    tf.keras.layers.Dense(units=128,activation='relu'),
    tf.keras.layers.Dense(units=32)
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256,activation='relu'),
    tf.keras.layers.Dense(units=128,activation='relu'),
    tf.keras.layers.Dense(units=32)
])
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1) # o/p=x/sqrt(max(sum(x**2),epsilon))

input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

output = tf.keras.layers.Dot(axes=1)([vu, vm])

model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 14)]                 0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 16)]                 0         []                            
                                                                                                  
 sequential_2 (Sequential)   (None, 32)                   40864     ['input_5[0][0]']             
                                                                                                  
 sequential_3 (Sequential)   (None, 32)                   41376     ['input_6[0][0]']             
                                                                                            

In [25]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),loss=tf.keras.losses.MeanSquaredError()) # Mostly dont need to change the learning rate as adam so change automatically
model.fit([user_train, item_train], y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x78c271ef13f0>

In [65]:
model.evaluate([user_test, item_test], y_test) # Check if the error is comparable to training error or the model would have overfit



0.08200788497924805

In [66]:
item_vecs=pd.read_csv('content_item_vecs.csv',header=None)
item_vecs_with_index=pd.read_csv('content_item_vecs.csv',header=None)
item_vecs=item_vecs.drop([0],axis=1)
movie_list=pd.read_csv('content_movie_list.csv')

movie_list.set_index("movieId", drop=True, inplace=True)
# movie_dict = movie_list.to_dict(orient="index")

print(movie_list)
print(item_vecs)

                                        title                           genres
movieId                                                                       
4054               Save the Last Dance (2001)                    Drama|Romance
4069              Wedding Planner, The (2001)                   Comedy|Romance
4148                          Hannibal (2001)                  Horror|Thriller
4149     Saving Silverman (Evil Woman) (2001)                   Comedy|Romance
4153                     Down to Earth (2001)           Comedy|Fantasy|Romance
...                                       ...                              ...
174055                         Dunkirk (2017)            Action|Drama|Thriller
176371               Blade Runner 2049 (2017)                           Sci-Fi
177765                            Coco (2017)     Adventure|Animation|Children
179819        Star Wars: The Last Jedi (2017)  Action|Adventure|Fantasy|Sci-Fi
187593                      Deadpool 2 (2018)       

In [67]:
# To make predictions for a new user we first construct the user vector with all fields as user_train
# We also take the items vector which has all or some(retrieval step) of movies(all or some of v_m) in the training set
# Then we normalize both of them using standard scaler
# Then we pass both of them through model.predict() but before that we also have to expand the user vec to the size of item vec to pass in the network
# We inverse transform the predicted value and then output the items with highest prediction ratings scaler.inverse_transform(y_new)

user_vec = np.array([[0.0,5.0,0.0,0.0,0.0, 0.0, 0.0,0.0, 5.0, 0.0, 0.0,0.0, 0.0, 0.0]])
print(user_vec.shape)
num_items=item_vecs.shape[0]
user_vecs = np.tile(user_vec, (num_items, 1))

suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

y_p = model.predict([suser_vecs, sitem_vecs])

y_pu = scalerTarget.inverse_transform(y_p)

# print(y_pu)

sorted_index = np.argsort(-y_pu.reshape(-1))
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs_with_index.loc[sorted_index]

print(movie_list.loc[sorted_items[0]])

(1, 14)
                                                     title  \
movieId                                                      
8368       Harry Potter and the Prisoner of Azkaban (2004)   
5952         Lord of the Rings: The Two Towers, The (2002)   
40815           Harry Potter and the Goblet of Fire (2005)   
81834    Harry Potter and the Deathly Hallows: Part 1 (...   
98809            Hobbit: An Unexpected Journey, The (2012)   
...                                                    ...   
5266                                     Panic Room (2002)   
6595                                       S.W.A.T. (2003)   
4167                                     15 Minutes (2001)   
6550                                 Johnny English (2003)   
81229                                           Red (2010)   

                             genres  
movieId                              
8368              Adventure|Fantasy  
5952              Adventure|Fantasy  
40815    Adventure|Fantasy|Thrill

In [68]:
# For existing user lets say id 15 we do the same process and compare it with the existing y_train
# We also need to find the user vec here from our user csv and tile it
user_train=pd.read_csv('content_user_train.csv',header=None)
# print(user_train)

dbfile = open('content_user_to_genre.pickle', 'rb')
db = pickle.load(dbfile)

uid=15
user_vec=[]

rated_by=db[uid]["movies"]
# print(rated_by)

for i in range(user_train.shape[0]):
  if user_train.iloc[i][0]==uid:
    user_vec=user_train.iloc[i]

user_vecs = np.tile(user_vec[3:], (num_items, 1))

# print(user_vecs.shape)

suser_vecs = scalerUser.transform(user_vecs)

y_p = model.predict([suser_vecs, sitem_vecs])

y_pu = scalerTarget.inverse_transform(y_p)

sorted_index = np.argsort(-y_pu.reshape(-1))
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs_with_index.loc[sorted_index]

# print(movie_list.loc[sorted_items[0]])
# print(sorted_items[0].shape[0])

for i in range(sorted_items[0].shape[0]):
  title=movie_list.loc[sorted_items[0][i]]["title"]
  if sorted_items[0][i] in rated_by:
    print(f"The movie {title} has predicted rating {sorted_ypu[i]} and actual rating {rated_by[sorted_items[0][i]]}")
  else:
    print(f"The movie {title} has predicted rating {sorted_ypu[i]}")

# result=[]
# print(sorted_ypu)

The movie Save the Last Dance (2001) has predicted rating [4.543369]
The movie Wedding Planner, The (2001) has predicted rating [4.506286]
The movie Hannibal (2001) has predicted rating [4.5019274]
The movie Saving Silverman (Evil Woman) (2001) has predicted rating [4.4961147]
The movie Down to Earth (2001) has predicted rating [4.49541]
The movie Mexican, The (2001) has predicted rating [4.492109]
The movie 15 Minutes (2001) has predicted rating [4.428908]
The movie Enemy at the Gates (2001) has predicted rating [4.4033356]
The movie Heartbreakers (2001) has predicted rating [4.3997154]
The movie Spy Kids (2001) has predicted rating [4.3873963]
The movie Along Came a Spider (2001) has predicted rating [4.384523]
The movie Blow (2001) has predicted rating [4.3840675]
The movie Bridget Jones's Diary (2001) has predicted rating [4.3837423]
The movie Joe Dirt (2001) has predicted rating [4.36357]
The movie Crocodile Dundee in Los Angeles (2001) has predicted rating [4.347271]
The movie Mu

In [69]:
def sq_dist(a,b):
    d=(np.linalg.norm(a-b))**2
    return d

In [70]:
# To find items similar to a given item we pass the v_m of all items one by one along with the v_m of to whom we want similar items to the above function
# Then we get the required no of items with smallest distance

# OR

# We can directly create a distance matrix between all items once item is trained which can be used to make recommendations
# For this we need the item vec for each item for this we make another network model mimicking item_NN

In [71]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))
vm_m = item_NN(input_item_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 16)]              0         
                                                                 
 sequential_3 (Sequential)   (None, 32)                41376     
                                                                 
 tf.math.l2_normalize_8 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 41376 (161.62 KB)
Trainable params: 41376 (161.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [72]:
# Then we pass the item_vector which will be the matrix consisting of all the features of all the items as in item_train to obtain the v_m of all the movies after scaling it

scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs)
print(vms)

[[-0.23094149  0.03424974 -0.07746395 ...  0.16041799  0.07380514
   0.04663192]
 [-0.33621916  0.02558675 -0.06370818 ...  0.11614625  0.00463033
   0.02654301]
 [-0.22976854  0.20899443 -0.10060906 ...  0.06000428  0.21577398
  -0.05218875]
 ...
 [-0.34601504 -0.26502442  0.01434084 ...  0.16600609 -0.08263359
   0.06651987]
 [-0.41391793  0.09882239  0.09683219 ...  0.03524744 -0.09905239
   0.14889987]
 [-0.343359   -0.20091392  0.07435548 ... -0.06800257 -0.27845266
  -0.0079408 ]]


In [73]:
# This precomputation of vms can also be used to make faster recommendations to users(new or existing) as we now will only need to make their own network(carry out inference) like
# we did for vm and then only run it to get the vu for that specific user(which will be a vector as it will be for a specific user(existing or new)) and then do matmul to get prob

input_user_u = tf.keras.layers.Input(shape=(num_user_features))
vu_u = user_NN(input_user_u)
vu_u = tf.linalg.l2_normalize(vu_u, axis=1)
model_u = tf.keras.Model(input_user_u, vu_u)
model_u.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 14)]              0         
                                                                 
 sequential_2 (Sequential)   (None, 32)                40864     
                                                                 
 tf.math.l2_normalize_9 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 40864 (159.62 KB)
Trainable params: 40864 (159.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
user_vec = np.array([[0.0,5.0,0.0,0.0,0.0, 0.0, 0.0,0.0, 5.0, 0.0, 0.0,0.0, 0.0, 0.0]])
num_items=item_vecs.shape[0]

suser_vecs = scalerUser.transform(user_vec)

vu = model_u.predict(suser_vecs)

y_p=np.matmul(vu,vms.T)

y_pu = scalerTarget.inverse_transform(y_p)

# print(y_pu.reshape(-1).shape)

sorted_index = np.argsort(-y_pu.reshape(-1))
# print(sorted_index)
sorted_items = item_vecs_with_index.loc[sorted_index]

print(movie_list.loc[sorted_items[0]])

                                                     title  \
movieId                                                      
8368       Harry Potter and the Prisoner of Azkaban (2004)   
5952         Lord of the Rings: The Two Towers, The (2002)   
40815           Harry Potter and the Goblet of Fire (2005)   
81834    Harry Potter and the Deathly Hallows: Part 1 (...   
98809            Hobbit: An Unexpected Journey, The (2012)   
...                                                    ...   
5266                                     Panic Room (2002)   
6595                                       S.W.A.T. (2003)   
4167                                     15 Minutes (2001)   
6550                                 Johnny English (2003)   
81229                                           Red (2010)   

                             genres  
movieId                              
8368              Adventure|Fantasy  
5952              Adventure|Fantasy  
40815    Adventure|Fantasy|Thriller  
818

In [84]:
# Then we make the distance matrix between each v_m
dim=len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])

index_values=movie_list.index.values
dist_df = pd.DataFrame(data = dist,index = index_values,columns=index_values)

print(dist_df)
# Then we can take from each row or each column minimum k distance items which wil be most similar to the respective row or column where k is the number of similar items we want to a particular item

          4054      4069      4148      4149      4153      4161      4167    \
4054    0.000000  0.128373  1.282353  0.074446  0.180492  0.381402  0.550592   
4069    0.128373  0.000000  1.419855  0.035299  0.252390  0.212081  0.612380   
4148    1.282353  1.419855  0.000000  1.304273  1.427146  0.977582  0.608636   
4149    0.074446  0.035299  1.304273  0.000000  0.120776  0.234413  0.546392   
4153    0.180492  0.252390  1.427146  0.120776  0.000000  0.475693  0.768363   
...          ...       ...       ...       ...       ...       ...       ...   
174055  1.762136  1.605538  1.287718  1.670968  1.890793  0.953057  1.063393   
176371  1.628555  1.481684  2.371672  1.699386  1.981351  1.614332  2.085173   
177765  1.836944  1.643760  1.880828  1.837492  2.175137  1.252472  1.518915   
179819  1.183848  0.829532  1.978093  0.986989  1.119154  0.755242  1.787571   
187593  1.740685  1.474568  2.139864  1.721564  2.037920  1.350595  1.738996   

          4223      4228      4232    .

In [96]:
# For example for movieId 6323 Identity (2003) if we want to find similar movies

print(movie_list.loc[6323])
print("\n")
# print(dist_df.loc[6323])

sorted_index = np.argsort(dist_df.loc[6323])
# print(sorted_index)
sorted_items = item_vecs_with_index.loc[sorted_index]
indices=sorted_items[0]
print(movie_list.loc[indices[1:]])

title                   Identity (2003)
genres    Crime|Horror|Mystery|Thriller
Name: 6323, dtype: object


                                                     title  \
movieId                                                      
4865                                      From Hell (2001)   
5445                                Minority Report (2002)   
99007                                   Warm Bodies (2013)   
81564                                      Megamind (2010)   
4720                                    Others, The (2001)   
...                                                    ...   
40815           Harry Potter and the Goblet of Fire (2005)   
41566    Chronicles of Narnia: The Lion, the Witch and ...   
137857                              The Jungle Book (2016)   
59501     Chronicles of Narnia: Prince Caspian, The (2008)   
4247                                       Joe Dirt (2001)   

                                          genres  
movieId                           

In [103]:
# Now for new movie we construct the new movie vector and then run it through the cloned movies model and get its vm and find minimum distance of it from all other movies to find movies related to it
# and then add it to all other vms to suggest it to users

new_item_vec=np.array([[2023,3.65,0.0,1.0,0.0,0.0,0.0, 0.0, 0.0,0.0, 1.0, 0.0, 0.0,0.0, 0.0, 0.0]])

scaled_new_item_vec = scalerItem.transform(new_item_vec)
new_item_vm = model_m.predict(scaled_new_item_vec)
# print(new_item_vm.shape)

# print(vms.shape)

dist_new_item=np.zeros(vms.shape[0])
for i in range(vms.shape[0]):
  dist_new_item[i] = sq_dist(vms[i, :], new_item_vm)

# print(dist_new_item)
sorted_index = np.argsort(dist_new_item)
# print(sorted_index)
sorted_items = item_vecs_with_index.loc[sorted_index]
print(movie_list.loc[sorted_items[0]])

                                                     title  \
movieId                                                      
137857                              The Jungle Book (2016)   
118696    The Hobbit: The Battle of the Five Armies (2014)   
106489         Hobbit: The Desolation of Smaug, The (2013)   
98809            Hobbit: An Unexpected Journey, The (2012)   
4896     Harry Potter and the Sorcerer's Stone (a.k.a. ...   
...                                                    ...   
4148                                       Hannibal (2001)   
53953                                          1408 (2007)   
8947                                    Grudge, The (2004)   
39446                                        Saw II (2005)   
4876       Thirteen Ghosts (a.k.a. Thir13en Ghosts) (2001)   

                             genres  
movieId                              
137857      Adventure|Drama|Fantasy  
118696            Adventure|Fantasy  
106489            Adventure|Fantasy  
988