In [33]:
!pip install pickle5



In [34]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle5 as pickle

In [35]:
item_train=pd.read_csv('content_item_train.csv',header=None)
user_train=pd.read_csv('content_user_train.csv',header=None)
y_train=pd.read_csv('content_y_train.csv',header=None)

item_train=item_train.drop([0], axis=1)
user_train=user_train.drop([0,1,2], axis=1)
# print(y_train)

In [36]:
num_user_features = user_train.shape[1]
num_item_features = item_train.shape[1]

# We scale the users and items by standard scaler of scikit learn and scale y or ratings also but with min max scaler

item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.to_numpy().reshape(-1, 1))
y_train = scalerTarget.transform(y_train.to_numpy().reshape(-1, 1))
# ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

# As we have set same random_state then all of them will be similarly shuffled

item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)

In [37]:
# We use here keras functional API which lends us more flexibility to make layers with non-linear topology and multiple i/p and o/p

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256,activation='relu'),
    tf.keras.layers.Dense(units=128,activation='relu'),
    tf.keras.layers.Dense(units=32)
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256,activation='relu'),
    tf.keras.layers.Dense(units=128,activation='relu'),
    tf.keras.layers.Dense(units=32)
])
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1) # o/p=x/sqrt(max(sum(x**2),epsilon))

input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

output = tf.keras.layers.Dot(axes=1)([vu, vm])

model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 14)]                 0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 16)]                 0         []                            
                                                                                                  
 sequential_2 (Sequential)   (None, 32)                   40864     ['input_7[0][0]']             
                                                                                                  
 sequential_3 (Sequential)   (None, 32)                   41376     ['input_8[0][0]']             
                                                                                            

In [38]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),loss=tf.keras.losses.MeanSquaredError()) # Mostly dont need to change the learning rate as adam so change automatically
model.fit([user_train, item_train], y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7d91c2817640>

In [39]:
model.evaluate([user_test, item_test], y_test) # Check if the error is comparable to training error or the model would have overfit



0.08110948652029037

In [40]:
item_vecs=pd.read_csv('content_item_vecs.csv',header=None)
item_vecs_with_index=pd.read_csv('content_item_vecs.csv',header=None)
item_vecs=item_vecs.drop([0],axis=1)
movie_list=pd.read_csv('content_movie_list.csv')

movie_list.set_index("movieId", drop=True, inplace=True)
# movie_dict = movie_list.to_dict(orient="index")

print(movie_list)
print(item_vecs)

                                        title                           genres
movieId                                                                       
4054               Save the Last Dance (2001)                    Drama|Romance
4069              Wedding Planner, The (2001)                   Comedy|Romance
4148                          Hannibal (2001)                  Horror|Thriller
4149     Saving Silverman (Evil Woman) (2001)                   Comedy|Romance
4153                     Down to Earth (2001)           Comedy|Fantasy|Romance
...                                       ...                              ...
174055                         Dunkirk (2017)            Action|Drama|Thriller
176371               Blade Runner 2049 (2017)                           Sci-Fi
177765                            Coco (2017)     Adventure|Animation|Children
179819        Star Wars: The Last Jedi (2017)  Action|Adventure|Fantasy|Sci-Fi
187593                      Deadpool 2 (2018)       

In [41]:
# To make predictions for a new user we first construct the user vector with all fields as user_train
# We also take the items vector which has all or some(retrieval step) of movies(all or some of v_m) in the training set
# Then we normalize both of them using standard scaler
# Then we pass both of them through model.predict() but before that we also have to expand the user vec to the size of item vec to pass in the network
# We inverse transform the predicted value and then output the items with highest prediction ratings scaler.inverse_transform(y_new)

user_vec = np.array([[0.0,5.0,0.0,0.0,0.0, 0.0, 0.0,0.0, 5.0, 0.0, 0.0,0.0, 0.0, 0.0]])
print(user_vec.shape)
num_items=item_vecs.shape[0]
user_vecs = np.tile(user_vec, (num_items, 1))

suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

y_p = model.predict([suser_vecs, sitem_vecs])

y_pu = scalerTarget.inverse_transform(y_p)

# print(y_pu)

sorted_index = np.argsort(-y_pu.reshape(-1))
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs_with_index.loc[sorted_index]

print(movie_list.loc[sorted_items[0]])

(1, 14)
                                                     title  \
movieId                                                      
54001     Harry Potter and the Order of the Phoenix (2007)   
98809            Hobbit: An Unexpected Journey, The (2012)   
8368       Harry Potter and the Prisoner of Azkaban (2004)   
4896     Harry Potter and the Sorcerer's Stone (a.k.a. ...   
106489         Hobbit: The Desolation of Smaug, The (2013)   
...                                                    ...   
4386                                    Cats & Dogs (2001)   
6958                           Haunted Mansion, The (2003)   
32031                                        Robots (2005)   
8373                            Stepford Wives, The (2004)   
49274                                    Happy Feet (2006)   

                                                    genres  
movieId                                                     
54001                              Adventure|Drama|Fantasy  
98

In [42]:
# For existing user lets say id 15 we do the same process and compare it with the existing y_train
# We also need to find the user vec here from our user csv and tile it
user_train=pd.read_csv('content_user_train.csv',header=None)
# print(user_train)

dbfile = open('content_user_to_genre.pickle', 'rb')
db = pickle.load(dbfile)

uid=15
user_vec=[]

rated_by=db[uid]["movies"]
# print(rated_by)

for i in range(user_train.shape[0]):
  if user_train.iloc[i][0]==uid:
    user_vec=user_train.iloc[i]

user_vecs = np.tile(user_vec[3:], (num_items, 1))

# print(user_vecs.shape)

suser_vecs = scalerUser.transform(user_vecs)

y_p = model.predict([suser_vecs, sitem_vecs])

y_pu = scalerTarget.inverse_transform(y_p)

sorted_index = np.argsort(-y_pu.reshape(-1))
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs_with_index.loc[sorted_index]

# print(movie_list.loc[sorted_items[0]])
# print(sorted_items[0].shape[0])

for i in range(sorted_items[0].shape[0]):
  title=movie_list.loc[sorted_items[0][i]]["title"]
  if sorted_items[0][i] in rated_by:
    print(f"The movie {title} has predicted rating {sorted_ypu[i]} and actual rating {rated_by[sorted_items[0][i]]}")
  else:
    print(f"The movie {title} has predicted rating {sorted_ypu[i]}")

# result=[]
# print(sorted_ypu)

The movie Save the Last Dance (2001) has predicted rating [4.5289345]
The movie Wedding Planner, The (2001) has predicted rating [4.4401]
The movie Hannibal (2001) has predicted rating [4.420075]
The movie Saving Silverman (Evil Woman) (2001) has predicted rating [4.37271]
The movie Down to Earth (2001) has predicted rating [4.250629]
The movie Mexican, The (2001) has predicted rating [4.2313957]
The movie 15 Minutes (2001) has predicted rating [4.197208]
The movie Enemy at the Gates (2001) has predicted rating [4.124496]
The movie Heartbreakers (2001) has predicted rating [4.123955]
The movie Spy Kids (2001) has predicted rating [4.119831]
The movie Along Came a Spider (2001) has predicted rating [4.097034]
The movie Blow (2001) has predicted rating [4.092375]
The movie Bridget Jones's Diary (2001) has predicted rating [4.087576]
The movie Joe Dirt (2001) has predicted rating [4.084279]
The movie Crocodile Dundee in Los Angeles (2001) has predicted rating [4.0795884]
The movie Mummy R

In [43]:
def sq_dist(a,b):
    d=(np.linalg.norm(a-b))**2
    return d

In [44]:
# To find items similar to a given item we pass the v_m of all items one by one along with the v_m of to whom we want similar items to the above function
# Then we get the required no of items with smallest distance

# OR

# We can directly create a distance matrix between all items once item is trained which can be used to make recommendations
# For this we need the item vec for each item for this we make another network model mimicking item_NN

In [45]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))
vm_m = item_NN(input_item_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 16)]              0         
                                                                 
 sequential_3 (Sequential)   (None, 32)                41376     
                                                                 
 tf.math.l2_normalize_7 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 41376 (161.62 KB)
Trainable params: 41376 (161.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
# Then we pass the item_vector which will be the matrix consisting of all the features of all the items as in item_train to obtain the v_m of all the movies after scaling it

scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs)
print(vms)

[[ 0.08939263 -0.10368168 -0.03426018 ...  0.08620497 -0.01695278
  -0.1347203 ]
 [ 0.10606884  0.0608131   0.0881143  ...  0.2198571   0.14163622
  -0.20064487]
 [ 0.18678407 -0.33190748 -0.40716535 ...  0.25558203 -0.00437876
  -0.2651303 ]
 ...
 [ 0.08762054  0.15172888  0.24749033 ... -0.01925861 -0.1874625
   0.17375723]
 [ 0.16485946  0.06454979  0.02477586 ... -0.09305555  0.19359736
  -0.03531052]
 [ 0.1829373   0.13197045  0.14020656 ...  0.047936    0.13337064
   0.00806157]]


In [47]:
# This precomputation of vms can also be used to make faster recommendations to users(new or existing) as we now will only need to make their own network(carry out inference) like
# we did for vm and then only run it to get the vu for that specific user(which will be a vector as it will be for a specific user(existing or new)) and then do matmul to get prob

input_user_u = tf.keras.layers.Input(shape=(num_user_features))
vu_u = user_NN(input_user_u)
vu_u = tf.linalg.l2_normalize(vu_u, axis=1)
model_u = tf.keras.Model(input_user_u, vu_u)
model_u.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 14)]              0         
                                                                 
 sequential_2 (Sequential)   (None, 32)                40864     
                                                                 
 tf.math.l2_normalize_8 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 40864 (159.62 KB)
Trainable params: 40864 (159.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [51]:
user_vec = np.array([[0.0,5.0,0.0,0.0,0.0, 0.0, 0.0,0.0, 5.0, 0.0, 0.0,0.0, 0.0, 0.0]])
num_items=item_vecs.shape[0]

suser_vecs = scalerUser.transform(user_vec)

vu = model_u.predict(suser_vecs)

y_p=np.matmul(vu,vms.T)

y_pu = scalerTarget.inverse_transform(y_p)

# print(y_pu.reshape(-1).shape)

sorted_index = np.argsort(-y_pu.reshape(-1))
# print(sorted_index)
sorted_items = item_vecs_with_index.loc[sorted_index]

print(movie_list.loc[sorted_items[0]])

                                                     title  \
movieId                                                      
54001     Harry Potter and the Order of the Phoenix (2007)   
98809            Hobbit: An Unexpected Journey, The (2012)   
8368       Harry Potter and the Prisoner of Azkaban (2004)   
4896     Harry Potter and the Sorcerer's Stone (a.k.a. ...   
106489         Hobbit: The Desolation of Smaug, The (2013)   
...                                                    ...   
4386                                    Cats & Dogs (2001)   
6958                           Haunted Mansion, The (2003)   
32031                                        Robots (2005)   
8373                            Stepford Wives, The (2004)   
49274                                    Happy Feet (2006)   

                                                    genres  
movieId                                                     
54001                              Adventure|Drama|Fantasy  
98809     

In [52]:
# Then we make the distance matrix between each v_m
dim=len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])
print(dist)
# Then we can take from each row or each column minimum k distance items which wil be most similar to the respective row or column where k is the number of similar items we want to a particular item

[[0.         0.21072638 1.43609347 ... 1.86326701 1.25103824 1.69387143]
 [0.21072638 0.         1.54598095 ... 1.72965407 1.20960905 1.30378929]
 [1.43609347 1.54598095 0.         ... 2.41370763 1.93884556 1.88789979]
 ...
 [1.86326701 1.72965407 2.41370763 ... 0.         1.07828581 0.44328659]
 [1.25103824 1.20960905 1.93884556 ... 1.07828581 0.         0.65597931]
 [1.69387143 1.30378929 1.88789979 ... 0.44328659 0.65597931 0.        ]]


In [53]:
print(dist.max())

3.4634956022848797
