## First Load Packages

In [None]:
!pip install torch-geometric

In [2]:
import pandas as pd
import numpy as np
import os
import tqdm
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch.optim import Adam
import torch_geometric
from torch_geometric.data import Data,HeteroData
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import dropout_adj
from sklearn.model_selection import train_test_split

## Let's load the data

In [None]:
MovieLens(root='.') #Skip if had an error

In [4]:
ratings=pd.read_csv('raw/ml-latest-small/ratings.csv')

In [5]:
ratings.drop_duplicates(inplace=True)

In [6]:
titles=pd.read_csv('raw/ml-latest-small/movies.csv')

In [7]:
titles.drop_duplicates(inplace=True,subset=['title'])

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
titles.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## basic Recommenation System

### basic Content-based Filtering based on Genres


In [10]:
genres=[]
for i in range(titles.shape[0]):
  genres.extend(titles.iloc[i]['genres'].split('|'))
genres=np.unique(genres)

In [11]:
genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [12]:
df=pd.DataFrame(np.zeros((titles.shape[0],len(genres))),index=titles['title'],columns=genres)

In [13]:
for i in tqdm.tqdm(range(titles.shape[0])):
  temp=titles.iloc[i]['genres'].split('|')
  for item in temp:
    df.loc[titles.iloc[i]['title'],item]=1

100%|██████████| 9737/9737 [00:04<00:00, 2386.33it/s]


In [14]:
def find_similar_movies(name):
  row=df[df.index==name].iloc[0]
  similarity=np.matmul(np.array(row),np.array(df.T))/np.sqrt((np.array(row)**2).sum())/np.sqrt((np.array(df)**2).sum(axis=1))
  new_df=titles.copy()
  new_df['sim']=similarity
  new_df=new_df.sort_values(by=['sim'],ascending=False)
  return new_df.iloc[:10]

In [15]:
find_similar_movies('Jumanji (1995)')

Unnamed: 0,movieId,title,genres,sim
6075,41566,"Chronicles of Narnia: The Lion, the Witch and ...",Adventure|Children|Fantasy,1.0
1617,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy,1.0
767,1009,Escape to Witch Mountain (1975),Adventure|Children|Fantasy,1.0
6751,59501,"Chronicles of Narnia: Prince Caspian, The (2008)",Adventure|Children|Fantasy,1.0
6629,56171,"Golden Compass, The (2007)",Adventure|Children|Fantasy,1.0
9336,160573,Pete's Dragon (2016),Adventure|Children|Fantasy,1.0
9294,158813,Alice Through the Looking Glass (2016),Adventure|Children|Fantasy,1.0
109,126,"NeverEnding Story III, The (1994)",Adventure|Children|Fantasy,1.0
3574,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,1.0
53,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy,1.0


## There is no specific way to evaluate such a model, but we try and see for each user based on one movie they have seen, have many recommended they have seen, and what is the average score they have given to those movies

In [30]:
results={'user':[],'number_of_movies_seen':[],'average_score':[],"highest_score":[]}
users=ratings.userId.unique()
for user in tqdm.tqdm(users):
  results['user'].append(user)
  user_liked_movies=ratings[(ratings.userId==user) & (ratings.rating==5)]
  if user_liked_movies.shape[0]>0:
    movieId=user_liked_movies.iloc[0]['movieId']
    movie_name=titles[titles.movieId==movieId].iloc[0]['title']
    similar_movies=list(find_similar_movies(movie_name)['movieId'])
    searched_movies=ratings[(ratings.userId==user) & (ratings.movieId.isin(similar_movies)) & (ratings.movieId!=movieId)] ## The already chosen movie excluded
    results['number_of_movies_seen'].append(searched_movies.shape[0])
    if searched_movies.shape[0]>0:
      results['average_score'].append(searched_movies.rating.mean())
      results['highest_score'].append(searched_movies.rating.max())
    else:
      results['average_score'].append(np.nan)
      results['highest_score'].append(np.nan)
  else:
    results['number_of_movies_seen'].append(np.nan)
    results['average_score'].append(np.nan)
    results['highest_score'].append(np.nan)
results=pd.DataFrame(results)

100%|██████████| 610/610 [00:05<00:00, 106.99it/s]


In [34]:
results.tail()

Unnamed: 0,user,number_of_movies_seen,average_score,highest_score
605,606,2.0,4.0,4.0
606,607,0.0,,
607,608,2.0,3.75,4.5
608,609,,,
609,610,3.0,3.666667,4.0


In [32]:
results[['number_of_movies_seen','average_score','highest_score']].describe()

Unnamed: 0,number_of_movies_seen,average_score,highest_score
count,573.0,145.0,145.0
mean,0.390925,3.803103,3.982759
std,0.832505,0.885082,0.876811
min,0.0,1.0,1.0
25%,0.0,3.166667,3.5
50%,0.0,4.0,4.0
75%,1.0,4.5,4.5
max,6.0,5.0,5.0


#### So for each user based on a single movie they have seen, they have given 3.8 in average to the recommended movies, if they have seen it. And, in average, they have given at least one of the movies, almost 4, if they have seen any of the recommended movies.
#### Also there are many other useful information in the above table that shows have good the output is.



## Now we aim for learning the embedding and use the rates given by people (Colborative)

### This is a little more complicated as it needs to be trained

In [None]:
df=ratings.copy()[['userId','movieId','rating']]

In [None]:
df['is_accepted']=(df['rating']>3).astype(float)

In [None]:
movieId_dict={i:j for j,i in enumerate(df['movieId'].unique())}
userId_dict={i:j for j,i in enumerate(df['userId'].unique())}

In [None]:

# start_movies_id=np.unique(ratings['userId']).max()+1
# ids=list(np.unique(ratings['userId']))
# ids.extend(list(np.unique(ratings['movieId'])+start_movies_id))
# connections=[[],[]]
# for i in tqdm.tqdm(range(ratings.shape[0])):
#   temp_x=ratings.iloc[i]['userId']
#   temp_y=ratings.iloc[i]['movieId']+start_movies_id
#   connections[0].append(temp_x)
#   connections[1].append(temp_y)
# data=Data(x='')

In [None]:
np.array(df[df['is_accepted']==0][['userId','movieId']])

array([[     1,     70],
       [     1,    223],
       [     1,    296],
       ...,
       [   610, 160571],
       [   610, 160836],
       [   610, 170875]])

In [None]:
data = HeteroData()
data['userId'].x=torch.tensor(np.unique(df['userId']))
data['movieId'].x=torch.tensor(np.unique(df['movieId']))
data['edge_index_positive']=torch.tensor(np.array(df[df['is_accepted']==1][['userId','movieId']]))
data['edge_index_negative']=torch.tensor(np.array(df[df['is_accepted']==0][['userId','movieId']]))

In [None]:
class Recommender1(torch.nn.Module):
    def __init__(
        self,
        num_users,
        num_items,
        embedding_size=128
    ):
        super(Recommender1, self).__init__()
        self.user_embedding = torch.nn.Embedding(
            num_embeddings=num_items,
            embedding_dim=embedding_size
        )
        
        self.item_embedding = torch.nn.Embedding(
            num_embeddings=num_items,
            embedding_dim=embedding_size
        )

    def forward(
        self,
        data
    ):
        user_embedding = F.dropout(F.relu(self.user_embedding(data['userId'].x)))
        item_embedding = self.item_embedding(data['movieId'].x)
        return user_embedding, item_embedding


In [None]:
ground_truth_interaction_matrix = torch.tensor(df.is_accepted, dtype=torch.float)

positive_samples = torch.where(ground_truth_interaction_matrix == 1)
negative_samples = torch.where(ground_truth_interaction_matrix == 0)

In [None]:
def loss_fn(
    user_embedding,
    item_embedding,
    data,
    n_samples=200,
    emphasis_on_positive_samples=0.5
):  
    tp=0
    fp=0
    tn=0
    fn=0
    loss=0
    for i in tqdm.tqdm(range(data['edge_index_positive'].size()[0])):
      # print(user_embedding[data['edge_index_positive'][i,0]] @ item_embedding[data['edge_index_positive'][i,1]].T)
      out=torch.sigmoid(user_embedding[userId_dict[data['edge_index_positive'][i,0].item()]] @ item_embedding[movieId_dict[data['edge_index_positive'][i,1].item()]].T)
      loss += F.binary_cross_entropy(
          out,torch.ones(()).to('cuda'))* emphasis_on_positive_samples
      if out>=0.5:
        tp+=1
      else:
        fp+=1
    # Negative samples
    for i in tqdm.tqdm(range(data['edge_index_negative'].size()[0])):
      out=torch.sigmoid(user_embedding[userId_dict[data['edge_index_negative'][i,0].item()]] @ item_embedding[movieId_dict[data['edge_index_negative'][i,1].item()]].T)
      loss += F.binary_cross_entropy(
          out,torch.zeros(()).to('cuda')) * (1 - emphasis_on_positive_samples)
      if out<=0.5:
        tn+=1
      else:
        fn+=1
    acc=(tp+tn)/(data['edge_index_positive'].size()[0]+data['edge_index_negative'].size()[0])
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    return loss,acc,precision,recall

In [None]:
def train(
    data,
    n_samples=200,
    epochs=200,
    emphasis_on_positive_samples=0.5,
):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Recommender1(
        num_users=torch.max(data['userId'].x)+1,
        num_items=torch.max(data['movieId'].x)+1,
        embedding_size=128
    ).to(device)

    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


    loss_log = []

    for epoch in range(epochs):
        user_embedding, item_embedding = model(
            data
        )
        
        loss,acc,precision,recall= loss_fn(
            user_embedding,
            item_embedding,
            data,
            n_samples=200,
            emphasis_on_positive_samples=emphasis_on_positive_samples,
        )
        loss_log.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch: {epoch}, Loss: {loss.item()}, "Acc:{acc}, "precision:{precision}, "recall:{recall}')

    return loss_log, model

In [None]:
train(data)

100%|██████████| 61716/61716 [00:19<00:00, 3106.95it/s]
100%|██████████| 39120/39120 [00:12<00:00, 3022.67it/s]


Epoch: 0, Loss: 326331.0, "Acc:0.5017454083858939, "precision:0.5026087238317454, "recall:0.6134601692904043


100%|██████████| 61716/61716 [00:18<00:00, 3285.52it/s]
100%|██████████| 39120/39120 [00:12<00:00, 3212.55it/s]


Epoch: 1, Loss: 322198.09375, "Acc:0.5023801023444008, "precision:0.5006643333981463, "recall:0.6147831277357739


100%|██████████| 61716/61716 [00:20<00:00, 3030.99it/s]
100%|██████████| 39120/39120 [00:11<00:00, 3307.48it/s]


Epoch: 2, Loss: 316114.28125, "Acc:0.5042841842199215, "precision:0.5043910817292112, "recall:0.616074255858138


 12%|█▏        | 7484/61716 [00:02<00:17, 3031.07it/s]


KeyboardInterrupt: ignored

### https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.RGCNConv.html 

### Now RGCN conv can be used to improve the model using edges for feature learning