In [None]:
!pip install sentence-transformers
!pip install torch-geometric

In [368]:
import pandas as pd
import numpy as np
import os
import tqdm
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch.optim import Adam
import torch_geometric
from torch_geometric.data import Data,HeteroData
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import dropout_adj
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [369]:
data=MovieLens(root='.') 
ratings=pd.read_csv('raw/ml-latest-small/ratings.csv')
ratings.drop_duplicates(inplace=True)
titles=pd.read_csv('raw/ml-latest-small/movies.csv')
titles.drop_duplicates(inplace=True,subset=['title'])

In [370]:
df=ratings.copy()[['userId','movieId','rating']]
movieId_dict={i:j for j,i in enumerate(df['movieId'].unique())}
userId_dict={i:j for j,i in enumerate(df['userId'].unique())}

In [371]:
class ML(Dataset):
    def __init__(self, df):
        self.users=np.unique(df['userId'])
        self.movies=np.unique(df['movieId'])
        self.edges=df
        self.edges.rating=(self.edges.rating/5).astype('float32')
        self.movieId_dict={i:j for j,i in enumerate(df['movieId'].unique())}
        self.userId_dict={i:j for j,i in enumerate(df['userId'].unique())}
    def __len__(self):
        return len(self.edges)

    def __getitem__(self, idx):
        row=self.edges.iloc[idx]
        userId=torch.tensor(self.userId_dict[row['userId']])
        movieId=torch.tensor(self.movieId_dict[row['movieId']])
        rating=torch.tensor(row['rating'])
        rating=rating.type(torch.cuda.FloatTensor)
        # rating.type('torch.FloatTensor')
        return {'userId':userId,
         'movieId': movieId,
         'rating': rating}


In [372]:
dataset=ML(df)

In [373]:
class Recommender1(torch.nn.Module):
    def __init__(
        self,
        num_users,
        num_items,
        embedding_size=128
    ):
        super(Recommender1, self).__init__()
        self.user_embedding = torch.nn.Embedding(
            num_embeddings=num_users,
            embedding_dim=embedding_size
        )
        
        self.item_embedding = torch.nn.Embedding(
            num_embeddings=num_items,
            embedding_dim=embedding_size
        )

    def forward(
        self,
        users,movies
    ):
        user_embedding = F.dropout(F.relu(self.user_embedding(users)))
        item_embedding = F.dropout(F.relu(self.item_embedding(movies)
        ))
        return user_embedding, item_embedding

     

In [374]:
data_loader=DataLoader(dataset,batch_size=32,shuffle=True)

In [375]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Recommender1(num_users=len(userId_dict),num_items=len(movieId_dict),embedding_size=128).to(device)

In [376]:
def loss_fn(user_embedding,movie_embedding,ratings):
    out=torch.sigmoid((user_embedding*movie_embedding).sum(dim=-1))
    loss = F.binary_cross_entropy(out, ratings)
    mae = F.l1_loss(out, ratings)
    return loss,mae

In [377]:
def train(num_epochs=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_mae = 0.0
        for batch in tqdm.tqdm(data_loader):
            userId=batch['userId'].to(device)
            movieId=batch['movieId'].to(device)
            rating=batch['rating'].to(device)
            optimizer.zero_grad()

            user_embedding, item_embedding = model(userId, movieId)
            loss , mae= loss_fn(user_embedding, item_embedding,rating)
            train_loss += loss.item()
            train_mae += mae.item()

            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(data_loader)
        avg_mae = train_mae / len(data_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, mae: {avg_mae:.4f}")



In [378]:
train(20)

100%|██████████| 3152/3152 [00:35<00:00, 89.48it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.87it/s]
100%|██████████| 3152/3152 [00:35<00:00, 88.93it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.97it/s]
100%|██████████| 3152/3152 [00:34<00:00, 91.71it/s]
100%|██████████| 3152/3152 [00:34<00:00, 90.80it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.96it/s]
100%|██████████| 3152/3152 [00:34<00:00, 90.65it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.72it/s]
100%|██████████| 3152/3152 [00:35<00:00, 90.05it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.96it/s]
100%|██████████| 3152/3152 [00:34<00:00, 91.18it/s]
100%|██████████| 3152/3152 [00:35<00:00, 89.90it/s]
100%|██████████| 3152/3152 [00:34<00:00, 90.66it/s]
 36%|███▋      | 1149/3152 [00:12<00:21, 92.18it/s]


Epoch 1/20, Train Loss: 15.8275, mae: 0.2953
Epoch 2/20, Train Loss: 10.2382, mae: 0.2823
Epoch 3/20, Train Loss: 6.5506, mae: 0.2624
Epoch 4/20, Train Loss: 4.4130, mae: 0.2449
Epoch 5/20, Train Loss: 3.1784, mae: 0.2316
Epoch 6/20, Train Loss: 2.4334, mae: 0.2229
Epoch 7/20, Train Loss: 1.9316, mae: 0.2149
Epoch 8/20, Train Loss: 1.6448, mae: 0.2093
Epoch 9/20, Train Loss: 1.4246, mae: 0.2058
Epoch 10/20, Train Loss: 1.2768, mae: 0.2033
Epoch 11/20, Train Loss: 1.1631, mae: 0.2008
Epoch 12/20, Train Loss: 1.0721, mae: 0.1988
Epoch 13/20, Train Loss: 0.9885, mae: 0.1973
Epoch 14/20, Train Loss: 0.9328, mae: 0.1960


KeyboardInterrupt: 

In [379]:
embeddings=(model.item_embedding(torch.tensor(range(len(movieId_dict))).to('cuda'))).detach().cpu().numpy()

In [380]:
titles_2=titles.copy()
titles_2.index=titles_2.movieId
df['name_']=df.movieId.apply(lambda x:titles_2.loc[x]['title'] if x in titles_2.index else 'no_name')


In [381]:
movies_=pd.DataFrame({'movieId':list(dataset.movieId_dict.keys())})

movies_['name_']=movies_.movieId.apply(lambda x:titles_2.loc[x]['title'] if x in titles_2.index else 'no_name')

In [382]:
def get_similarity(name):
  row_movie=df[df.name_.apply(lambda x: name in x.lower() )].iloc[0]
  movieId=dataset.movieId_dict[row_movie['movieId']]
  row=embeddings[movieId]
  similarity=np.matmul(row,embeddings.T)/np.sqrt((row**2).sum())/np.sqrt((embeddings**2).sum(axis=1))
  new_df=movies_.copy()
  new_df['sim']=similarity
  new_df=new_df.sort_values(by='sim',ascending=False)
  return new_df.iloc[:20]

In [384]:
get_similarity('harry potter')

Unnamed: 0,movieId,name_,sim
457,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,1.0
1258,47491,Adam's Apples (Adams æbler) (2005),0.344224
1052,50872,Ratatouille (2007),0.317355
12,223,Clerks (1994),0.304903
734,2717,Ghostbusters II (1989),0.299852
465,253,Interview with the Vampire: The Vampire Chroni...,0.298314
1812,4299,"Knight's Tale, A (2001)",0.297458
531,165,Die Hard: With a Vengeance (1995),0.297325
6289,60522,"Machine Girl, The (Kataude mashin gâru) (2008)",0.295369
6699,3330,Splendor in the Grass (1961),0.295124
