In [None]:
!pip install torchmetrics

import pandas as pd
import numpy as np
import random
from collections import OrderedDict
import datetime
import os 
from torchmetrics import MeanAbsoluteError

##\
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
# Pick the compute device once; later cells move tensors and modules onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Working on device: ", device)
# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile
urlretrieve("https://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
print("Extracting...")
# The context manager closes the archive handle even if extraction raises.
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()
print("Loading Files to Memory")


Working on device:  cpu
Downloading movielens data...
Extracting...
Loading Files to Memory


In [None]:
# Column names are passed explicitly through `names` for each .dat file.
movie_header = ["movieId", "title", "genres"]
user_header = ["userId", "gender", "age", "occupation", "zip"]
ratings_header = ["userId", "movieId", "rating", "timestamp"]

# The separator is the regex ':{2}' (two consecutive colons); a regex
# separator is why the python parsing engine is requested.
movie_pd = pd.read_csv(
    "ml-1m/movies.dat",
    sep=':{2}',
    names=movie_header,
    engine='python',
)
user_pd = pd.read_csv(
    "ml-1m/users.dat",
    sep=':{2}',
    names=user_header,
    engine='python',
)
ratings_pd = pd.read_csv(
    "ml-1m/ratings.dat",
    sep=':{2}',
    names=ratings_header,
    engine='python',
)

In [None]:
#DATA PREPROCESSING#
# NOTE: this cell rewrites the id columns in place, so re-running it without
# reloading the frames would remap ids that are already converted.
print("Coverting Movie Id")
# Convert movie ids to zero-based indices (the row position in movie_pd) so
# they can address embedding rows directly.
index_to_id = dict(enumerate(movie_pd['movieId']))
movie_id_to_index = {movie_id: idx for idx, movie_id in index_to_id.items()}
# Mapping through __getitem__ (instead of handing the dict to .map) keeps the
# fail-fast KeyError for an unknown id rather than silently producing NaN.
movie_pd['movieId'] = movie_pd['movieId'].map(movie_id_to_index.__getitem__)
ratings_pd['movieId'] = ratings_pd['movieId'].map(movie_id_to_index.__getitem__)

print("Coverting User Id")
# Convert user ids to zero-based indices, using a dedicated dict so movie and
# user ids (which overlap numerically) never share one lookup table.
user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_pd['userId'])}
user_pd['userId'] = user_pd['userId'].map(user_id_to_index.__getitem__)
ratings_pd['userId'] = ratings_pd['userId'].map(user_id_to_index.__getitem__)
# Legacy name kept for any later cell that still reads it; it resolves user ids.
id_to_index = user_id_to_index

#Get data lengths
movies_size = len(movie_pd)
# User indices are positions in user_pd, so the embedding table must cover
# every row of user_pd, including users that never appear in ratings_pd.
user_size = len(user_pd)
#normalize rating (min-max scaling to [0, 1])
rating_min = ratings_pd['rating'].min()
rating_span = ratings_pd['rating'].max() - rating_min
ratings_pd['rating'] = (ratings_pd['rating'] - rating_min) / rating_span
print("Preprocess Complete")




Coverting Movie Id
Coverting User Id
Preprocess Complete


In [None]:
# Load the previously saved train/test split from the mounted drive folder.
host_path="/content/drive/MyDrive/Samples/"
train_pd, test_pd = (
    pd.read_csv(host_path + file_name)
    for file_name in ('train_pd.csv', 'test_pd.csv')
)


In [None]:
# Hyper-parameters shared by the data loaders and the model.
args = dict(
    batch_size=512,      # samples per DataLoader batch
    embedding_size=25,   # width of each user / movie embedding vector
    epoch=1,             # default number of epochs for fit()
    lr=0.1,              # learning-rate setting
)


In [None]:
class ItemDataset(Dataset):
    """Map-style dataset yielding (movie_id, user_id, rating) tensors.

    Samples are read from the frame passed to the constructor, so separate
    train and test instances serve different rows.

    Args:
        data: DataFrame with "movieId", "userId" and "rating" columns.
            Assumes ids are already zero-based indices and ratings already
            on the training scale — TODO confirm for frames loaded from CSV.
    """

    def __init__(self, data):
        self.data = data
        # Convert each column once. Reading whole rows with .iloc[index] would
        # upcast the integer ids to float (the row also holds a float rating)
        # and costs a pandas lookup for every sample.
        self.movie_ids = torch.tensor(data["movieId"].to_numpy(), dtype=torch.long)
        self.user_ids = torch.tensor(data["userId"].to_numpy(), dtype=torch.long)
        self.labels = torch.tensor(data["rating"].to_numpy(), dtype=torch.float32)

    def __getitem__(self, index):
        """Return (movie_id, user_id, label) for the sample at position `index`."""
        return self.movie_ids[index], self.user_ids[index], self.labels[index]

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

# Training batches are reshuffled every epoch.
train_dataset = ItemDataset(train_pd)
train_loader = DataLoader(train_dataset, batch_size=args["batch_size"], shuffle=True)
# Evaluation batches keep a fixed order: shuffling adds nothing to evaluation
# and makes the per-batch averaged validation loss vary between runs.
test_dataset = ItemDataset(test_pd)
test_loader = DataLoader(test_dataset, batch_size=args["batch_size"], shuffle=False)





In [None]:
loss_vals = []  # training loss, appended once per epoch by RatingPredModel.fit
loss_validation = []  # validation loss, appended once per epoch by RatingPredModel.fit


class RatingPredModel(nn.Module):
    """Predict a rating from a (movie index, user index) pair.

    Each index is looked up in its own embedding table; the two vectors are
    concatenated and passed through an MLP that emits one value per pair.
    The module owns its optimizer (Adam), its loss (MSE) and its training and
    evaluation loops.

    Relies on notebook-level names: `user_size`, `movies_size`, `device`,
    `test_loader`, `loss_vals` and `loss_validation`.
    """

    def __init__(self, args):
        """Build the layers and the optimizer.

        Args:
            args: mapping that provides "embedding_size".
        """
        super(RatingPredModel, self).__init__()
        self.user_embed = nn.Embedding(user_size, args["embedding_size"], device=device)
        self.movie_embed = nn.Embedding(movies_size, args["embedding_size"], device=device)
        self.loss_fn = nn.MSELoss()
        self.mean_absolute_error = MeanAbsoluteError()

        self.fc1 = nn.Sequential(OrderedDict([
            ('ll2', nn.Linear(128, 256)),
            ("drop", nn.Dropout(p=0.5)),
            ('relu1', nn.ReLU()),
            ('norm', nn.BatchNorm1d(256)),
        ]))
        self.fc2 = nn.Sequential(OrderedDict([
            ('ll2', nn.Linear(256, 128)),
            ("drop", nn.Dropout(p=0.5)),
            ('relu1', nn.ReLU()),
            ('norm', nn.BatchNorm1d(128)),
        ]))
        self.fc3 = nn.Sequential(OrderedDict([
            ('ll2', nn.Linear(128, 64)),
            ("drop", nn.Dropout(p=0.5)),
            ('relu1', nn.ReLU()),
            ('norm', nn.BatchNorm1d(64)),
        ]))
        self.combined_mlp = nn.Sequential(OrderedDict([
            ('ll1', nn.Linear(args["embedding_size"] * 2, 128)),
            ("drop", nn.Dropout(p=0.5)),
            ('relu1', nn.ReLU()),
            ('norm', nn.BatchNorm1d(128)),
            ('fc1', self.fc1),
            ('fc2', self.fc2),
            ('fc3', self.fc3),
            ('llo', nn.Linear(64, 1)),
        ]))

        # Not used by forward(); kept so the registered submodules (and the
        # state_dict keys of aggregation_layer) stay the same as before.
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        self.aggregation_layer = torch.nn.Conv1d(in_channels=20, out_channels=1, kernel_size=1)

        # The optimizer must be created last: self.parameters() only yields
        # parameters registered so far, so building it before the MLP layers
        # would leave everything except the embeddings untrained.
        self.optimizer = optim.Adam(self.parameters())

    def forward(self, movie_id, user_id):
        """Return the MLP output for each (movie, user) pair.

        Args:
            movie_id: 1-D long tensor of movie indices.
            user_id: 1-D long tensor of user indices, same length.

        Returns:
            Tensor with one row per pair and a single column.
        """
        movie_vector = self.movie_embed(movie_id)
        user_vector = self.user_embed(user_id)

        #concat
        combined = torch.cat((movie_vector, user_vector), dim=1)
        return self.combined_mlp(combined)

    def one_epoch(self, train_loader):
        """Run one optimisation pass over `train_loader`.

        Prints the mean loss of every window of 100 batches.

        Returns:
            Mean loss of the last completed 100-batch window, or the mean over
            all batches when the loader yields fewer than 100.
        """
        running_loss = 0.
        last_loss = 0.
        batches_seen = 0
        for i, data in enumerate(train_loader):
            movie_id, user_id, label = data
            movie_id, user_id, label = movie_id.to(device), user_id.to(device), label.to(device)

            self.optimizer.zero_grad()
            # squeeze(-1) drops only the output column, so a batch holding a
            # single sample keeps its batch dimension and matches `label`.
            outputs = self(movie_id, user_id).squeeze(-1)
            loss = self.loss_fn(outputs, label)
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()
            batches_seen = i + 1
            if i % 100 == 99:
                last_loss = running_loss / 100 # loss per batch
                print('  batch {}/{} loss: {}'.format(i + 1, len(train_loader), last_loss))

                running_loss = 0.
        if 0 < batches_seen < 100:
            # No window completed, so running_loss still covers every batch.
            last_loss = running_loss / batches_seen
        return last_loss

    def fit(self, train_loader, epoch=args["epoch"], lr=None, val_loader=None):
        """Train for `epoch` epochs, validating and checkpointing after each.

        Args:
            train_loader: loader of (movie_id, user_id, label) training batches.
            epoch: number of epochs to run.
            lr: learning rate applied to the optimizer before training. With
                None (the default) the optimizer keeps its current rate, which
                is what happened for every call while this argument was
                silently ignored.
            val_loader: loader used for validation; defaults to the
                notebook-level `test_loader`.

        Side effects: appends to `loss_vals` / `loss_validation` and writes a
        state_dict file per epoch under the hard-coded weights directory.
        """
        if lr is not None:
            for group in self.optimizer.param_groups:
                group["lr"] = lr
        if val_loader is None:
            val_loader = test_loader

        for ep in range(epoch):
            print('EPOCH {}:'.format(ep + 1))
            self.train(True)
            avg_loss = self.one_epoch(train_loader)
            self.train(False)
            running_vloss = 0.0
            val_batches = 0
            # no_grad + .item(): validation must not build autograd graphs,
            # and summing loss tensors would keep every graph alive.
            with torch.no_grad():
                for vdata in val_loader:
                    vmovie_id, vuser_id, vlabels = vdata
                    vmovie_id, vuser_id, vlabels = vmovie_id.to(device), vuser_id.to(device), vlabels.to(device)

                    voutputs = self(vmovie_id, vuser_id).squeeze(-1)
                    running_vloss += self.loss_fn(voutputs, vlabels).item()
                    val_batches += 1
            avg_vloss = running_vloss / val_batches if val_batches else float('nan')
            print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
            loss_vals.append(avg_loss)
            loss_validation.append(avg_vloss)
            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
            model_path = '/content/drive/MyDrive/weights/movierec/simplenorm_{}_{}_{}_{}'.format(timestamp, ep+1 ,avg_loss, avg_vloss)
            # Save this instance, not whatever the global `model` points at.
            torch.save(self.state_dict(), model_path)

    def get_metrics(self, loader=None):
        """Print RMSE and MAE of the model over an evaluation loader.

        Both metrics are computed over all samples rather than by averaging
        per-batch values, so a smaller final batch does not skew them.

        Args:
            loader: evaluation loader; defaults to the notebook-level
                `test_loader`.
        """
        if loader is None:
            loader = test_loader

        was_training = self.training
        # Evaluate with dropout disabled and batch-norm running statistics.
        self.eval()
        # The metric object accumulates state across calls; start clean.
        self.mean_absolute_error.reset()
        squared_error_sum = 0.0
        sample_count = 0
        try:
            with torch.no_grad():
                for vdata in loader:
                    vmovie_id, vuser_id, vlabels = vdata
                    vmovie_id, vuser_id, vlabels = vmovie_id.to(device), vuser_id.to(device), vlabels.to(device)

                    voutputs = self(vmovie_id, vuser_id).squeeze(-1)
                    squared_error_sum += torch.sum((voutputs - vlabels) ** 2).item()
                    sample_count += vlabels.numel()
                    self.mean_absolute_error.update(voutputs, vlabels)
        finally:
            # Leave the module in whatever mode the caller had it in.
            self.train(was_training)

        if sample_count:
            rmse = (squared_error_sum / sample_count) ** 0.5
            mae = self.mean_absolute_error.compute().item()
        else:
            rmse = mae = float('nan')
        print('RMSE loss: {}\tMAE{} '.format(rmse, mae))

In [None]:
# Build the rating model from the shared hyper-parameters and move it to the selected device.
model = RatingPredModel(args).to(device)


In [None]:
# Restore weights saved by an earlier training run. strict=False tolerates
# missing or unexpected keys; the returned report lists any that were skipped.
PATH = "/content/drive/MyDrive/weights/movierec/simplenorm_20230331_181435_9_0.7740550613403321_0.4808712303638458"
model.load_state_dict(
    torch.load(PATH, map_location=device),
    strict=False,
)

<All keys matched successfully>

In [None]:
# Train for 10 epochs; fit() validates on test_loader and saves a checkpoint after every epoch.
model.fit(train_loader, epoch=10)


In [None]:
def ranking(query_liked):
    """Score every movie embedding by cosine similarity to a query.

    The query vector is the sum of the embedding rows selected by
    `query_liked`. Reads the notebook-level `model`.

    Args:
        query_liked: list of movie indices (rows of the movie embedding table).

    Returns:
        1-D tensor with one similarity per embedding row. A row (or query)
        with zero norm scores 0.0 instead of NaN, so it cannot float to the
        top of a descending sort.
    """
    weights = model.movie_embed.weight.detach().cpu().numpy()
    query_res = weights[query_liked].sum(axis=0)

    # One matrix-vector product replaces the per-movie Python loop, and the
    # query norm is computed once instead of once per movie.
    dots = weights @ query_res
    norms = np.linalg.norm(weights, axis=1) * np.linalg.norm(query_res)

    scores = np.zeros_like(dots)
    valid = norms > 0
    scores[valid] = dots[valid] / norms[valid]
    return torch.tensor(scores)
    
def display_top_k(score, indices, k=5):
    """Print a table with the first `k` entries of an already-sorted ranking.

    Args:
        score: 1-D tensor of scores, best first.
        indices: 1-D tensor of movie_pd index labels aligned with `score`.
        k: number of leading entries to show.
    """
    best_scores = score[:k]
    best_rows = [entry.item() for entry in indices[:k]]

    table = pd.DataFrame({
        "score_key": best_scores.numpy(),
        'titles': [movie_pd.loc[row]["title"] for row in best_rows],
        'genres': [movie_pd.loc[row]["genres"] for row in best_rows],
    })
    print(table)

In [None]:
# Print RMSE and MAE of the current model over test_loader.
model.get_metrics()

  movie_id = torch.tensor(ratings_pd.iloc[index]["movieId"], dtype=torch.long)
  user_id =  torch.tensor(ratings_pd.iloc[index]["userId"], dtype=torch.long)


RMSE loss: 0.884240984916687	MAE0.7217465043067932 


In [None]:
# Pick exactly one query. Values are zero-based rows of movie_pd, not raw
# MovieLens ids.
# query_liked = [0] #Toy Story
# query_liked = [224] # was labelled "Star Wars" — TODO confirm against movie_pd
query_liked = [257] # Star Wars: Episode IV - A New Hope (1977): a single-item query matches itself with score 1.0
# query_liked = [1192] 


topk = 10

print("===========COSINE SIMILARITY===================")
score, indices = ranking(query_liked).sort(descending=True)
display_top_k(score,indices, topk)


   score_key                                     titles  \
0   1.000000  Star Wars: Episode IV - A New Hope (1977)   
1   0.666918                    Army of Darkness (1993)   
2   0.638077                            Bad Boys (1995)   
3   0.617600                 Bedrooms & Hallways (1998)   
4   0.608532                 Looking for Richard (1996)   
5   0.588477                       Stealing Home (1988)   
6   0.577025                    Harold and Maude (1971)   
7   0.572028                              Picnic (1955)   
8   0.555117           Silence of the Lambs, The (1991)   
9   0.541753                         Clean Slate (1994)   

                                  genres  
0        Action|Adventure|Fantasy|Sci-Fi  
1  Action|Adventure|Comedy|Horror|Sci-Fi  
2                                 Action  
3                         Comedy|Romance  
4                      Documentary|Drama  
5                                  Drama  
6                                 Comedy  
7     

In [None]:
def get_metrics(k, thresh, user_predictions=None, true_ratings=None):
    """Compute per-user precision@k and recall@k.

    Each prediction entry is a (movie_id, estimate) pair. Relevance is judged
    against observed ratings, so only movies with a known true rating for
    that user take part in the calculation.

    Args:
        k: number of top-estimated rated movies treated as recommended.
        thresh: cut-off applied to both estimates and true ratings (assumes
            both are on the same scale — TODO confirm).
        user_predictions: mapping user -> list of (movie_id, estimate).
            Defaults to the notebook-level `predictions`. Each list is sorted
            in place by estimate, highest first.
        true_ratings: mapping (user, movie_id) -> observed rating. Defaults to
            a lookup built from every matching row of the notebook-level
            `ratings_pd`; pass held-out ratings to evaluate on unseen data
            only.

    Returns:
        (precisions, recalls): two dicts keyed by user. A user with nothing
        recommended gets precision 0; a user with nothing relevant gets
        recall 0.
    """
    if user_predictions is None:
        user_predictions = predictions
    if true_ratings is None:
        rated = ratings_pd[ratings_pd["userId"].isin(list(user_predictions))]
        true_ratings = dict(
            zip(zip(rated["userId"], rated["movieId"]), rated["rating"])
        )

    precisions = dict()
    recalls = dict()
    for user, scored in user_predictions.items():
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Pair each estimate with the observed rating; the movie id is only a
        # lookup key and must never be compared against the threshold.
        judged = [
            (est, true_ratings[(user, movie)])
            for movie, est in scored
            if (user, movie) in true_ratings
        ]
        top_k = judged[:k]

        n_rel = sum((true_r >= thresh) for (_, true_r) in judged)
        n_rec_k = sum((est >= thresh) for (est, _) in top_k)
        n_rel_and_rec_k = sum(
            ((true_r >= thresh) and (est >= thresh))
            for (est, true_r) in top_k
        )

        precisions[user] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[user] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls


# Needs `predictions` to be populated before this cell runs.
p, r = get_metrics(5, 0.365)

# Mean over users, expressed as a percentage.
avg_p = sum(p.values()) / len(p) * 100
avg_r = sum(r.values()) / len(r) * 100

print(f"Precision: {avg_p}\tRecall: {avg_r}")

Precision: 49.00000000000001	Recall: 66.33928571428571


In [None]:
from collections import defaultdict

# predictions[user] is a list of (movie_id, estimate) pairs covering every movie.
predictions = defaultdict(list)
model.eval()
users_subset = user_pd[:20]
all_movie_ids = torch.tensor(movie_pd['movieId'].to_numpy(), dtype=torch.long, device=device)
# Inference only: no autograd graphs, and one batched forward pass per user
# instead of one pass per (user, movie) pair.
with torch.no_grad():
    for userid, user in users_subset.iterrows():
        print(f"{userid+1}/{len(users_subset)}")
        uid = int(user['userId'])
        user_ids = torch.full_like(all_movie_ids, uid)
        estimates = model(all_movie_ids, user_ids).squeeze(-1).tolist()
        predictions[uid].extend(zip(all_movie_ids.tolist(), estimates))

1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
20/20
