In [None]:
!pip install torchmetrics

import pandas as pd
import numpy as np
import random
from collections import OrderedDict,defaultdict
import datetime
import os 

##\
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchmetrics import MeanAbsoluteError

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Working on device: ", device)
# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile
urlretrieve("https://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
print("Extracting...")
# The context manager closes the archive handle as soon as extraction is done
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()
print("Loading Files to Memory")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4
Working on device:  cpu
Downloading movielens data...
Extracting...
Loading Files to Memory


In [None]:
# Drop the final byte of movies.dat, but only when that byte is a newline.
# NOTE(review): the original removed the last byte unconditionally, presumably
# to get rid of the file's trailing newline - confirm that was the intent.
# Binary mode keeps every offset a plain byte position, and checking the byte
# first means re-running this cell can never eat into the last record.
with open("ml-1m/movies.dat", "rb+") as file:
    size = file.seek(0, os.SEEK_END)
    # Same guard as before: files of one byte or less are left alone
    if size > 1:
        file.seek(size - 1, os.SEEK_SET)
        if file.read(1) == b"\n":
            file.truncate(size - 1)

In [None]:
# Column names for the three "::"-delimited MovieLens tables (the files carry no header row)
movie_header = ["movieId","title","genres"]
# latin-1 maps every byte to a character, so titles holding non-ASCII bytes
# cannot raise a decode error; ASCII text is read exactly as before.
movie_pd = pd.read_csv("ml-1m/movies.dat",names=movie_header, sep=':{2}', engine='python', encoding='latin-1')
user_header=["userId", "gender", "age", "occupation", "zip"]
user_pd = pd.read_csv("ml-1m/users.dat", names=user_header, sep=':{2}', engine='python')
ratings_header = ["userId","movieId","rating","timestamp"]
ratings_pd = pd.read_csv("ml-1m/ratings.dat", names=ratings_header, sep=':{2}', engine='python')
# Kept as an ordered list (not a set): genre ids are derived by enumerating this
# collection, and a set of strings iterates in a different order in every
# Python process. The empty genre comes first so that it receives id 0, the
# value used to zero-pad genre-id vectors.
genere_list = ["","Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]
#Get data lengths
movies_size= len(movie_pd)
user_size = ratings_pd["userId"].nunique()
genere_size = len(genere_list)


In [None]:
#DATA PREPROCESSING#
print("Coverting Movie Id")
# Map raw MovieLens movie ids to zero-based indices (the row position in movie_pd).
# id_to_index and index_to_id stay exact inverses of each other for movies.
raw_movie_ids = movie_pd['movieId'].tolist()
id_to_index = {raw_id: row for row, raw_id in enumerate(raw_movie_ids)}
index_to_id = dict(enumerate(raw_movie_ids))
# __getitem__ (rather than passing the dict) keeps the KeyError on an unknown id
movie_pd['movieId'] = movie_pd['movieId'].map(id_to_index.__getitem__)
ratings_pd['movieId'] = ratings_pd['movieId'].map(id_to_index.__getitem__)

print("Coverting User Id")
# Users get their own lookup table; writing them into id_to_index would
# overwrite the movie entries that share the same raw id values.
user_id_to_index = {raw_id: row for row, raw_id in enumerate(user_pd['userId'].tolist())}
user_pd['userId'] = user_pd['userId'].map(user_id_to_index.__getitem__)
ratings_pd['userId'] = ratings_pd['userId'].map(user_id_to_index.__getitem__)

#Get Genere dicts
# Enumerate a sorted copy so every run assigns the same id to the same genre,
# whatever container genere_list is. The empty string sorts first and therefore
# gets id 0, the value used to zero-pad genre-id vectors.
ordered_generes = sorted(genere_list)
genere_to_id = {genere: idx for idx, genere in enumerate(ordered_generes)}
id_to_genere = dict(enumerate(ordered_generes))

#Get data lengths
movies_size= len(movie_pd)
user_size = ratings_pd["userId"].nunique()
genere_size = len(genere_list)
#normalize rating
# ratings_pd['rating'] = (ratings_pd['rating'] - ratings_pd['rating'].min()) / (ratings_pd['rating'].max() - ratings_pd['rating'].min())    
#normalize year
def tryconvert(movie):
    """Read the release year out of a title that ends in "(YYYY)".

    The four characters before the final one are parsed as an integer.
    When they cannot be sliced out or parsed, 2000 is returned instead.
    """
    fallback_year = 2000
    try:
        year_text = movie[-5:-1]
        year = int(year_text)
    except (ValueError, TypeError):
        year = fallback_year
    return year
# Derive a numeric release-year column from each title
movie_pd['year'] = movie_pd['title'].map(tryconvert)
# movie_pd['year'] = (movie_pd['year'] - movie_pd['year'].min()) / (movie_pd['year'].max() - movie_pd['year'].min())    
def encode_gender(gen):
    """Encode gender as an integer flag: 1 for "M", 0 for any other value."""
    return 1 if gen == "M" else 0
# Replace the gender letters with their integer flag
user_pd['gender'] = user_pd['gender'].map(encode_gender)
def encode_age(age):
    """Translate a MovieLens age code into a bucket number from 1 to 7.

    Buckets, in order: under 18, 18-24, 25-34, 35-44, 45-49, 50-55, 56+.
    Any value that is not one of the known codes maps to 0 (unknown).
    """
    # The 1-based position of a code in this tuple is its bucket number
    known_codes = (1, 18, 25, 35, 45, 50, 56)
    for bucket, code in enumerate(known_codes, start=1):
        if age == code:
            return bucket
    return 0
# Replace the age codes with their bucket number, then discard the unused zip column
user_pd['age'] = user_pd['age'].map(encode_age)
user_pd = user_pd.drop(columns="zip")
print("Preprocess Complete")




Coverting Movie Id
Coverting User Id
Preprocess Complete


In [None]:
# Display the ratings table as it stands after preprocessing
ratings_pd

Unnamed: 0,userId,movieId,rating,timestamp
0,0,1176,5,978300760
1,0,655,3,978302109
2,0,902,3,978301968
3,0,3339,4,978300275
4,0,2286,5,978824291
...,...,...,...,...
1000204,6039,1075,1,956716541
1000205,6039,1078,5,956704887
1000206,6039,558,5,956704746
1000207,6039,1080,4,956715648


In [None]:
# shuffle 90 | 10 split -- New Sample
new_samples = False
# The split files are written without a header row, so the column names have
# to be supplied again when they are read back.
split_columns = ["userId", "movieId", "rating", "timestamp"]
if new_samples:
  # Shuffle every row, then cut at the 90% mark
  shuffled = ratings_pd.sample(frac=1)
  cut = int(.9*len(shuffled))
  train_pd, test_pd = shuffled.iloc[:cut], shuffled.iloc[cut:]
  train_pd.to_csv('train_pd.csv', header=False, index=False)
  test_pd.to_csv('test_pd.csv', header=False, index=False)
else:
  #Load Previous Sample
  # NOTE(review): the files are written to the working directory above but
  # read from host_path here; they have to be copied across by hand.
  host_path="/content/drive/MyDrive/Samples/"
  # header=None stops pandas from consuming the first rating as column names
  train_pd = pd.read_csv(host_path + 'train_pd.csv', header=None, names=split_columns)
  test_pd = pd.read_csv(host_path + 'test_pd.csv', header=None, names=split_columns)

In [None]:
#HYPER PARAMS
# Shared settings read by the data loaders and by RatingPredModel
args = {
    "batch_size" : 512,  # samples per DataLoader batch
    "embedding_size" : 25,  # width of the user / movie / genre embeddings and of the small MLP layers
    "layer_size": 1024,  # hidden width inside the combined MLP
    "epoch": 1,  # default value of fit()'s `epoch` argument
    "lr": 0.1,  # default value of fit()'s `lr` argument
    }


In [None]:
class ItemDataset(Dataset):
    """Serve one rating interaction at a time from a ratings frame.

    Each item is ``(movie_id, user_id, genere_ids, nfeatures, label)``:
    the movie and user indices, the movie's genre ids zero-padded to
    ``MAX_GENERES`` entries, the dense features
    ``[year, gender, age, occupation]`` and the rating.

    Movie and user attributes are looked up by position in the
    module-level ``movie_pd`` / ``user_pd`` frames, and genre names are
    translated through the module-level ``genere_to_id``.
    """

    # Length every genre-id vector is padded to
    MAX_GENERES = 20

    def __init__(self, data):
        """Keep ``data``, the frame whose rows this dataset serves."""
        self.data = data

    @staticmethod
    def _field(record, name, position):
        """Read a value by column name, or by position when that name is absent.

        The positional fallback covers frames whose columns carry other labels
        but still follow the userId, movieId, rating column order.
        """
        return record[name] if name in record.index else record.iloc[position]

    def __getitem__(self, index):
        """Build the tensors for row ``index`` of the wrapped frame."""
        # Read from the frame this dataset was built with, so that datasets
        # created from different frames serve different rows.
        record = self.data.iloc[index]
        user_idx = int(self._field(record, "userId", 0))
        movie_idx = int(self._field(record, "movieId", 1))
        rating = float(self._field(record, "rating", 2))

        movie_row = movie_pd.iloc[movie_idx]
        user_row = user_pd.iloc[user_idx]

        movie_id = torch.tensor(movie_idx, dtype=torch.long)
        user_id = torch.tensor(user_idx, dtype=torch.long)

        nfeatures = torch.tensor(
            [
                float(movie_row["year"]),
                float(user_row["gender"]),
                float(user_row["age"]),
                float(user_row["occupation"]),
            ],
            dtype=torch.float32,
        )

        genere_split = torch.tensor([genere_to_id[x] for x in movie_row["genres"].split("|")])
        # Right-pad with zeros so every sample has the same number of genre slots
        genere_ids = torch.nn.functional.pad(genere_split, (0, self.MAX_GENERES - genere_split.shape[0]))
        label = torch.tensor(rating, dtype=torch.float32)

        return movie_id, user_id, genere_ids, nfeatures, label

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

# Wrap each split in an ItemDataset and a shuffling DataLoader of args["batch_size"] samples
train_dataset = ItemDataset(train_pd)
test_dataset = ItemDataset(test_pd)
train_loader, test_loader = (
    DataLoader(split_dataset, batch_size=args["batch_size"], shuffle=True)
    for split_dataset in (train_dataset, test_dataset)
)





In [None]:
# Per-epoch training / validation losses appended by RatingPredModel.fit
loss_vals = []
loss_validation = []
class RatingPredModel(nn.Module):
    """Rating regressor built from user, movie and genre embeddings plus dense features.

    Building the model reads the module-level ``user_size``, ``movies_size``,
    ``genere_size`` and ``device``. ``fit`` appends to the module-level
    ``loss_vals`` / ``loss_validation`` lists, and both ``fit`` and
    ``get_metrics`` evaluate on the module-level ``test_loader`` unless
    another loader is passed in.
    """
    def __init__(self, args):
        """Create every layer, then the loss and the optimizer.

        Args:
            args: mapping providing ``"embedding_size"`` and ``"layer_size"``.
        """
        super(RatingPredModel, self).__init__()
        embed_dim = args["embedding_size"]
        hidden_dim = args["layer_size"]

        self.user_embed = nn.Embedding(user_size, embed_dim, device=device)
        self.movie_embed = nn.Embedding(movies_size, embed_dim, device=device)
        self.genere_embed = nn.Embedding(genere_size, embed_dim, device=device)

        self.loss_fn = nn.MSELoss()

        # Projects the 4 dense features to the embedding width
        self.bottom_mlp =  nn.Sequential(OrderedDict([
          ('ll1', nn.Linear(4, embed_dim)),
          ("drop",  nn.Dropout(p=0.25, inplace=False)),
          ('relu1', nn.ReLU()),
          ('norm', nn.BatchNorm1d(embed_dim)),
        ]))

        self.fc1 = nn.Sequential(OrderedDict([
          ('ll2', nn.Linear(hidden_dim, hidden_dim)),
          ("drop",  nn.Dropout(p=0.25, inplace=False)),
          ('relu1', nn.ReLU()),
          ('norm', nn.BatchNorm1d(hidden_dim)),
        ]))

        # NOTE(review): this block used to list 'fc1' three times and
        # 'drop' / 'relu1' twice. An OrderedDict keeps one entry per key, so
        # the network that was actually built is the sequence below: a single
        # fc1 and no dropout / ReLU between ll3 and norm2. It is spelled out
        # as built so layer names and state_dict keys stay compatible with
        # saved checkpoints. Stacking more layers needs unique keys.
        self.combined_mlp = nn.Sequential(OrderedDict([
          ('ll1', nn.Linear(embed_dim * 5, hidden_dim)),
          ("drop",  nn.Dropout(p=0.25, inplace=False)),
          ('relu1', nn.ReLU()),
          ('norm', nn.BatchNorm1d(hidden_dim)),
          ('fc1',  self.fc1),
          ('ll3', nn.Linear(hidden_dim, embed_dim)),
          ('norm2', nn.BatchNorm1d(embed_dim)),
        ]))

        self.logits = nn.Sequential(OrderedDict([
          ('ll1', nn.Linear(embed_dim * 2, embed_dim)),
          ('relu1', nn.ReLU()),
          ('ll2', nn.Linear(embed_dim, 1)),
          # ('sig', nn.Sigmoid())

        ]))
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        # Collapses the 20 padded genre slots into a single genre vector
        self.aggregation_layer = torch.nn.Conv1d(in_channels=20, out_channels=1, kernel_size=1)
        self.mean_absolute_error = MeanAbsoluteError()

        # Built last on purpose: parameters() only yields layers that are
        # already registered, so creating the optimizer any earlier would
        # leave every layer defined after it untrained.
        self.optimizer = optim.Adam(self.parameters())

    def forward(self, movie_id, user_id,genere_ids, nfeatures):
        """Predict one rating per sample.

        Args:
            movie_id: movie indices, one per sample.
            user_id: user indices, one per sample.
            genere_ids: genre ids with 20 slots per sample (the aggregation
                convolution expects 20 input channels).
            nfeatures: 4 dense features per sample.

        Returns:
            Tensor with one column holding the predicted rating of each sample.
        """
        movie_vector = self.movie_embed(movie_id)
        user_vector = self.user_embed(user_id)
        genere_vector = self.genere_embed(genere_ids)
        genere_vector = self.aggregation_layer(genere_vector).squeeze(1)

        #bottom_mlp
        nfeature_vector = self.bottom_mlp(nfeatures)
        #cross
        cross = movie_vector * user_vector * genere_vector * nfeature_vector

        #concat
        combined = torch.cat((movie_vector,user_vector, genere_vector, nfeature_vector,cross), dim=1)
        combined = self.combined_mlp(combined)

        #combine
        full = torch.cat((cross,combined), dim=1)
        return self.logits(full)

    def _predict_batch(self, batch):
        """Move one loader batch to ``device`` and return ``(predictions, labels)``."""
        movie_id, user_id, genere_ids, nfeatures, label = (part.to(device) for part in batch)
        # Squeeze only the output column, so a batch of one keeps its batch axis
        outputs = self(movie_id, user_id, genere_ids, nfeatures).squeeze(1)
        return outputs, label

    def _evaluate(self, loader, with_mae=False):
        """Average the per-batch RMSE (and optionally MAE) over ``loader``.

        Runs in eval mode without gradients and puts the model back into the
        mode it was in. Returns ``(rmse, mae)``; ``mae`` is ``None`` unless
        ``with_mae`` is set, and both are NaN for an empty loader.
        """
        was_training = self.training
        self.eval()
        rmse_total, mae_total, batches = 0.0, 0.0, 0
        try:
            with torch.no_grad():
                for batch in loader:
                    outputs, labels = self._predict_batch(batch)
                    rmse_total += torch.sqrt(self.loss_fn(outputs, labels)).item()
                    if with_mae:
                        mae_total += self.mean_absolute_error(outputs, labels).item()
                    batches += 1
        finally:
            self.train(was_training)
        if batches == 0:
            return float('nan'), (float('nan') if with_mae else None)
        return rmse_total / batches, (mae_total / batches if with_mae else None)

    def one_epoch(self,train_loader):
        """Train for one pass over ``train_loader``.

        Returns:
            Mean MSE over the most recent full group of 100 batches, or 0.0
            when fewer than 100 batches were seen.
        """
        running_loss = 0.
        last_loss = 0.
        for i,data in enumerate(train_loader):
            self.optimizer.zero_grad()
            outputs, label = self._predict_batch(data)
            loss = self.loss_fn(outputs, label)
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()
            if i % 100 == 99:
                last_loss = running_loss / 100 # loss per batch
                print('  batch {}/{} MSE loss: {} '.format(i + 1, len(train_loader), last_loss))

                running_loss = 0.
        return last_loss

    def fit(self, train_loader, epoch=args["epoch"], lr=args["lr"], valid_loader=None):
        """Train for ``epoch`` epochs, validating and checkpointing after each one.

        Args:
            train_loader: loader yielding training batches.
            epoch: number of passes over ``train_loader``.
            lr: NOTE(review): accepted for interface compatibility but not
                applied; the optimizer keeps the step size it was created
                with (Adam's default).
            valid_loader: loader used for validation; defaults to the
                module-level ``test_loader``.
        """
        if valid_loader is None:
            valid_loader = test_loader

        for ep in range(epoch) :
            print('EPOCH {}:'.format(ep + 1))
            self.train(True)
            avg_loss = self.one_epoch(train_loader)
            self.train(False)
            avg_vloss, _ = self._evaluate(valid_loader)
            print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
            # Plain floats are stored, so the history lists hold no tensors
            loss_vals.append(avg_loss)
            loss_validation.append(avg_vloss)
            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
            model_path = '/content/drive/MyDrive/weights/movierec/rmse_cross{}_{}_{}_{}'.format(timestamp, ep+1 ,avg_loss, avg_vloss)
            # Save this instance's weights, not whatever a global `model` points at
            torch.save(self.state_dict(), model_path)

    def get_metrics(self, loader=None):
        """Print the mean per-batch RMSE and MAE over an evaluation loader.

        Dropout and batch-norm are put in eval mode for the measurement and
        the previous mode is restored afterwards.

        Args:
            loader: loader to evaluate on; defaults to the module-level
                ``test_loader``.
        """
        if loader is None:
            loader = test_loader
        rmse, mae = self._evaluate(loader, with_mae=True)
        print('RMSE loss: {}\tMAE{} '.format(rmse, mae))


In [None]:
# Build the network from the shared hyper-parameters and place it on the selected device
model = RatingPredModel(args).to(device)


In [None]:
# Restore weights from an earlier run. strict=False tolerates keys that are
# missing from, or unknown to, the current model, so the result shown by this
# cell is the only indication of whether everything matched.
PATH = "/content/drive/MyDrive/weights/movierec/rmse_cross20230415_162933_3_0.37703907549381255_0.5121314525604248"
model.load_state_dict(
    torch.load(PATH, map_location=device),
    strict=False,
)

<All keys matched successfully>

In [None]:
# Switch to training mode and run three epochs over the training loader
model.train()
model.fit(train_loader, epoch=3)


EPOCH 1:
  batch 100/1759 MSE loss: 13.991343336105347 
  batch 200/1759 MSE loss: 13.997267017364502 
  batch 300/1759 MSE loss: 14.034649829864502 
  batch 400/1759 MSE loss: 14.030696182250976 
  batch 500/1759 MSE loss: 13.988768663406372 
  batch 600/1759 MSE loss: 13.980334978103638 
  batch 700/1759 MSE loss: 14.010077676773072 
  batch 800/1759 MSE loss: 13.990192108154297 
  batch 900/1759 MSE loss: 13.981048974990845 
  batch 1000/1759 MSE loss: 13.970403242111207 
  batch 1100/1759 MSE loss: 14.00684398651123 
  batch 1200/1759 MSE loss: 13.959441003799439 
  batch 1300/1759 MSE loss: 14.007606706619264 


KeyboardInterrupt: ignored

In [None]:
# Print RMSE and MAE measured on the test loader
model.get_metrics()

  movie_id = torch.tensor(ratings_pd.iloc[index]["movieId"], dtype=torch.long)
  user_id =  torch.tensor(ratings_pd.iloc[index]["userId"], dtype=torch.long)


RMSE loss: 0.38286659121513367	MAE0.31443578004837036 


In [None]:
def ranking(query_liked):
    """Score every movie by cosine similarity to a set of liked movies.

    The query vector is the sum of the liked movies' embedding rows taken
    from the module-level ``model.movie_embed``.

    Args:
        query_liked: row indices into the movie embedding table.

    Returns:
        1-D tensor with one similarity per row of the module-level
        ``movie_pd``, in ``movie_pd`` index order. A zero-length embedding
        or query yields NaN for the affected entries.
    """
    weights = model.movie_embed.weight.detach().cpu().numpy()
    query_res = weights[query_liked].sum(axis=0)

    # One embedding row per movie_pd index label
    movie_vectors = weights[movie_pd.index.to_numpy()]
    # The query norm is the same for every movie, so it is computed once
    query_length = np.linalg.norm(query_res, 2)
    movie_lengths = np.linalg.norm(movie_vectors, 2, axis=1)
    cosine_dist = (movie_vectors @ query_res) / (movie_lengths * query_length)
    return torch.tensor(cosine_dist)
    
def display_top_k(score, indices, k=5):
    """Print a table of the first ``k`` scores with the matching titles and genres.

    Args:
        score: 1-D tensor of scores, already ordered best first.
        indices: tensor of ``movie_pd`` index labels in the same order as ``score``.
        k: number of rows to show.
    """
    best_scores = score[:k]
    # One movie_pd lookup per selected movie
    best_rows = [movie_pd.loc[position.item()] for position in indices[:k]]

    table = pd.DataFrame({
        "score_key": best_scores.numpy(),
        'titles': [row["title"] for row in best_rows],
        'genres': [row["genres"] for row in best_rows],
    })
    print(table)

In [None]:
# Candidate queries (movie indices); only the last assignment takes effect
query_liked = [0] #Toy Story
# query_liked = [224] #Star  Wars
# query_liked = [257] #Pulp Fiction
query_liked = [3648] 

# Number of results to show
topk = 10

print("===========COSINE SIMILARITY===================")
# Order every movie from most to least similar to the query, then show the head
score, indices = torch.sort(ranking(query_liked), descending=True)
display_top_k(score, indices, k=topk)


   score_key                              titles                    genres
0   1.000000           Gone in 60 Seconds (2000)              Action|Crime
1   0.741664                It's My Party (1995)                     Drama
2   0.688896                     Dinosaur (2000)      Animation|Children's
3   0.683477     Honey, I Blew Up the Kid (1992)  Children's|Comedy|Sci-Fi
4   0.678023                  Brassed Off (1996)      Comedy|Drama|Romance
5   0.673571                  White Sands (1992)            Drama|Thriller
6   0.663483  Kestrel's Eye (Falkens �ga) (1998)               Documentary
7   0.661258                    King Kong (1976)   Action|Adventure|Horror
8   0.657639                    Star Maps (1997)                     Drama
9   0.647694              Pather Panchali (1955)                     Drama


In [None]:
from collections import defaultdict

# user index -> list of (movie index, predicted rating)
predictions = defaultdict(list)
model.eval()

# Movie-side inputs are the same for every user, so they are built once
movie_ids = torch.tensor(movie_pd['movieId'].to_numpy(), dtype=torch.long).to(device)
movie_id_list = movie_ids.tolist()
genere_rows = []
for genres in movie_pd["genres"]:
    genere_split = torch.tensor([genere_to_id[x] for x in genres.split("|")])
    genere_rows.append(torch.nn.functional.pad(genere_split,(0,20-genere_split.shape[0] )))
all_genere_ids = torch.stack(genere_rows).to(device)
movie_years = torch.tensor(movie_pd["year"].to_numpy(), dtype=torch.float32).unsqueeze(1)

scored_users = user_pd[:600]
# Inference only: no_grad avoids building an autograd graph for every prediction
with torch.no_grad():
  for userid, user in scored_users.iterrows():
    print(f"{userid+1}/{len(scored_users)}")
    user_index = int(user['userId'])
    user_ids = torch.full_like(movie_ids, user_index)

    # Dense features per movie: [year, gender, age, occupation]
    user_side = torch.tensor([float(user["gender"]), float(user['age']), float(user['occupation'])], dtype=torch.float32)
    nfeatures = torch.cat((movie_years, user_side.expand(len(movie_years), 3)), dim=1).to(device)

    # One forward pass scores every movie for this user; in eval mode each
    # row of the batch is predicted independently of the others.
    preds = model(movie_ids, user_ids, all_genere_ids, nfeatures).squeeze(1).tolist()
    predictions[user_index].extend(zip(movie_id_list, preds))
#For each user 
  # For each movie
    #insert dict rating
  #organize ratings keep to k

In [None]:
# Number of users that have an entry in the predictions mapping
print(len(predictions))

10


In [None]:
import pickle 
# Persist the predictions mapping so the scoring loop does not have to be re-run
PATH = "/content/drive/MyDrive/weights/movierec/dict_600.pkl"
with open(PATH, 'wb') as f:
    pickle.dump(predictions, f)

In [None]:
import pickle 

# Replace the in-memory predictions with a previously pickled mapping.
# NOTE(review): this reads dict_10.pkl, while the save cell writes dict_600.pkl - confirm which file is intended.
# pickle.load runs arbitrary code from the file; only load files you created yourself.
PATH = "/content/drive/MyDrive/weights/movierec/dict_10.pkl"
with open(PATH, 'rb') as f:
    predictions = pickle.load(f)
print(len(predictions))

10


In [None]:
# Show the stored entries for key 1 of the predictions mapping
predictions[1]

In [None]:
def get_metrics(k, thresh, true_ratings=None):
    """Compute precision@k and recall@k per user from the module-level ``predictions``.

    ``predictions`` maps each user to a list of 2-tuples that is ranked on the
    second element, highest first (the lists are sorted in place).

    Args:
        k: number of top-ranked entries to treat as recommended.
        thresh: value at or above which an estimate or a rating counts as relevant.
        true_ratings: optional mapping ``(user, movie) -> true rating``. When
            given, every stored tuple is read as ``(movie, estimate)``, only
            the pairs found in the mapping are scored, and the true rating
            comes from the mapping.

    NOTE(review): without ``true_ratings`` the stored tuples are unpacked as
    ``(estimate, true rating)``, as before. A prediction loop that stores
    ``(movie id, prediction)`` therefore gets the movie id compared against
    ``thresh`` and no true rating enters the result. Pass ``true_ratings`` to
    obtain real precision and recall.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user. A value is 0 when
        its denominator is 0.
    """
    precisions = dict()
    recalls = dict()
    for user_key, scored in predictions.items():
        # Rank on the second tuple element, highest first
        scored.sort(key=lambda item: item[1], reverse=True)

        if true_ratings is None:
            pairs = scored
        else:
            pairs = [
                (est, true_ratings[(user_key, movie_key)])
                for movie_key, est in scored
                if (user_key, movie_key) in true_ratings
            ]

        # Relevant items overall, recommended items in the top k, and their overlap
        n_rel = sum((true_r >= thresh) for (_, true_r) in pairs)
        n_rec_k = sum((est >= thresh) for (est, _) in pairs[:k])
        n_rel_and_rec_k = sum(
            ((true_r >= thresh) and (est >= thresh))
            for (est, true_r) in pairs[:k]
        )

        precisions[user_key] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[user_key] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls


# Per-user precision and recall for the top 5 entries at a 0.365 threshold
p, r = get_metrics(5, 0.365)

# Average over users, expressed as percentages
avg_p, avg_r = (
    sum(per_user.values()) / len(per_user.values()) * 100
    for per_user in (p, r)
)


print(f"Precision: {avg_p}\tRecall: {avg_r}")

Precision: 78.0	Recall: 72.36652236652236
