In [1]:
import pymc as pm
import pandas as pd
import numpy as np
import aesara.tensor as at
import pytensor.tensor as pt
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt


KeyboardInterrupt



In [None]:
# Raw inputs: the user/movie rating events and the per-movie metadata table.
ratings = pd.read_csv('./ratings.csv')
movies = pd.read_csv("./movies_metadata.csv")

In [None]:
# Peek at the first rows to check the columns that were loaded.
ratings.head(n=5)

In [None]:
class HPFModel():
    """Hierarchical Poisson Factorization (HPF) recommender built on PyMC.

    The constructor loads ``full_dataset.csv``, makes a per-user train/test
    split, adds randomly sampled zero-rating (negative) pairs to the training
    set and flattens the training ratings into index/value arrays.  ``train``
    samples the posterior; ``predict`` and ``predict_item`` read the posterior
    mean of the Poisson rate matrix ``lambdas``.

    User and movie ids are used directly as row/column indices of a
    ``N_users x N_movies`` matrix, so the dataset is assumed to contain ids
    already remapped to ``[0, N_users)`` / ``[0, N_movies)`` -- TODO confirm
    against the code that writes the CSV.
    """

    # Columns every train/test frame exposes first, in this order.
    _TEMPLATE_COLUMNS = ["userId", "movieId", "rating", "timestamp"]

    def __init__(self, binary = False, seed = None):
        """Load the dataset and prepare train/test data.

        Parameters
        ----------
        binary : bool
            When true, ``rating`` is replaced by the indicator ``rating > 0``;
            the original (doubled) rating stays available as ``rating2``.
        seed : int or None
            Seeds the row shuffle and the negative sampling.  ``None`` keeps
            the previous unseeded behaviour.
        """
        self.N_users = 200
        self.N_movies = 200

        # Gamma prior hyperparameters, consumed by ``train``.
        self.aprime = 0.2
        self.bprime = 0.1
        self.a = 0.2
        self.cprime = 0.2
        self.dprime = 0.1
        self.c = 0.2

        # Number of sampled negatives, as a fraction of the training rows.
        self.negativeProportion = 0.2

        self.K = 10 #the number of components

        # Dedicated generator when seeded; module-level ``random`` otherwise.
        rng = random.Random(seed) if seed is not None else random

        data = pd.read_csv('full_dataset.csv')
        data['rating'] = data['rating']*2
        data = data.drop(['score','Unnamed: 0'], axis=1)
        data['rating2'] = data['rating']
        if binary:
            data['rating'] = data['rating'] > 0

        data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

        print(data.head(5))

        self.data = data

        # Collect the per-user pieces and concatenate once: growing a frame
        # with pd.concat inside the loop is quadratic.
        train_parts = []
        test_parts = []
        for u in range(self.N_users):
            currentUserInteractions = data[data["userId"]==u].astype('int')

            if len(currentUserInteractions) < 2:
                # train_test_split cannot hold anything out of fewer than two
                # rows; keep whatever exists for training.
                if len(currentUserInteractions):
                    train_parts.append(currentUserInteractions)
                continue

            forTrain, forTest = train_test_split(currentUserInteractions, test_size = 0.1, random_state = 42)
            train_parts.append(forTrain)
            test_parts.append(forTest)

        self.trainData = self._stack_frames(train_parts)
        self.testData = self._stack_frames(test_parts)

        # Negatives must not collide with ANY known interaction (train or
        # held-out test) nor with each other, otherwise a real rating is
        # overwritten by 0 or a test item is silently marked as seen.
        requested = int(self.negativeProportion * len(self.trainData))
        known_pairs = set(zip(data["userId"].tolist(), data["movieId"].tolist()))
        sampled = self._sample_negatives(known_pairs, requested, rng)

        if sampled:
            # timestamp stays the string "0" as before, in case downstream
            # code relies on that marker.
            negatives = pd.DataFrame(
                [{'userId':x,'movieId':y,'rating':0,'timestamp':"0"} for x, y in sampled]
            )
            self.trainData = pd.concat([self.trainData, negatives], ignore_index=True)

        # Negatives still missing; 0 unless the matrix ran out of free cells.
        self.negatives = requested - len(sampled)

        # (user, movie) -> rating; a later duplicate row overrides an earlier one.
        ratingDict = dict()
        for user_id, movie_id, rating in zip(
            self.trainData['userId'].tolist(),
            self.trainData['movieId'].tolist(),
            self.trainData['rating'].tolist(),
        ):
            ratingDict[int(user_id), int(movie_id)] = int(rating)

        if ratingDict:
            rows, cols = zip(*ratingDict.keys())
        else:
            rows, cols = (), ()

        self.row_indices = np.array(rows, dtype=int)
        self.col_indices = np.array(cols, dtype=int)
        self.observations = list(ratingDict.values())

    @staticmethod
    def _stack_frames(frames):
        """Concatenate ``frames`` once, template columns first."""
        template = HPFModel._TEMPLATE_COLUMNS
        if not frames:
            return pd.DataFrame({column: [] for column in template})
        stacked = pd.concat(frames, ignore_index=True)
        ordered = template + [c for c in stacked.columns if c not in template]
        return stacked.reindex(columns=ordered)

    def _sample_negatives(self, occupied, count, rng=None):
        """Draw up to ``count`` distinct (user, movie) pairs not in ``occupied``.

        The request is capped at the number of free cells of the
        ``N_users x N_movies`` matrix so the rejection loop always terminates.
        ``occupied`` is not modified.
        """
        if rng is None:
            rng = random
        taken = set(occupied)
        in_range = len({
            pair for pair in taken
            if 0 <= pair[0] < self.N_users and 0 <= pair[1] < self.N_movies
        })
        free_cells = self.N_users * self.N_movies - in_range
        count = min(count, free_cells)

        picked = []
        while len(picked) < count:
            pair = (rng.randint(0, self.N_users-1), rng.randint(0, self.N_movies-1))
            if pair in taken:
                continue
            taken.add(pair)
            picked.append(pair)
        return picked

    def train(self, draws = 1000, tune = 100, chains = 1, random_seed = None):
        """Build the HPF model and sample its posterior with NUTS.

        The defaults reproduce the previous fixed settings (1000 draws,
        100 tuning steps, one chain, unseeded).  The result is stored in
        ``self.trace`` as a non-InferenceData trace.
        """
        coords = {
            "no_users":np.arange(self.N_users),
            "no_movies":np.arange(self.N_movies),
            "no_features":np.arange(self.K),
            "unity":np.arange(1)
        }
        with pm.Model(coords=coords) as model:

            # Per-user rate, shaped (users, 1) so it broadcasts over features.
            xi = pm.Gamma("xi",self.aprime,self.aprime/self.bprime,dims=("no_users","unity"))

            theta = pm.Gamma('theta', alpha=self.a, beta=xi, dims=("no_users", "no_features"))

            # Per-movie rate, shaped (1, movies) so it broadcasts over features.
            eta = pm.Gamma("eta",self.cprime,self.cprime/self.dprime,dims=("unity","no_movies"))

            beta = pm.Gamma('beta', alpha=self.c, beta=eta, dims=("no_features","no_movies"))

            # Poisson rate for every (user, movie) cell.
            lambdas = pm.Deterministic("lambdas",pt.dot(theta,beta))

            # Only the cells listed in row_indices/col_indices are observed.
            obs = pm.Poisson("obs",mu = lambdas[self.row_indices,self.col_indices],shape = len(self.row_indices), observed = self.observations)

            step = pm.NUTS()

            self.trace = pm.sample(draws,tune=tune,step=step,chains=chains,random_seed=random_seed,return_inferencedata=False)

    def _posterior_mean_lambdas(self):
        """Posterior mean of ``lambdas``, cached per trace object.

        Averaging the whole trace is expensive, so it is done once and reused
        until ``self.trace`` is replaced.
        """
        cached = getattr(self, "_lambda_mean_cache", None)
        if cached is None or cached[0] is not self.trace:
            cached = (self.trace, self.trace["lambdas"].mean(axis = 0))
            self._lambda_mean_cache = cached
        return cached[1]

    def predict(self,user,keep=5):
        """Return up to ``keep`` movie indices for ``user``, best first.

        Movies are ranked by posterior-mean rate; movies that appear with this
        user in the training indices are excluded.
        """
        scores = self._posterior_mean_lambdas()[user]
        ranking = scores.argsort()[::-1]

        rows = np.asarray(self.row_indices)
        cols = np.asarray(self.col_indices)
        seen = cols[rows == user]

        unseen = ranking[~np.isin(ranking, seen)]
        return list(unseen[:keep])

    def predict_item(self,user,movie):
        """Return the posterior-mean rate for one (user, movie) cell."""
        return self._posterior_mean_lambdas()[user][movie]
        

In [None]:
#list(zip(m.row_indices,m.col_indices))

In [None]:
# Binary mode: the model is fitted on "was rated" indicators rather than on rating values.
m = HPFModel(binary=True)

In [None]:
# Sample the posterior; the result is stored on the model as m.trace.
m.train()

In [None]:
# Inspect the first training rows.
m.trainData.head(n=10)

In [None]:
# Ground truth per user: [movieId, rating2] for every held-out movie that is
# not also present in the training data, ordered by descending rating2
# (ties keep ascending movieId order).
#
# Training pairs are collected once into a set and the test frame is filtered
# once per user, instead of filtering both frames for every (user, movie) cell.
train_pairs = set(zip(m.trainData["userId"].tolist(), m.trainData["movieId"].tolist()))

test_labels = dict()

for user in range(m.N_users):
    user_rows = m.testData[m.testData["userId"] == user]

    # First occurrence wins when a movie appears more than once for the user.
    first_rating = {}
    for movie_id, rating in zip(user_rows["movieId"].tolist(), user_rows["rating2"].tolist()):
        first_rating.setdefault(movie_id, rating)

    labels = [
        [movie, first_rating[movie]]
        for movie in range(m.N_movies)
        if movie in first_rating and (user, movie) not in train_pairs  # skip already-watched movies
    ]
    labels.sort(key = lambda x: - x[1])

    test_labels[user] = labels

In [None]:
# Display the ground-truth dictionary built in the previous cell (user -> [[movieId, rating2], ...]).
test_labels

In [None]:
# Top-5 recommendations for every user, keyed by user index.
predictions = {user: m.predict(user, keep=5) for user in range(m.N_users)}
    

In [None]:
# Users ranked from most to least active (number of rows in the full dataset);
# equally active users keep their ascending index order.
userOrder = sorted(
    range(m.N_users),
    key=lambda uid: (m.data['userId'] == uid).sum(),
    reverse=True,
)

In [None]:
# Eleven evenly spaced integer edges from 0 to N_users, i.e. decile boundaries of the user list.
bins = np.linspace(start=0, stop=m.N_users, num=11, dtype=int)

# Computing MNP and MNR

In [None]:
# Running mean precision of the top-k predictions, recorded each time another
# decile of users (most active first) has been processed.
percentages = []
MNP = []

# A checkpoint fires once `edge` users have been processed. The leading 0
# edge is dropped so the 10% point covers exactly the first decile and the
# 100% point is recorded as well.
checkpoints = {int(edge) for edge in bins[1:]}

precision_total = 0.0  # running total; deliberately not named `sum` (builtin)
currentMNP = 0.0

for i, user in enumerate(userOrder):

    predicted = predictions[user]
    groundTruth = [x[0] for x in test_labels[user]]

    currentValue = len(set(predicted) & set(groundTruth))  # Number of correct predictions
    totalPredicted = len(predicted)  # Total number of predicted items

    # A user with no predictions contributes a precision of 0.
    if totalPredicted > 0:
        precision_total += currentValue / totalPredicted

    processed = i + 1
    currentMNP = precision_total / processed

    if processed in checkpoints:
        percentages.append(f"{round(100 * processed / len(userOrder))}%")
        MNP.append(currentMNP)
    

In [None]:
# Labelled figure so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(percentages, MNP, marker="o")
ax.set_xlabel("Share of users included (most active first)")
ax.set_ylabel("Mean Normalized Precision")
ax.set_title("Mean Normalized Precision by user activity")
plt.show()
print(f"The Mean Normalized Precision for the whole user set is {currentMNP}")

In [None]:
# Running mean normalized recall: hits among the predictions divided by the
# number of top-5 held-out items, recorded each time another decile of users
# (most active first) has been processed.
percentages = []
MNR = []

# Raw totals of hits and held-out rows; not read in this cell, kept in case
# later code uses them.
X = 0
Y = 0

# A checkpoint fires once `edge` users have been processed. The leading 0
# edge is dropped so the 10% point covers exactly the first decile and the
# 100% point is recorded as well.
checkpoints = {int(edge) for edge in bins[1:]}

recall_total = 0.0  # running total; deliberately not named `sum` (builtin)
currentMNR = 0.0

for i, user in enumerate(userOrder):
    predicted = predictions[user]
    groundTruth = [x[0] for x in test_labels[user][:5]]
    totalRelevant = m.testData[m.testData["userId"] == user].shape[0]
    currentValue = len(set(predicted) & set(groundTruth))

    X += currentValue
    Y += totalRelevant

    # A user with no held-out labels contributes a recall of 0 instead of
    # raising ZeroDivisionError.
    if groundTruth:
        recall_total += currentValue / len(groundTruth)

    processed = i + 1
    currentMNR = recall_total / processed

    if processed in checkpoints:
        percentages.append(f"{round(100 * processed / len(userOrder))}%")
        MNR.append(currentMNR)


In [None]:
# Labelled figure so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(percentages, MNR, marker="o")
ax.set_xlabel("Share of users included (most active first)")
ax.set_ylabel("Mean Normalized Recall")
ax.set_title("Mean Normalized Recall by user activity")
plt.show()
print(f"The Mean Normalized Recall for the whole user set is {currentMNR}")

