In [1]:
import pymc as pm
import pandas as pd
import numpy as np
import aesara.tensor as at
import pytensor.tensor as pt
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

KeyboardInterrupt: 

In [None]:
# Load the raw ratings table and the movie-metadata table from the working directory.
ratings, movies = (
    pd.read_csv(csv_path) for csv_path in ("./ratings.csv", "./movies_metadata.csv")
)

In [None]:
# Preview the first five rows (head() defaults to n=5).
ratings.head()

In [None]:
class HPFModel():
    """Poisson matrix-factorisation recommender with hierarchical Gamma priors, fitted with PyMC.

    Construction reads ``full_dataset.csv``, holds out 10% of every user's
    interactions as ``testData``, augments ``trainData`` with randomly sampled
    zero-rating (user, movie) pairs and flattens the training set into index /
    observation arrays. ``train`` then samples the posterior with NUTS, and
    ``predict`` / ``predict_item`` read the posterior-mean Poisson rates.

    The code assumes ``userId`` / ``movieId`` in the CSV are already 0-based
    indices smaller than ``N_users`` / ``N_movies`` (they are used directly to
    index the rate matrix) -- TODO confirm against the dataset export.
    """

    def __init__(self, binary = False, seed = None):
        """Load the data set and prepare the train / test structures.

        Parameters
        ----------
        binary : bool
            When True, ratings are replaced by ``rating > 0`` (implicit feedback).
        seed : int or None
            Optional seed for the row shuffle, the negative sampler and the
            NUTS sampler. ``None`` (default) keeps the previous unseeded
            behaviour.
        """
        self.N_users = 200
        self.N_movies = 200

        # Gamma hyper-parameters: (aprime, bprime, a) drive the user side,
        # (cprime, dprime, c) drive the movie side -- see train().
        self.aprime = 0.3
        self.bprime = 0.3
        self.a = 0.3
        self.cprime = 0.3
        self.dprime = 0.3
        self.c = 0.3

        # Number of sampled zero-rating pairs, as a fraction of the training rows.
        self.negativeProportion = 0.2

        self.K = 20 #the number of components

        self.seed = seed

        data = pd.read_csv('full_dataset.csv')
        # Ratings are doubled before use (presumably so half-star ratings become
        # integer counts for the Poisson likelihood -- verify against the data).
        data['rating'] = data['rating']*2
        data = data.drop(['score','Unnamed: 0'], axis=1)

        if binary:
            data['rating'] = data['rating'].apply(lambda x:x>0)

        data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

        print(data.head(5))

        self.data = data

        self.trainData, self.testData = self._split_per_user()
        self._add_sampled_negatives()
        self._build_observation_arrays()

    def _split_per_user(self):
        """Return (train, test) frames holding a 90/10 split of each user's rows."""
        train_parts = []
        test_parts = []

        for u in range(self.N_users):
            currentUserInteractions = self.data[self.data["userId"]==u].astype('int')

            if len(currentUserInteractions) == 0:
                continue
            if len(currentUserInteractions) == 1:
                # train_test_split cannot split a single row; keep it for training.
                train_parts.append(currentUserInteractions)
                continue

            forTrain, forTest = train_test_split(currentUserInteractions, test_size = 0.1, random_state = 42)
            train_parts.append(forTrain)
            test_parts.append(forTest)

        # Concatenate once instead of growing a frame inside the loop.
        empty = pd.DataFrame(columns=list(self.data.columns))
        trainData = pd.concat(train_parts) if train_parts else empty.copy()
        testData = pd.concat(test_parts) if test_parts else empty.copy()
        return trainData, testData

    def _add_sampled_negatives(self):
        """Append randomly drawn, never-observed (user, movie) pairs with rating 0 to trainData.

        A pair is eligible only if it appears nowhere in ``self.data`` (so a
        held-out test interaction is never injected into training as a zero)
        and has not already been drawn.
        """
        # Use the global ``random`` module when unseeded so an external
        # ``random.seed(...)`` keeps working as before.
        rng = random.Random(self.seed) if self.seed is not None else random

        taken = set(zip(self.data["userId"].tolist(), self.data["movieId"].tolist()))
        taken_in_grid = sum(
            1 for (u, i) in taken
            if 0 <= u < self.N_users and 0 <= i < self.N_movies
        )
        available = self.N_users * self.N_movies - taken_in_grid

        # Cap the request so the rejection loop below always terminates.
        self.negatives = min(int(self.negativeProportion * len(self.trainData)), max(available, 0))

        sampled = []
        # self.negatives counts down to 0, as it did before.
        while(self.negatives>0):
            x = rng.randint(0,self.N_users-1)
            y = rng.randint(0,self.N_movies-1)

            if (x, y) in taken:
                continue

            taken.add((x, y))
            sampled.append({'userId':x,'movieId':y,'rating':0,'timestamp':"0"})
            self.negatives-=1

        if sampled:
            self.trainData = pd.concat([self.trainData, pd.DataFrame(sampled)])

    def _build_observation_arrays(self):
        """Flatten trainData into row_indices / col_indices / observations."""
        ratingDict = dict()
        triples = zip(self.trainData["userId"], self.trainData["movieId"], self.trainData["rating"])
        for userId, movieId, rating in triples:
            # A repeated (user, movie) pair keeps the last rating seen.
            ratingDict[int(userId),int(movieId)] = int(rating)

        self.row_indices = np.array([pair[0] for pair in ratingDict], dtype=int)
        self.col_indices = np.array([pair[1] for pair in ratingDict], dtype=int)
        self.observations = list(ratingDict.values())

    def train(self):
        """Build the PyMC model and draw posterior samples into ``self.trace``.

        Model: xi ~ Gamma(aprime, aprime/bprime) per user, theta ~ Gamma(a, xi);
        eta ~ Gamma(cprime, cprime/dprime) per movie, beta ~ Gamma(c, eta);
        observed ratings ~ Poisson((theta @ beta)[user, movie]).
        """
        coords = {
            "no_users":np.arange(self.N_users),
            "no_movies":np.arange(self.N_movies),
            "no_features":np.arange(self.K),
            "unity":np.arange(1)
        }
        with pm.Model(coords=coords) as model:

            # The singleton "unity" dim lets xi / eta broadcast against theta / beta.
            xi = pm.Gamma("xi",self.aprime,self.aprime/self.bprime,dims=("no_users","unity"))

            theta = pm.Gamma('theta', alpha=self.a, beta=xi, dims=("no_users", "no_features"))

            eta = pm.Gamma("eta",self.cprime,self.cprime/self.dprime,dims=("unity","no_movies"))

            beta = pm.Gamma('beta', alpha=self.c, beta=eta, dims=("no_features","no_movies"))

            # Full users x movies rate matrix; it is stored for every draw.
            lambdas = pm.Deterministic("lambdas",pt.dot(theta,beta))

            obs = pm.Poisson("obs",mu = lambdas[self.row_indices,self.col_indices],shape = len(self.row_indices), observed = self.observations)

        with model:
            step = pm.NUTS()

            self.trace = pm.sample(10000,tune=1000,step=step,chains=1,return_inferencedata=False,random_seed=getattr(self, "seed", None))

        # A new trace invalidates any cached posterior mean.
        self._lambda_mean = None

    def _posterior_mean_lambdas(self):
        """Return the posterior mean of the rate matrix, computing it once per trace."""
        cached = getattr(self, "_lambda_mean", None)
        if cached is None:
            if not hasattr(self, "trace"):
                raise AttributeError("HPFModel has no trace yet; call train() first")
            cached = np.asarray(self.trace["lambdas"]).mean(axis = 0)
            self._lambda_mean = cached
        return cached

    def predict(self,user,keep=5):
        """Return up to ``keep`` movie indices for ``user``, highest posterior-mean rate first.

        Movies paired with ``user`` in the training arrays (including sampled
        negatives) are skipped.
        """
        rMatrix = self._posterior_mean_lambdas()[user]

        predictions = rMatrix.argsort()[::-1]

        # Build the user's seen-set once instead of re-zipping every training pair per candidate.
        seen = set(np.asarray(self.col_indices)[np.asarray(self.row_indices) == user].tolist())

        predictions = [x for x in predictions if x not in seen]

        return predictions[:keep]

    def predict_item(self,user,movie):
        """Return the posterior-mean Poisson rate for one (user, movie) pair."""
        return self._posterior_mean_lambdas()[user][movie]
        

In [None]:
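# Debug aid (disabled): list(zip(m.row_indices, m.col_indices)) lists the (user, movie) training pairs.
# It needs `m`, which is only created in a later cell.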

In [None]:
# Build the model on the raw (doubled) ratings rather than binarised feedback.
# The constructor reads full_dataset.csv and prints the first rows of the shuffled data.
m = HPFModel(binary = False)

In [None]:
# Fit the model: a single NUTS chain with 1000 tuning steps and 10000 draws; the result is stored on m.trace.
m.train()

In [None]:
# Zero-initialised accumulators (no cell visible here reads them afterwards).
sumAE = sumSE = 0

# Show the held-out interactions the metrics below are computed on.
print(m.testData)

In [None]:
# Ground-truth (doubled) ratings of the held-out rows, plus an empty container for the predictions.
y, y_pred = m.testData["rating"].tolist(), list()

In [None]:
# Posterior-mean rate for every held-out (user, movie) pair, in m.testData row order.
# The list is rebuilt from scratch so that re-running this cell cannot append
# duplicate predictions and leave y_pred longer than y.
y_pred = [
    m.predict_item(int(user_id), int(movie_id))
    for user_id, movie_id in zip(m.testData["userId"], m.testData["movieId"])
]

In [None]:
# HPFModel doubles every rating on load; halve targets and predictions so the
# errors below are reported on the original rating scale.
# NOTE: this rebinds y / y_pred in place, so running the cell twice halves them twice.
y, y_pred = (np.array(values) / 2 for values in (y, y_pred))

In [None]:
# Mean absolute error between held-out ratings and predicted rates.
MAE = mean_absolute_error(y_true=y, y_pred=y_pred)
print(f"The rating's MAE is: {MAE}")

In [None]:
# Mean squared error between held-out ratings and predicted rates.
MSE = mean_squared_error(y_true=y, y_pred=y_pred)
print(f"The rating's MSE is: {MSE}")

In [None]:
# Root mean squared error: square root of a freshly computed MSE.
squared_error = mean_squared_error(y_true=y, y_pred=y_pred)
RMSE = np.sqrt(squared_error)
print(f"The rating's RMSE is: {RMSE}")