In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import os.path
from sklearn.metrics import mean_squared_error

In [2]:
# env : ["Test", "Prod"]
# before testing run prepare_test_data.ipynb notebook
env = "Test"

In [3]:
if env == "Test":
    folder_path = "./data/test"
else: folder_path = "./data"
movies = pd.read_csv(f'{folder_path}/movies.csv')
ratings = pd.read_csv(f'{folder_path}/ratings.csv')

In [4]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.01, shuffle=True)

In [5]:
class Model:
    def __init__(self, k = 30):
        self.users: pd.DataFrame
        self.movies: pd.DataFrame
        self.ratings: pd.DataFrame
        self.usersFName = f'./data/users{env}.csv'
        self.k = k

    def fit(self, movies: pd.DataFrame, ratings: pd.DataFrame, update_users = False):
        movies.genres = movies.genres.str.split('|')
        self.movies = movies
        self.ratings = ratings
        if not os.path.isfile(self.usersFName) or update_users:
            selected_genres = self.get_relevant_genres()
            self.users = self.get_users(selected_genres)
        else:
            self.users = pd.read_csv(self.usersFName)
        self.users.to_csv(self.usersFName, index=False)
        self.clusterUsers()
    
    def get_relevant_genres(self, threshold = 0.001):
        genre_count = self.movies['genres'].explode().value_counts().reset_index()
        genre_count.columns = ['genre', 'count']
        genre_count = genre_count[genre_count["count"] > self.movies.size*threshold]
        selected_genres = genre_count['genre'].tolist()
        return selected_genres  
    
    def get_users(self, selected_genres):
        users = pd.DataFrame(self.ratings['userId'].unique(), columns=['userId'])
        df = pd.merge(self.movies, self.ratings, on='movieId')
        for genre in selected_genres:
            genre_ratings = df[df['genres'].apply(lambda x: genre in x)]
            users[f"{genre}"] = users.apply(lambda x: genre_ratings[genre_ratings['userId'] == x['userId']]['rating'].mean(), axis=1)
        return users
    
    def clusterUsers(self):
        kmeans = KMeans(n_clusters=self.k)
        self.users = self.users.fillna(self.users.mean())
        usersCluster = kmeans.fit(self.users.drop('userId', axis=1))
        self.users['cluster'] = usersCluster.labels_

    def predict(self, userID, movieID):
        clusterResult = self.predictByCLuster(userID, movieID)
        if not np.isnan(clusterResult):
            return clusterResult
        dummyResult = self.predictByDummy(userID, movieID)
        if not np.isnan(dummyResult):
            return dummyResult
        return self.ratings.rating.mean()
    
    def predictDummy(self, userID, movieID):
        dummyResult = self.predictByDummy(userID, movieID)
        if not np.isnan(dummyResult):
            return dummyResult
        return self.ratings.rating.mean()

    def predictByCLuster(self, userID, movieID):
        if userID not in self.users['userId'].values:
            return np.nan
        userCluster = self.users.loc[self.users['userId'] == userID, 'cluster'].values[0]
        usersInCluster = self.users.loc[self.users['cluster'] == userCluster, 'userId']
        ratings = self.ratings[(self.ratings['movieId'] == movieID) & (self.ratings['userId'].isin(usersInCluster))]
        return ratings['rating'].mean()
    
    def predictByDummy(self, userID, movieID):
        return self.ratings.loc[self.ratings['movieId'] == movieID].rating.mean()

    def getBaseline(self, test: pd.DataFrame):
        test["prediction"] = test.apply(lambda x: self.predictDummy(x["userId"], x["movieId"]), axis=1)
        return mean_squared_error(y_true = test.rating, y_pred = test.prediction)

    def getMSE(self, test: pd.DataFrame):
        test["prediction"] = test.apply(lambda x: self.predict(x["userId"], x["movieId"]), axis=1)
        return mean_squared_error(y_true = test.rating, y_pred = test.prediction)

In [6]:
model = Model()
model.fit(movies.copy(), train_ratings, update_users=True)

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
model = Model()
model.fit(movies.copy(), train_ratings, update_users=False)
model.getBaseline(test_ratings)
print("Baseline MSE: ", model.getBaseline(test_ratings))
for k in range(1, 100):
    model = Model(k)
    model.fit(movies.copy(), train_ratings)
    mse = model.getMSE(test_ratings)
    print(f"k = {k}, MSE = {mse}")


Baseline MSE:  0.9672877285762063
k = 1, MSE = 0.9672877285762063
k = 2, MSE = 0.8724634786336792
k = 3, MSE = 0.8551478001140462
k = 4, MSE = 0.8565559712252793
k = 5, MSE = 0.8582999781930115
k = 6, MSE = 0.8653358341327657


In [None]:
model = Model()
model.fit(movies, train_ratings, update_users = True)

In [None]:
model.getMSE(test_ratings)

0.873226867637611

In [None]:
df1 = pd.DataFrame([2, 5], columns=['Movie1'])
df2 = pd.DataFrame(np.array([[False, False], [True, False], [False, False], [False, False], [False, True]]).T, 
                  columns=['Movie1_1', 'Movie1_2', 'Movie1_3', 'Movie1_4', 'Movie1_5'])
print(df1)
print(df2)

   Movie1
0       2
1       5
   Movie1_1  Movie1_2  Movie1_3  Movie1_4  Movie1_5
0     False      True     False     False     False
1     False     False     False     False      True
