In [1]:
# if it doesn't work, try to install the https://rapids.ai/cudf-pandas/
%load_ext cudf.pandas

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [3]:
class Model:
    def __init__(self):
        self.users: pd.DataFrame
        self.movies: pd.DataFrame
        self.ratings: pd.DataFrame

    def fit(self, movies: pd.DataFrame, ratings: pd.DataFrame):
        movies.genres = movies.genres.str.split('|')
        self.movies = movies
        self.ratings = ratings
        selected_genres = self.get_relevant_genres()
        self.users = self.get_users(selected_genres)
        self.clusterUsers(20)
        return self.users
    
    def get_relevant_genres(self, threshold = 0.04):
        genre_count = self.movies['genres'].explode().value_counts().reset_index()
        genre_count.columns = ['genre', 'count']
        genre_count = genre_count[genre_count["count"] > self.movies.size*threshold]
        selected_genres = genre_count['genre'].tolist()
        return selected_genres  
    
    def get_users(self, selected_genres):
        users = pd.DataFrame(self.ratings['userId'].unique(), columns=['userId'])
        df = pd.merge(self.movies, self.ratings, on='movieId')
        for genre in selected_genres:
            genre_ratings = df[df['genres'].apply(lambda x: genre in x)]
            users[f"{genre}"] = users.apply(lambda x: genre_ratings[genre_ratings['userId'] == x['userId']]['rating'].mean(), axis=1)
        users = users.fillna(0)
        return users
    
    def clusterUsers(self, k = 50):
        kmeans = KMeans(n_clusters=k)
        usersCluster = kmeans.fit(self.users.drop('userId', axis=1))
        self.users['cluster'] = usersCluster.labels_

    def predict(self, userID, movieID):
        return self.predictByCLuster(userID, movieID)

    def predictByCLuster(self, userID, movieID):
        userCluster = self.users[self.users['userId'] == userID]['cluster'].values[0]
        usersInCluster = self.users[self.users['cluster'] == userCluster]['userId']
        return self.ratings[self.ratings['userId'].isin(usersInCluster)].loc[self.ratings['movieId'] == movieID].rating.mean()
    

In [4]:
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')

train_ratings, test_ratings = train_test_split(ratings, test_size=0.95)

In [5]:
model = Model()
model.fit(movies, train_ratings)

In [None]:
test_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
16720647,164249,30793,3.0,1270740671
16015006,157200,6713,3.0,1588718152
11402879,112278,1729,4.0,1137235396
870399,8347,466,4.0,846286578
4441355,43359,345,3.0,845385966
4330456,42298,1617,4.5,1459091600
20889042,204007,3911,3.5,1452309807
22909593,223289,1036,4.0,1043031213
9523951,93651,7792,3.0,1459158685
21276733,207671,62,3.5,1180841935


In [None]:
for row in test_ratings.iloc[:10, :].iterrows():
    print(model.predictByCLuster(row[1]['userId'], row[1]['movieId']), row[1]['rating'])

2.757751937984496 3.0
4.0 3.0
4.0 4.0
0.6666666666666666 4.0
4.01360544217687 3.0
3.6322674418604652 4.5
3.9824120603015074 3.5
3.8566929133858268 4.0
3.3636363636363638 3.0
3.1435406698564594 3.5


In [None]:
df1 = pd.DataFrame([2, 5], columns=['Movie1'])
df2 = pd.DataFrame(np.array([[False, False], [True, False], [False, False], [False, False], [False, True]]).T, 
                  columns=['Movie1_1', 'Movie1_2', 'Movie1_3', 'Movie1_4', 'Movie1_5'])
print(df1)
print(df2)

   Movie1
0       2
1       5
   Movie1_1  Movie1_2  Movie1_3  Movie1_4  Movie1_5
0     False      True     False     False     False
1     False     False     False     False      True
