## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import optuna

In [None]:
# Single seed reused below for the train/test split, the fold shuffle and KMeans.
SEED = 42
# Seed NumPy's global random number generator as well.
np.random.seed(SEED)

## Data Preprocessing

In [None]:
# Raw ratings table; the cells below read its userId, movieId, rating and
# timestamp columns.
ratings = pd.read_csv('../Dataset/ratings.csv')

In [None]:
# Build the user x movie ratings matrix, order users by activity and carve out
# the training half.

# one row per user, one column per movie; unrated pairs are NaN at this point
ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')

# number of ratings per user (counted before the NaNs are filled, so only
# real ratings are counted), most active users first
user_counts = pd.DataFrame(ratings_matrix.count(axis=1), columns=['count'])
user_counts['userId'] = user_counts.index
user_counts = user_counts.sort_values('count', ascending=False)

# Fill NaNs with 0 (0 therefore means "not rated")
ratings_matrix.fillna(0, inplace=True)

# Order the users by their number of ratings (descending). A single reindex
# is enough: the previous extra reindex by non-zero count was immediately
# overridden by this one and had no effect on the result.
ratings_matrix = ratings_matrix.reindex(user_counts.index)

# split into (50/20/30) train/(validation/test folds)
# shuffle=False keeps the ordering above: the first half of the rows
# (the most active users) becomes the training set.
train, val_test_folds = train_test_split(ratings_matrix, test_size=0.5, random_state=SEED, shuffle=False)

In [None]:
# Every way of picking 2 of the 5 folds for validation, in lexicographic order.
val_combinations = [
    (first, second)
    for first in range(5)
    for second in range(first + 1, 5)
]
# The matching test folds: for each validation pair, the 3 folds left over.
test_combinations = [
    tuple(fold for fold in range(5) if fold not in pair)
    for pair in val_combinations
]

In [None]:
# shuffle the held-out half so the folds are not ordered by rating count
val_test_folds = val_test_folds.sample(frac=1, random_state=SEED)

# Split into 5 folds for cross validation.
# The row positions are split (same fold sizes as np.array_split on the frame)
# and each fold is taken with iloc. Calling np.array_split directly on a
# DataFrame relies on the deprecated DataFrame.swapaxes and would stop
# returning DataFrames once that method is gone.
fold_positions = np.array_split(np.arange(len(val_test_folds)), 5)
val_test_folds = [val_test_folds.iloc[positions] for positions in fold_positions]

In [None]:
# Placeholder; reassigned to the concatenated test folds inside the evaluation loop.
test_set = None

## Data processing

In [None]:
# KMeans hyperparameters used for the test-set run below
best_params = dict(n_clusters=100, max_iter=300)

In [None]:
# Build the model from the chosen hyperparameters and train it on the
# training users in one step (fit returns the estimator itself).
kmeans = KMeans(
    n_clusters=best_params['n_clusters'],
    max_iter=best_params['max_iter'],
    random_state=SEED,
).fit(train)

# cluster id of every training user, in the row order of `train`
labels = kmeans.labels_

In [None]:
# Evaluate the KMeans recommender on every combination of three test folds.
#
# For each test user and each n in (3, 5, 10): the last n movies of the
# user's timestamp-ordered history are held out, the user is assigned to a
# cluster from the remaining ratings, and the n unwatched movies with the
# highest summed rating among the training users of that cluster are
# recommended. precision@n = hits / n. The mean and standard deviation over
# all test users are appended to KMeans_test.csv, one row per (folds, n).

# Summed training ratings per cluster, computed once instead of per user.
# `labels` is positional with respect to `train` (row i of `train` belongs to
# cluster labels[i]), so the grouping has to be done on `train` itself;
# using those positions as userId labels in `ratings_matrix.loc` selects the
# wrong users. Clusters without any training user get an all-zero row.
cluster_scores = (
    train.groupby(labels).sum()
    .reindex(range(kmeans.n_clusters), fill_value=0)
)

for folds in test_combinations:

    # get the test set
    test_set = pd.concat([val_test_folds[x] for x in folds])

    # precision of every test user, keyed by the cut-off n
    users_precision = {3: [], 5: [], 10: []}

    for user in test_set.index:
        # the user's full rating row (0 = not rated), looked up once
        user_ratings = ratings_matrix.loc[user]
        rated_movies = user_ratings.index[user_ratings > 0]

        # order the user's rated movies by timestamp (descending)
        # NOTE(review): with a descending sort the held-out "last n" movies
        # below are the user's OLDEST ratings, predicted from newer ones.
        # If the most recent movies were meant to be held out, this should
        # be ascending=True -- confirm the intended protocol.
        rated_movies = ratings.loc[
            ratings['movieId'].isin(rated_movies)
            & (ratings['userId'] == user)
        ].sort_values(by=['timestamp'], ascending=False)['movieId'].tolist()

        for n in [3, 5, 10]:
            user_movies_watched = rated_movies[:-n]
            user_movies_to_predict = rated_movies[-n:]

            # profile of the user restricted to the watched movies: the
            # held-out (and unrated) movies are set to 0
            watched_mask = user_ratings.index.isin(user_movies_watched)
            user_data = user_ratings.where(watched_mask, 0).to_numpy().reshape(1, -1)

            # cluster the partial profile falls into
            cluster = kmeans.predict(user_data)[0]

            # the n best-scored movies of that cluster the user has not
            # watched (highest summed rating first)
            movies_rate = cluster_scores.loc[cluster]
            movies_rate = movies_rate[~watched_mask].nlargest(n)

            # compare recommended movie ids (the index, not the scores)
            # with the held-out movies
            hits = len(set(movies_rate.index).intersection(user_movies_to_predict))

            precision = hits / n
            users_precision[n].append(precision)

    # NOTE(review): `folds` is written as a tuple, e.g. "(2, 3, 4)", whose
    # commas are not quoted; readers of this file must account for that.
    # The row format is kept unchanged for compatibility with existing rows.
    with open('KMeans_test.csv', 'a') as f:
        for n in [3, 5, 10]:
            precision = np.array(users_precision[n])
            average = precision.mean()
            std = precision.std()
            f.write(f'{folds},{n},{average},{std}\n')