## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from skopt import gp_minimize
import csv

In [2]:
# One seed feeds every RNG used in this notebook (Python's `random` and NumPy's
# legacy global generator) so shuffles and splits are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Data Preprocessing

In [3]:
# Load the two raw tables. Paths are relative to the notebook's working directory.
# `ratings` supplies userId / movieId / rating / timestamp; `movies` supplies
# movieId / title / genres (these are the columns used further down).
ratings = pd.read_csv('../Dataset/ratings.csv')
movies = pd.read_csv('../Dataset/movies.csv')

In [5]:
# Work on an independent (deep) copy so the raw `movies` frame stays untouched.
movies_data = movies.copy(deep=True)

In [6]:
# Pull the 4-digit release year out of the "(YYYY)" part of the title into its own
# numeric column. The patterns are raw strings: in a plain string "\(" and "\d" are
# invalid escape sequences, which newer Python versions warn about.
# pd.to_numeric keeps missing years as NaN (those rows are dropped below) and does
# not depend on the year fitting into the datetime64 range.
movies_data['year'] = pd.to_numeric(
    movies_data['title'].str.extract(r"\((\d{4})\)", expand=False)
)
# Remove only a real trailing "(YYYY)" (plus surrounding whitespace) instead of
# blindly slicing off the last 7 characters, which truncates titles that do not
# end with a year.
movies_data['title'] = movies_data['title'].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

# one hot encoding for genres ('|' separates multiple genres of one movie)
movies_data = movies_data.join(movies_data['genres'].str.get_dummies(sep='|'))

# create a TF-IDF vectorizer for the titles and insert into the movies dataframe
# (the TF-IDF columns are named by their integer vocabulary position)
tfidf = TfidfVectorizer(stop_words='english')
movies_data['title'] = movies_data['title'].str.lower()
tfidf_matrix = tfidf.fit_transform(movies_data['title'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=movies_data.index.tolist())
movies_data = pd.concat([movies_data, tfidf_df], axis=1, join='inner').drop(['genres', 'title'], axis=1)

# remove movies with the genre 'no genres listed'
movies_data = movies_data[movies_data['(no genres listed)'] == 0]
movies_data = movies_data.drop('(no genres listed)', axis=1)

# remove movies with no year
movies_data = movies_data[movies_data['year'].notna()]

In [7]:
# Rescale the release year to the [0, 1] range with a min-max scaler and keep
# four decimal places.
scaler = MinMaxScaler()
movies_data['year'] = np.round(scaler.fit_transform(movies_data[['year']]).ravel(), 4)

In [8]:
# Keep only movies that received at least one rating, then promote movieId from
# a regular column to the frame's index.
movies_data = (
    movies_data[movies_data['movieId'].isin(ratings['movieId'].unique())]
    .set_index('movieId')
)

In [9]:
# Drop ratings that refer to movies removed during preprocessing
# (movies_data is indexed by movieId at this point).
ratings = ratings.loc[ratings['movieId'].isin(movies_data.index)]

In [10]:
# create a ratings matrix (one row per user, one column per movie); pairs the
# user did not rate are NaN at this point
ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')

# get the number of ratings for each user, most active users first
user_counts = ratings_matrix.count(axis=1).to_frame('count')
user_counts['userId'] = user_counts.index
user_counts = user_counts.sort_values('count', ascending=False)

# Fill NaNs with 0 (0 stands for "not rated") and order the users by their number
# of ratings (descending). A single reindex is enough: the previous extra reindex
# on the non-zero counts was immediately overridden by this one.
ratings_matrix = ratings_matrix.fillna(0).reindex(user_counts.index)

# split into (50/20/30) train/(validation/test folds)
# shuffle=False keeps the ordering above, so the most active half of the users
# becomes `train`; random_state has no effect when shuffle is disabled.
train, val_test_folds = train_test_split(ratings_matrix, test_size=0.5, random_state=SEED, shuffle=False)

In [11]:
# Every unordered pair of the 5 folds is used once as a validation set ...
val_combinations = [(first, second) for first in range(5) for second in range(first + 1, 5)]
# ... and the three folds left out of that pair form the matching test set, so
# both lists are aligned position by position.
test_combinations = [
    tuple(fold for fold in range(5) if fold not in pair)
    for pair in val_combinations
]

In [12]:
# shuffle the val/test users so fold membership does not follow the
# activity-based row ordering
val_test_folds = val_test_folds.sample(frac=1, random_state=SEED)

# split into 5 folds for cross validation
# The row *positions* are partitioned and the frame is sliced with .iloc; passing
# the DataFrame itself to np.array_split goes through the deprecated
# DataFrame.swapaxes. Fold sizes and order are the same as before.
# NOTE: this cell rebinds val_test_folds from a DataFrame to a list of DataFrames,
# so it must not be re-run without re-running the split cell first.
fold_positions = np.array_split(np.arange(len(val_test_folds)), 5)
val_test_folds = [val_test_folds.iloc[positions] for positions in fold_positions]

## Hyperparameter tuning

In [13]:
# Validation set read by objective() as a module-level global. It must be assigned
# a users x movies ratings DataFrame before objective() is called; while it is
# None, objective() cannot run.
val_set = None

In [14]:
def objective(params):
    """Return 1 - mean precision@10 of a user-based k-NN recommender on `val_set`.

    Written as a minimisation target: lower is better.

    Reads the module-level globals `train` (users x movies rating matrix the
    neighbours are searched in), `val_set` (users x movies rating matrix to
    evaluate on) and `ratings` (long-format ratings with a `timestamp` column).

    Parameters
    ----------
    params : sequence
        params[0] is the number of neighbours, params[1] the distance metric
        name passed to sklearn's NearestNeighbors.

    Returns
    -------
    float
        1 - (mean over validation users of hits / 10).

    Raises
    ------
    ValueError
        If the global `val_set` has not been assigned yet.
    """
    if val_set is None:
        raise ValueError("val_set must be assigned a ratings DataFrame before calling objective()")

    n_neighbors = int(params[0])
    metric = str(params[1])

    # Materialise the training matrix once; it is reused for every user below.
    train_values = train.values
    nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, n_jobs=-1)
    nearest_neighbors.fit(train_values)

    users_precision = []

    # size of the held-out set and of the recommendation list
    n = 10

    for user in val_set.index:
        user_data = val_set.loc[user]
        # get the movies the user has rated
        rated_movies = user_data[user_data > 0].index.tolist()

        # order them by rating time, newest first
        rated_movies = ratings.loc[ratings['movieId'].isin(rated_movies) & (ratings['userId'] == user)].sort_values(by=['timestamp'], ascending=False)['movieId'].tolist()

        # NOTE(review): with the newest-first ordering, the last n entries (the
        # held-out ones) are the user's OLDEST ratings - confirm this is intended.
        user_movies_watched = rated_movies[:-n]
        user_movies_to_predict = rated_movies[-n:]

        # hide the held-out movies from the query profile
        user_data_temp = user_data.copy()
        user_data_temp.loc[user_movies_to_predict] = 0

        _, indices = nearest_neighbors.kneighbors(user_data_temp.values.reshape(1, -1), n_neighbors=n_neighbors)

        # One vote per neighbour for every movie that neighbour rated, counted in
        # a single vectorised pass. The counts line up with train's columns, so
        # the frame is labelled with train.columns.
        votes = (train_values[indices[0]] > 0).sum(axis=0)
        movies_to_recommend = pd.DataFrame({'recommendations': votes.astype(float)}, index=train.columns)

        movies_to_recommend = movies_to_recommend.sort_values(by=['recommendations'], ascending=False)

        # never recommend something still visible in the user's profile
        movies_to_recommend = movies_to_recommend[~movies_to_recommend.index.isin(user_movies_watched)]

        movies_to_recommend = movies_to_recommend[:n]

        hits = len(set(movies_to_recommend.index.tolist()) & set(user_movies_to_predict))

        users_precision.append(hits / n)

    final_precision = np.mean(users_precision)
    return 1 - final_precision
    

In [15]:
# space=[list(range(3,31)), ['euclidean', 'manhattan', 'cosine']]

In [16]:
# # create csv file
# with open('best_FC_NN.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(["val_combination", "n_neighbors", "metric", "precision"])

In [17]:
# for i, folds in enumerate(val_combinations):
#     val_folds = [val_test_folds[i] for i in folds]
#     val_set = pd.concat(val_folds)
#     result = gp_minimize(objective, space, verbose=1, n_calls=20, n_random_starts=5, n_jobs=-1, random_state=SEED)

#     # save the best parameters
#     best_params = {}    
#     best_params['n_neighbors'] = result.x[0]
#     best_params['metric'] = result.x[1]
#     best_params['folds'] = folds
#     best_params['precision'] = 1 - result.fun

#     # csv file
#     with open('best_FC_NN.csv', 'a', newline='') as file:
#         writer = csv.writer(file)
#         writer.writerow([best_params['folds'], best_params['n_neighbors'], best_params['metric'], best_params['precision']])

## Evaluation

In [18]:
# Settings for the final evaluation: one neighbourhood size per entry of
# test_combinations (same order), with a single distance metric for all of them.
metric = 'cosine'
n_neighbors = [
    15, 16, 15, 15, 17,
    29, 15, 29, 16, 29,
]

In [19]:
# Fit the neighbour index once on the training users; the number of neighbours
# is supplied per query during evaluation.
nearest_neighbors = NearestNeighbors(n_jobs=-1, metric=metric, algorithm='brute')
nearest_neighbors.fit(train.to_numpy())

In [20]:
# Evaluate precision@{3, 5, 10} on every test-fold combination, using the number
# of neighbours stored at the same position in `n_neighbors`, and append one CSV
# row per (combination, n) to test_FC_nn.csv.
top_ns = [3, 5, 10]

# Materialise the training matrix once; it is reused for every query below.
train_values = train.values

for idx, folds in enumerate(test_combinations):
    # create the test set from the folds of this combination
    test_set = pd.concat([val_test_folds[i] for i in folds])

    users_precision = {n: [] for n in top_ns}

    for user in test_set.index:
        user_data = test_set.loc[user]
        # get the movies the user has rated
        rated_movies = user_data[user_data > 0].index.tolist()

        # order them by rating time, newest first
        rated_movies = ratings.loc[ratings['movieId'].isin(rated_movies) & (ratings['userId'] == user)].sort_values(by=['timestamp'], ascending=False)['movieId'].tolist()

        for n in top_ns:
            # NOTE(review): with the newest-first ordering, the last n entries
            # (the held-out ones) are the user's OLDEST ratings - confirm intended.
            user_movies_watched = rated_movies[:-n]
            user_movies_to_predict = rated_movies[-n:]

            # hide the held-out movies from the query profile
            user_data_temp = user_data.copy()
            user_data_temp.loc[user_movies_to_predict] = 0

            distances, indices = nearest_neighbors.kneighbors(user_data_temp.values.reshape(1, -1), n_neighbors=n_neighbors[idx])

            # One vote per neighbour for every movie that neighbour rated, counted
            # in a single vectorised pass. The counts line up with train's
            # columns, so the frame is labelled with train.columns.
            votes = (train_values[indices[0]] > 0).sum(axis=0)
            movies_to_recommend = pd.DataFrame({'recommendations': votes.astype(float)}, index=train.columns)

            movies_to_recommend = movies_to_recommend.sort_values(by=['recommendations'], ascending=False)

            # never recommend something still visible in the user's profile
            movies_to_recommend = movies_to_recommend[~movies_to_recommend.index.isin(user_movies_watched)]

            movies_to_recommend = movies_to_recommend[:n]

            hits = len(set(movies_to_recommend.index.tolist()) & set(user_movies_to_predict))

            users_precision[n].append(hits / n)

    # save in a file csv - the average and std for each n
    # csv.writer quotes the fold tuple (its text contains commas), so every row
    # keeps exactly four columns: folds, n, average, std.
    with open('test_FC_nn.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        for n in top_ns:
            precision = np.array(users_precision[n])
            average = float(precision.mean())
            std = float(precision.std())
            writer.writerow([folds, n, average, std])
