In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from skopt import gp_minimize
import csv

In [None]:
# Single seed shared by every source of randomness in the notebook
# (Python's `random`, NumPy's legacy global generator, and the
# `random_state=` arguments further down).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


## Data Preprocessing

In [None]:
# Raw input tables. The two reads are independent of each other.
# `ratings` supplies userId / movieId / rating / timestamp and `movies`
# supplies movieId / title / genres, as used by the cells below.
ratings = pd.read_csv('../Dataset/ratings.csv')
movies = pd.read_csv('../Dataset/movies.csv')

In [None]:
# Work on an independent (deep) copy so the raw `movies` table stays untouched.
movies_data = movies.copy(deep=True)

In [None]:
# Turn the raw movie table into a purely numeric feature table:
# release year + one-hot genres + TF-IDF of the title words.

# Pull the four-digit year out of "<title> (YYYY)". The pattern is a raw
# string so that `\(` and `\d` reach the regex engine instead of being parsed
# as (invalid) Python string escapes. Titles without a year yield NaN.
movies_data['year'] = pd.to_numeric(
    movies_data['title'].str.extract(r"\((\d{4})\)", expand=False)
)

# Drop the trailing " (YYYY)" (7 characters) from the title.
# NOTE: the slice is unconditional, so titles that carry no year lose their
# last 7 characters too; those rows are removed by the `year` filter below,
# but they still take part in fitting the TF-IDF vocabulary.
movies_data['title'] = movies_data['title'].str[:-7]

# one hot encoding for genres ("A|B|C" -> one 0/1 column per genre)
movies_data = movies_data.join(movies_data['genres'].str.get_dummies(sep='|'))

# TF-IDF of the lower-cased titles, appended as one dense column per term.
# The TF-IDF columns are labelled with integers, the others with strings.
tfidf = TfidfVectorizer(stop_words='english')
movies_data['title'] = movies_data['title'].str.lower()
tfidf_matrix = tfidf.fit_transform(movies_data['title'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=movies_data.index)
movies_data = pd.concat([movies_data, tfidf_df], axis=1, join='inner').drop(['genres', 'title'], axis=1)

# remove movies with the genre 'no genres listed'; the column only exists when
# at least one movie carries that marker, so guard against a KeyError
if '(no genres listed)' in movies_data.columns:
    movies_data = movies_data[movies_data['(no genres listed)'] == 0]
    movies_data = movies_data.drop('(no genres listed)', axis=1)

# remove movies with no year
movies_data = movies_data[movies_data['year'].notna()]

In [None]:
# Rescale `year` to [0, 1] with min-max scaling and keep four decimals.
# fit_transform returns an (n, 1) array; its single column is written back.
scaler = MinMaxScaler()
movies_data['year'] = np.round(scaler.fit_transform(movies_data[['year']])[:, 0], 4)

In [None]:
# Keep only movies that appear in `ratings`, then promote movieId from a
# column to the row index of the feature table.
movies_data = (
    movies_data[movies_data['movieId'].isin(ratings['movieId'].unique())]
    .set_index('movieId')
)

In [None]:
# Drop ratings of movies that did not survive the feature preprocessing
# (movies_data is indexed by movieId at this point).
ratings = ratings.loc[ratings['movieId'].isin(movies_data.index)]

In [None]:
# create a ratings matrix (users x movies); cells without a rating are NaN
ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')

# get the number of ratings for each user (counted while unrated cells are
# still NaN), most active users first
user_counts = pd.DataFrame(ratings_matrix.count(axis=1), columns=['count'])
user_counts['userId'] = user_counts.index
user_counts = user_counts.sort_values('count', ascending=False)

# Fill NaNs with 0, i.e. "not rated"
ratings_matrix = ratings_matrix.fillna(0)

# order the users by the number of ratings (descending); a single reindex on
# the already sorted `user_counts` index is enough
ratings_matrix = ratings_matrix.reindex(user_counts.index)

# split into (50/20/30) train/(validation/test folds)
# shuffle=False makes the split positional: the first half of the rows (the
# most active users) becomes `train`, the rest is split into folds later.
train, val_test_folds = train_test_split(ratings_matrix, test_size=0.5, random_state=SEED, shuffle=False)

In [None]:
# Every way of choosing 2 of the 5 folds for validation, in lexicographic
# order, and at the same position the 3 remaining folds used for testing.
val_combinations = [(first, second) for first in range(5) for second in range(first + 1, 5)]
test_combinations = [
    tuple(fold for fold in range(5) if fold not in val_pair)
    for val_pair in val_combinations
]

In [None]:
# shuffle the val_test users (reproducibly)
val_test_folds = val_test_folds.sample(frac=1, random_state=SEED)

# split into 5 folds for cross validation
# The row *positions* are split rather than the DataFrame itself:
# np.array_split on a DataFrame relies on the deprecated DataFrame.swapaxes.
# The resulting partition (fold sizes and row order) is the same.
val_test_folds = [
    val_test_folds.iloc[fold_rows]
    for fold_rows in np.array_split(np.arange(len(val_test_folds)), 5)
]

In [None]:
# Placeholder so the name exists at module level; the evaluation loop rebinds
# it to the concatenated test folds of the fold combination being evaluated.
test_set = None

## Data processing

In [None]:
# Neighbourhood size for the collaborative filter, one entry per fold
# combination; cf() selects the entry by position.
cf_n_neighbors = [15,16,15,15,17,29,15,29,16,29]

# Brute-force cosine nearest-neighbour search over the training users'
# rating vectors (fit() returns the fitted estimator itself).
cf_nearest_neighbors = NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1).fit(train.values)

def cf(user_movies_watched, user_data_temp, idx):
    """Collaborative-filtering recommender: movies rated by the most similar users.

    The user's rating vector is matched against the training users through the
    module-level ``cf_nearest_neighbors`` model. Every movie then receives one
    vote per neighbour that rated it (rating > 0).

    Parameters
    ----------
    user_movies_watched : list
        Movie ids the user has already seen; they are removed from the result.
    user_data_temp : pandas.Series
        The user's ratings row (held-out movies already zeroed by the caller).
        It is reshaped into a single query row for ``kneighbors``.
    idx : int
        Position of the current fold combination; selects the number of
        neighbours from ``cf_n_neighbors``.

    Returns
    -------
    list
        Up to 30 movie ids, most-voted first.
    """
    query = user_data_temp.values.reshape(1, -1)
    _, indices = cf_nearest_neighbors.kneighbors(query, n_neighbors=cf_n_neighbors[idx])

    # Count, for all movies at once, how many neighbours rated each one. This
    # replaces a Python loop that incremented one DataFrame cell at a time.
    neighbor_ratings = train.iloc[indices[0]]
    votes = (neighbor_ratings > 0).sum(axis=0)

    # Lay the counts out in movies_data.index order as floats, which is the
    # layout the sort below was applied to before, so ties keep their order.
    # This also removes the need for the global `test_set` to size the table.
    movies_to_recommend = pd.DataFrame(
        {'recommendations': votes.reindex(movies_data.index, fill_value=0).astype(float)}
    )

    movies_to_recommend = movies_to_recommend.sort_values(by=['recommendations'], ascending=False)

    # Never recommend something the user has already watched.
    movies_to_recommend = movies_to_recommend[~movies_to_recommend.index.isin(user_movies_watched)]

    return movies_to_recommend[:30].index.tolist()

In [None]:
# Settings of the content-based recommender: cb() reads both lists at the same
# position `idx`, giving the neighbour count and the distance metric to use.
# NOTE(review): presumably one tuned (n_neighbors, metric) pair per entry of
# test_combinations — confirm where these values were obtained.
cb_n_neighbors = [4, 5, 4, 19, 4, 4, 4, 4, 25, 7]
cb_metric = ['euclidean', 'manhattan', 'euclidean', 'cosine', 'euclidean', 'euclidean', 'euclidean', 'euclidean', 'cosine', 'cosine']

# Single-slot memo of the most recently fitted content-based model. Only one
# model is kept because each fitted model holds its own dense copy of the
# feature matrix.
_cb_fitted = {'frame': None, 'config': None, 'model': None}


def _cb_model(idx):
    """Return a NearestNeighbors model fitted on ``movies_data`` for fold ``idx``.

    The model is refitted only when the (n_neighbors, metric) pair changes or
    when ``movies_data`` has been rebound to another DataFrame.
    """
    config = (cb_n_neighbors[idx], cb_metric[idx])
    if _cb_fitted['frame'] is not movies_data or _cb_fitted['config'] != config:
        model = NearestNeighbors(n_neighbors=config[0], algorithm='brute', metric=config[1], n_jobs=-1)
        model.fit(movies_data.values)
        _cb_fitted.update(frame=movies_data, config=config, model=model)
    return _cb_fitted['model']


def cb(n, user_movies_watched, idx):
    """Content-based recommender: movies close to the last ``n`` watched ones.

    Parameters
    ----------
    n : int
        Number of entries taken from the end of ``user_movies_watched`` to use
        as query ("seed") movies.
    user_movies_watched : list
        Movie ids (labels of ``movies_data.index``) the user has already seen;
        they are excluded from the result.
    idx : int
        Position of the current fold combination; selects the neighbour count
        and metric from ``cb_n_neighbors`` / ``cb_metric``.

    Returns
    -------
    list
        Up to 30 movie ids ordered by ascending summed distance to the seeds.
        Empty when there are no seed movies.
    """
    seed_movies = user_movies_watched[-n:]
    if len(seed_movies) == 0:
        # kneighbors() rejects an empty query matrix; without seeds there is
        # nothing to recommend.
        return []

    cb_nearest_neighbors = _cb_model(idx)
    data_to_predict = movies_data.loc[seed_movies].values
    distances, indices = cb_nearest_neighbors.kneighbors(data_to_predict)

    # Sum, per candidate movie (row position in movies_data), the distances to
    # every seed that returned it. np.add.at accumulates repeated positions one
    # by one in row-major order, like the nested loop it replaces.
    # NOTE(review): because distances are summed, a movie returned for several
    # seeds ranks lower than one returned once — confirm this is intended.
    summed_distance = np.zeros(movies_data.shape[0])
    np.add.at(summed_distance, indices.ravel(), distances.ravel())

    # Positions that were returned at least once, in ascending position order.
    candidate_rows = np.unique(indices)

    similar_movies_data = pd.DataFrame(
        {'distance': summed_distance[candidate_rows]},
        index=movies_data.index[candidate_rows],
    )
    similar_movies_data = similar_movies_data.sort_values(by=['distance'], ascending=True)

    # Never recommend something the user has already watched (seeds included).
    similar_movies_data = similar_movies_data[~similar_movies_data.index.isin(user_movies_watched)]

    return similar_movies_data.head(30).index.tolist()


In [None]:
# Evaluate the hybrid (collaborative + content-based) recommender on every
# test fold combination and append precision@n (mean and standard deviation
# over users) to test_hybrid_nn.csv.
for idx, folds in enumerate(test_combinations):
    # Users belonging to the folds of this combination.
    test_folds = [val_test_folds[i] for i in folds]
    test_set = pd.concat(test_folds)

    # precision@n of every test user, keyed by n
    users_precision = {3: [], 5: [], 10: []}

    for user in test_set.index:
        user_data = test_set.loc[user]

        # Movies the user rated (non-zero cells of the ratings row).
        user_row = ratings_matrix.loc[user]
        rated_movies = user_row[user_row > 0].index.tolist()

        # Order the user's rated movies by timestamp, newest first.
        # NOTE(review): this spot used to be commented as "ascending" while the
        # code sorts descending. With the current order the tail slices below
        # hold out the user's OLDEST n ratings. Behaviour is left unchanged —
        # confirm which order the experiment is meant to use.
        rated_movies = ratings.loc[
            ratings['movieId'].isin(rated_movies) &
            (ratings['userId'] == user)].sort_values(
                by=['timestamp'], ascending=False
                )['movieId'].tolist()

        for n in [3, 5, 10]:
            # Hold out the last n movies of the ordered list; the rest counts
            # as watched.
            user_movies_watched = rated_movies[:-n]
            user_movies_to_predict = rated_movies[-n:]

            # Hide the held-out ratings from the collaborative filter.
            user_data_temp = user_data.copy()
            user_data_temp[user_movies_to_predict] = 0

            # Both lists come back best-first with at most 30 entries.
            cf_top30 = cf(user_movies_watched, user_data_temp, idx)
            cb_top30 = cb(n, user_movies_watched, idx)

            cf_top30 = pd.DataFrame(cf_top30, columns=['movieId'])
            cb_top30 = pd.DataFrame(cb_top30, columns=['movieId'])

            movies_cb = np.shape(cb_top30)[0]
            movies_cf = np.shape(cf_top30)[0]

            # Rank score: 30 for the best entry, then 29, 28, ...
            # The lists are deliberately NOT reversed. Reversing them before
            # assigning these descending scores gave the highest score to the
            # worst-ranked entry of each list.
            # Blend: 70% collaborative filtering, 30% content-based.
            cf_top30['weight'] = np.arange(30, 30 - movies_cf, -1) * 0.7
            cb_top30['weight'] = np.arange(30, 30 - movies_cb, -1) * 0.3

            # create the hybrid list: a movie present in both lists gets the
            # sum of its two weighted scores
            hybrid_list = pd.concat([cf_top30, cb_top30])
            hybrid_list = hybrid_list.groupby(['movieId']).sum()

            # order the hybrid list by score (descending)
            hybrid_list = hybrid_list.sort_values(by=['weight'], ascending=False)

            # get the top n movies from the hybrid recommender
            hybrid_list = hybrid_list.index.tolist()[:n]

            # precision@n = share of the n recommendations that were held out
            hits = len(set(hybrid_list).intersection(set(user_movies_to_predict)))
            precision = hits / n

            users_precision[n].append(precision)

    # Append one line per n for this fold combination. `folds` is a tuple, so
    # its own commas end up in the line; the format is kept as is for whatever
    # already reads this file. Mode 'a' means re-running adds further lines.
    with open('test_hybrid_nn.csv', 'a') as f:
        for n in [3, 5, 10]:
            precision = np.array(users_precision[n])
            average = precision.mean()
            std = precision.std()
            f.write(f'{folds},{n},{average},{std}\n')
