In [1]:
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Single seed used for NumPy's global RNG here and passed as `random_state`
# to the split/shuffle calls further down, so repeated runs are reproducible.
SEED = 42
np.random.seed(SEED)

## Data Preprocessing

In [3]:
# Load movies.csv and ratings.csv from the shared dataset folder.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('../Dataset/movies.csv', '../Dataset/ratings.csv')
)

In [4]:
# Work on an independent copy so the raw `movies` frame is not modified by the
# preprocessing steps applied to `movies_data`.
movies_data = movies.copy(deep=True)

In [5]:
# Pull the 4-digit release year "(YYYY)" out of the title into a numeric column.
# Raw strings keep `\(` and `\d` as regex escapes rather than invalid Python
# string escapes; titles without a year get NaN and are dropped further down.
movies_data['year'] = pd.to_numeric(
    movies_data['title'].str.extract(r"\((\d{4})\)", expand=False)
)

# Strip only a genuine trailing " (YYYY)" suffix. A blind `[:-7]` slice would also
# cut seven characters off titles that carry no year at all.
movies_data['title'] = movies_data['title'].str.replace(
    r"\s*\(\d{4}\)\s*$", "", regex=True
)

# one hot encoding for genres (one 0/1 column per '|'-separated genre)
genre_dummies = movies_data['genres'].str.get_dummies(sep='|')
movies_data = movies_data.join(genre_dummies)

# create a TF-IDF vectorizer for the titles and insert into the movies dataframe.
# The TF-IDF columns keep their default integer labels (0..vocab-1), so they cannot
# collide with the string-named 'year' / genre columns.
tfidf = TfidfVectorizer(stop_words='english')
movies_data['title'] = movies_data['title'].str.lower()
tfidf_matrix = tfidf.fit_transform(movies_data['title'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=movies_data.index)
movies_data = pd.concat(
    [movies_data, tfidf_df], axis=1, join='inner'
).drop(['genres', 'title'], axis=1)

# remove movies with the genre 'no genres listed'
# (guarded: the dummy column only exists if at least one movie has that genre)
if '(no genres listed)' in movies_data.columns:
    movies_data = movies_data[movies_data['(no genres listed)'] == 0]
    movies_data = movies_data.drop('(no genres listed)', axis=1)

# remove movies with no year
movies_data = movies_data[movies_data['year'].notna()]

In [6]:
# Keep only movies that appear in the ratings, then use movieId as the row index
# (the movieId column itself is removed from the feature columns).
rated_movie_ids = ratings['movieId'].unique()
movies_data = (
    movies_data
    .loc[movies_data['movieId'].isin(rated_movie_ids)]
    .set_index('movieId')
)

In [7]:
# Drop ratings of movies that did not survive the movie preprocessing
# (movies_data is indexed by movieId at this point).
has_known_movie = ratings['movieId'].isin(movies_data.index)
ratings = ratings.loc[has_known_movie]

In [8]:
# create a ratings matrix (rows: userId, columns: movieId, values: rating)
ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')

# get the number of ratings for each user, most active users first
# (counted before the NaNs are filled, so only real ratings are counted)
user_counts = pd.DataFrame(ratings_matrix.count(axis=1), columns=['count'])
user_counts['userId'] = user_counts.index
user_counts = user_counts.sort_values('count', ascending=False)

# Fill NaNs with 0 and order the users by number of ratings (descending).
# A single reindex on user_counts.index is enough: it fully determines the final
# row order, so no separate "sort by non-zero count" pass is needed beforehand.
ratings_matrix = ratings_matrix.fillna(0).reindex(user_counts.index)

# split into (50/20/30) train/(validation/test folds)
# shuffle=False keeps the activity ordering, so the first (most active) half of the
# users becomes `train` and the remaining half is later cut into 5 folds
# (2 folds = 20% validation, 3 folds = 30% test).
train, val_test_folds = train_test_split(ratings_matrix, test_size=0.5, random_state=SEED, shuffle=False)

In [9]:
# Every way of picking 2 of the 5 folds for validation, in lexicographic order.
# test_combinations[i] is the complementary 3 folds of val_combinations[i], so the
# two lists are always consistent with each other by construction.
fold_ids = range(5)
val_combinations = list(combinations(fold_ids, 2))
test_combinations = [
    tuple(fold_id for fold_id in fold_ids if fold_id not in val_folds)
    for val_folds in val_combinations
]

In [10]:
# shuffle the val_test users (seeded, so the fold membership is reproducible)
shuffled_val_test = val_test_folds.sample(frac=1, random_state=SEED)

# split into 5 folds for cross validation
# Slicing with iloc reproduces np.array_split's sizing (the first `extra` folds get
# one more row) without routing a DataFrame through np.array_split, which relies on
# the deprecated DataFrame.swapaxes.
# NOTE: this rebinds `val_test_folds` from a DataFrame to a list of DataFrames, so
# the cell must not be re-run without re-running the split cell above first.
n_folds = 5
base_size, extra = divmod(len(shuffled_val_test), n_folds)
val_test_folds = []
fold_start = 0
for fold_idx in range(n_folds):
    fold_stop = fold_start + base_size + (1 if fold_idx < extra else 0)
    val_test_folds.append(shuffled_val_test.iloc[fold_start:fold_stop])
    fold_start = fold_stop

## Processing: Top-N Popularity Baseline

In [11]:
# Rank every movie by its mean rating over the train users, best first.
# The full ranking is kept; the top-n recommendations are sliced from its head later.
# Unrated entries were filled with 0 in the ratings matrix, so they count towards the mean.
mean_rating_by_movie = train.mean(axis=0)
top_movies = mean_rating_by_movie.sort_values(ascending=False)

In [12]:
# Evaluate the popularity baseline: for every combination of test folds, recommend the
# highest-ranked movies of `top_movies` that a user has not watched and score precision@n.
cutoffs = [3, 5, 10]

# Group the ratings once so each user's rows can be fetched directly instead of
# rescanning the whole ratings frame for every user of every fold combination.
ratings_by_user = ratings.groupby('userId')

for folds in test_combinations:
    # create the test fold
    test_folds = [val_test_folds[i] for i in folds]
    test_set = pd.concat(test_folds)

    # precision of every test user, keyed by cutoff n
    users_precision = {n: [] for n in cutoffs}

    # for each user
    for user in test_set.index:
        user_row = ratings_matrix.loc[user]
        rated_movies = user_row[user_row > 0].index.tolist()

        # order the movies by timestamp (newest first)
        # NOTE(review): with ascending=False the held-out slice `[-n:]` below is the
        # user's n OLDEST ratings, not the most recent ones; confirm this is the
        # intended evaluation protocol before changing it.
        user_ratings = ratings_by_user.get_group(user)
        rated_movies = user_ratings.loc[
            user_ratings['movieId'].isin(rated_movies)
        ].sort_values(by=['timestamp'], ascending=False)['movieId'].tolist()

        for n in cutoffs:
            # hold out the last n movies of the ordered list; the rest count as watched
            user_movies_watched = rated_movies[:-n]
            user_movies_to_predict = rated_movies[-n:]

            # top n movies of the top_movies which the user hasn't watched
            top_movies_user = top_movies[
                ~top_movies.index.isin(user_movies_watched)
            ].index.tolist()[:n]

            # number of hits
            hits = len(set(top_movies_user).intersection(user_movies_to_predict))

            # precision@n: hits over the n recommendations made. Dividing by a fixed 10
            # would under-report precision@3 and precision@5.
            precision = hits / n
            users_precision[n].append(precision)

    # save in a file csv - the average and std for each n
    # ('a' appends: re-running this cell adds new rows after the existing ones)
    with open('topN.csv', 'a') as f:
        for n in cutoffs:
            precision = np.array(users_precision[n])
            average = precision.mean()
            std = precision.std()
            f.write(f'{folds},{n},{average},{std}\n')