In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
# The four "ml-latest-small" CSV files share one URL prefix; keep it in a
# single place so switching mirror or dataset version is a one-line change.
DATA_URL = "https://github.com/tiagofassoni/useful-datasets/raw/main/ml-latest-small"

movies = pd.read_csv(f"{DATA_URL}/movies.csv")
ratings = pd.read_csv(f"{DATA_URL}/ratings.csv")
links = pd.read_csv(f"{DATA_URL}/links.csv")
tags = pd.read_csv(f"{DATA_URL}/tags.csv")

In [3]:
# Attach the movie columns to every rating row; the left join keeps all rows of `ratings`.
data = pd.merge(ratings, movies, how="left", on="movieId")

In [4]:
# User x title rating matrix: one row per userId, one column per title.
# (user, title) pairs without a rating come out as NaN.
movie_user = data.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

In [5]:
# Encode "not rated" as 0; the cells below treat any non-zero entry as a rating.
# Reassign instead of mutating in place so the step reads as an explicit transformation.
movie_user = movie_user.fillna(0)

In [6]:
# One row per non-zero entry of movie_user:
# column 0 = row position, column 1 = column position.
ratings_pos = pd.DataFrame(np.argwhere(movie_user.to_numpy()))
ratings_pos

In [7]:
# Hold out 20% of the rated positions for evaluation; fixed seed for reproducibility.
train_pos, test_pos = train_test_split(
    ratings_pos,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inspect the training positions: column 0 is the row position and column 1 the
# column position inside movie_user (both positional, not labels).
train_pos

Unnamed: 0,0,1
8344,56,5983
84029,533,8701
30407,211,5380
32375,220,3415
13938,88,7405
...,...,...
6265,41,8766
54886,363,5711
76820,479,6713
860,5,9271


In [9]:
def _ratings_at(full, positions):
    """Return a copy of `full` that keeps only the ratings at `positions`.

    Parameters
    ----------
    full : pandas.DataFrame
        Complete ratings matrix (0 = not rated).
    positions : pandas.DataFrame
        Two integer columns: row position and column position into `full`.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as `full`; every cell not listed in `positions` is 0.
        Columns are downcast to the smallest integer dtype when their values allow it.
    """
    values = full.to_numpy()
    rows, cols = positions.to_numpy().T
    kept = np.zeros(values.shape)
    # Fancy indexing copies all selected cells at once instead of one .iloc lookup per rating.
    kept[rows, cols] = values[rows, cols]
    return (
        pd.DataFrame(kept, columns=full.columns, index=full.index)
        .apply(pd.to_numeric, downcast='integer')
    )


# The train and test matrices have the full shape; each holds only its own share of the ratings.
train = _ratings_at(movie_user, train_pos)

test = _ratings_at(movie_user, test_pos)

In [10]:
# User-user cosine similarity computed from the training ratings only,
# labelled with user ids on both axes.
train_similarity = pd.DataFrame(
    data=cosine_similarity(train),
    index=movie_user.index,
    columns=movie_user.index,
)

In [11]:
def recommender(index_name, column_name, sim_df, data, fallback=9001):
    """Predict one rating as the similarity-weighted mean of the existing ratings.

    Parameters
    ----------
    index_name : label
        Row label in `sim_df` of the user to predict for.
    column_name : label
        Column label in `data` of the item to predict.
    sim_df : pandas.DataFrame
        Similarity matrix; the entries of row `index_name` must be in the same
        order as the rows of `data` (they are matched by position).
    data : pandas.DataFrame
        Ratings matrix in which 0 means "not rated".
    fallback : object, default 9001
        Value returned when the similarities of the users who rated the item
        sum to zero (which includes the case where nobody rated it).

    Returns
    -------
    float or `fallback`
        sum(rating * similarity) / sum(similarity) over the users with a
        non-zero rating for `column_name`.
    """
    ratings = data.loc[:, column_name].to_numpy()
    similarities = sim_df.loc[index_name, :].to_numpy()

    # Only users who actually rated the item take part in the average.
    rated = ratings != 0
    similarity_total = similarities[rated].sum()
    if similarity_total == 0:
        # No usable neighbour: a weighted mean is undefined.
        return fallback
    return (ratings[rated] * similarities[rated]).sum() / similarity_total

In [12]:
# Empty (all-zero, float) prediction table with the same labels as movie_user.
recommendations = pd.DataFrame(
    0.0,
    index=movie_user.index,
    columns=movie_user.columns,
)
recommendations

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Predict every (user, title) cell at once. This is the matrix form of calling
# recommender() per cell: for each user, sum(similarity * rating) over the users
# who rated the title, divided by the sum of their similarities.
ratings_matrix = train.to_numpy(dtype=float)
similarity_matrix = train_similarity.to_numpy(dtype=float)
rated_mask = (ratings_matrix != 0).astype(float)

weighted_sums = similarity_matrix @ ratings_matrix
similarity_sums = similarity_matrix @ rated_mask

# Cells with no usable neighbour keep 9001, the sentinel recommender() returns.
predicted = np.full(weighted_sums.shape, 9001.0)
np.divide(weighted_sums, similarity_sums, out=predicted, where=similarity_sums != 0)

recommendations = pd.DataFrame(predicted, index=train.index, columns=train.columns).round(1)
recommendations

In [None]:
# Print the predicted rating next to the held-out rating for every test position.
for row_pos, col_pos in test_pos.values:
    index, column = test.index[row_pos], test.columns[col_pos]
    prediction = round(recommender(index, column, train_similarity, train), 1)
    print(f'Prediction: {prediction}\t Real value: {test.loc[index,column]}')

In [None]:
# Predicted vs. held-out rating for every test position (positions are integer offsets).
predictions = [
    round(recommender(test.index[r], test.columns[c], train_similarity, train), 1)
    for r, c in test_pos.values
]
true_values = [test.iloc[r, c] for r, c in test_pos.values]

# Scatter of predictions against true ratings; the red diagonal marks perfect predictions.
plt.gca().set(title='Error analysis', xlabel='Predicted ratings', ylabel='True ratings')
sns.lineplot(x=[0,10], y=[0,10], color='red')
sns.scatterplot(x=predictions, y=true_values)
plt.show()

Other ways to evaluate the model: MSE, RMSE and MAE, each computed by hand and then cross-checked against scikit-learn.

In [None]:
# Side-by-side table of predicted and true ratings, one row per test position.
results = pd.DataFrame(dict(predictions=predictions, true_values=true_values))
results

In [None]:
# Manual mean squared error: sum of squared errors divided by the number of rows.
# The aggregations are given as a list (not a set) so the row order of mse_pre is deterministic.
mse_pre = (
    results
    .assign(diff=lambda x: (x.true_values - x.predictions) ** 2)
    .agg({'diff': ['sum', 'count']})
)
mse = mse_pre.loc['sum'] / mse_pre.loc['count']
mse

In [None]:
# Cross-check the manual MSE against scikit-learn
# (mean_squared_error is imported in the import cell at the top).
mean_squared_error(results.true_values, results.predictions)

In [None]:
# Manual root mean squared error: square root of the manual MSE.
rmse = mse_pre.loc['sum'].div(mse_pre.loc['count']).pow(0.5)
rmse

In [None]:
# Cross-check the manual RMSE: square root of scikit-learn's MSE.
mean_squared_error(results['true_values'], results['predictions']) ** 0.5

In [None]:
# Manual mean absolute error: sum of absolute errors divided by the number of rows.
# The aggregations are given as a list (not a set) so the row order of mae_pre is deterministic.
mae_pre = (
    results
    .assign(diff=lambda x: abs(x.true_values - x.predictions))
    .agg({'diff': ['sum', 'count']})
)
mae = mae_pre.loc['sum'] / mae_pre.loc['count']
mae

In [None]:
# Cross-check the manual MAE against scikit-learn
# (mean_absolute_error is already imported in the import cell at the top).
mean_absolute_error(results.true_values, results.predictions)

In [None]:
def predictions_analysis(y_true, preds):
    """Print MSE, RMSE and MAE for the predictions and plot them against the true values.

    Parameters
    ----------
    y_true : sequence of numbers
        Observed ratings.
    preds : sequence of numbers
        Predicted ratings, in the same order as `y_true`.

    Returns
    -------
    None
        The metrics are printed and the scatter plot is shown as side effects.
    """
    mse = mean_squared_error(y_true, preds)
    mae = mean_absolute_error(y_true, preds)
    print(
        f"""
        MSE: {mse}
        RMSE: {mse**0.5}
        MAE: {mae}
        """
    )

    # Scatter of predictions vs. truth; the red diagonal marks perfect predictions.
    ax = plt.gca()
    ax.set(title='Error analysis', xlabel='Predicted ratings', ylabel='True ratings')
    sns.lineplot(x=[0,10], y=[0,10], color='red', ax=ax)
    sns.scatterplot(x=preds, y=y_true, ax=ax)
    plt.show()

predictions_analysis(true_values, predictions)

In [None]:
# NOTE(review): this cell is a bare string literal, so nothing in it runs; it
# keeps a disabled draft of a top-n recommender (rec_iter1). The draft reads
# `user_similarities`, which is not defined in the visible cells (the similarity
# frame built in this notebook is `train_similarity`), so define or rename it
# before re-enabling the code.
"""
def rec_iter1(user_id, n):
    '''
    similarity matrix: has to be previously computed
    user_id: the user for which to provide recommendations
    n: the number of recommendations to provide
    '''
    
    # compute the weights for the inputed user
    weights = user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id])
    
    # select movies that the inputed user has not seen
    unseen_movies = movie_user.loc[movie_user.index!=user_id, movie_user.loc[user_id,:]==0]
    
    # multiply the unseen movies and the weights
    weighted_averages = unseen_movies.T.dot(weights)
    
    # get the top n movies
    recommendations = weighted_averages.sort_values(ascending=False).head(n).index.tolist()
    
    return recommendations
"""

In [None]:
# NOTE(review): this cell is a bare string literal, so nothing in it runs; it
# keeps a disabled draft of a recommender (rec_iter3) that shifts and
# renormalises the similarity weights before scoring unseen movies. The draft
# reads `similarity` and `imp_ratings`, neither of which is defined in the
# visible cells, so define them before re-enabling the code.
"""
def rec_iter3(user_id):
    # compute the weights for the inputed user
    weights = similarity.query('userId!=@user_id')[user_id] / sum(similarity.query('userId!=@user_id')[user_id])
    
    ## remove users who didn't rate movies


    #redistributing weights
    weights = weights + abs(min(weights))
    weights = weights - min(weights)
    if len(weights)>1 and sum(weights)!=0:
        weights=weights/sum(weights)
    elif sum(weights)==0:
        weights = weights/len(weights)

    # select movies that the inputed user has not seen
    unseen_movies = imp_ratings.loc[imp_ratings.index!=user_id, imp_ratings.loc[user_id,:]==0]
    
    # multiply the unseen movies and the weights
    weighted_averages = unseen_movies.T.dot(weights)
    
    # get the top n movies
    recommendations = weighted_averages.sort_values(ascending=False)
    
    return recommendations
"""