In [None]:
%pip install spacy > /dev/null
%pip install scikit-learn > /dev/null
%pip install pandas > /dev/null

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings = pd.read_csv("data/colab_ratings.csv", index_col=0)
ratings.fillna(0, inplace=True)
ratings

Unnamed: 0,Articulo1,Articulo2,Articulo3,Articulo4,Articulo5
Usuario1,5.0,3.0,0.0,1,0.0
Usuario2,4.0,0.0,0.0,1,0.0
Usuario3,1.0,1.0,0.0,5,0.0
Usuario4,1.0,0.0,0.0,4,0.0
Usuario5,0.0,1.0,5.0,4,0.0


In [3]:
user_similarity_df = pd.DataFrame(
    cosine_similarity(ratings), index=ratings.index, columns=ratings.index
)
user_similarity_df

Unnamed: 0,Usuario1,Usuario2,Usuario3,Usuario4,Usuario5
Usuario1,1.0,0.860916,0.42289,0.368964,0.182574
Usuario2,0.860916,1.0,0.420084,0.470588,0.149696
Usuario3,0.42289,0.420084,1.0,0.980196,0.62361
Usuario4,0.368964,0.470588,0.980196,1.0,0.598785
Usuario5,0.182574,0.149696,0.62361,0.598785,1.0


In [4]:
def calculate_recommendations(user_id: str) -> pd.Series:
    user = ratings.loc[user_id]
    articles_to_pred = user[user == 0].index
    similar_users = user_similarity_df.loc[user_id]

    predicted_ratings = {}

    for article in articles_to_pred:
        art_classif = ratings[article]
        art_classif.drop(user_id, inplace=True)
        similarity = similar_users.drop(user_id)

        # The numerator is the sum of the product of the ratings and the similarities
        # So if a user has a rating of 5 but is not similar to the user, the rating will less important
        num = (art_classif * similarity).sum()

        # The denominator is the sum of the similarities
        # We only want to consider the users that have rated the
        # article we are trying to predict
        den = similarity[art_classif != 0].sum()

        if den == 0:
            den = 0.0001

        predicted_ratings[article] = num / den
        predicted_ratings[article] = predicted_ratings[article]

    return pd.Series(predicted_ratings).sort_values(ascending=False)

In [5]:
for user in ratings.index:
    recommendations = calculate_recommendations(user)
    print(f"Recommendations for user {user}")
    print(recommendations)
    print("\n\n")

Recommendations for user Usuario1
Articulo3    5.0
Articulo5    0.0
dtype: float64



Recommendations for user Usuario2
Articulo3    5.000000
Articulo2    2.203492
Articulo5    0.000000
dtype: float64



Recommendations for user Usuario3
Articulo3    5.0
Articulo5    0.0
dtype: float64



Recommendations for user Usuario4
Articulo3    5.000000
Articulo2    1.378824
Articulo5    0.000000
dtype: float64



Recommendations for user Usuario5
Articulo1    1.758611
Articulo5    0.000000
dtype: float64





## Normalization of the users classification

In [6]:
ratings_mean = ratings[ratings != 0].mean(axis=1)
ratings_mean

Usuario1    3.000000
Usuario2    2.500000
Usuario3    2.333333
Usuario4    2.500000
Usuario5    3.333333
dtype: float64

In [7]:
ratings_normalized = ratings[ratings != 0].sub(ratings_mean, axis=0)
ratings_normalized

Unnamed: 0,Articulo1,Articulo2,Articulo3,Articulo4,Articulo5
Usuario1,2.0,0.0,,-2.0,
Usuario2,1.5,,,-1.5,
Usuario3,-1.333333,-1.333333,,2.666667,
Usuario4,-1.5,,,1.5,
Usuario5,,-2.333333,1.666667,0.666667,


In [8]:
def calculate_recommendations_normalized(user_id: str) -> pd.Series:
    user = ratings_normalized.loc[user_id]
    articles_to_pred = user[user.isna()].index
    similar_users = user_similarity_df.loc[user_id]

    predicted_ratings = {}

    for article in articles_to_pred:
        art_classif = ratings_normalized[article]
        art_classif.drop(user_id, inplace=True)
        similarity = similar_users.drop(user_id)

        # The numerator is the sum of the product of the ratings and the similarities
        # So if a user has a rating of 5 but is not similar to the user, the rating will less important
        num = (art_classif * similarity).sum()

        # The denominator is the sum of the similarities
        # We only want to consider the users that have rated the
        # article we are trying to predict
        den = similarity[art_classif != 0].sum()

        if den == 0:
            den = 0.0001

        predicted_ratings[article] = num / den
        predicted_ratings[article] = predicted_ratings[article]

    return pd.Series(predicted_ratings).sort_values(ascending=False)

In [9]:
for user in ratings.index:
    recommendations = calculate_recommendations(user)
    recommendations_normalized = calculate_recommendations_normalized(user)
    print(f"Recommendations for user {user}")
    print("Non normalized")
    print(recommendations)
    print("Normalized")
    print(recommendations_normalized)
    print("\n\n")

Recommendations for user Usuario1
Non normalized
Articulo3    5.0
Articulo5    0.0
dtype: float64
Normalized
Articulo3    0.165795
Articulo5    0.000000
dtype: float64



Recommendations for user Usuario2
Non normalized
Articulo3    5.000000
Articulo2    2.203492
Articulo5    0.000000
dtype: float64
Normalized
Articulo3    0.131224
Articulo5    0.000000
Articulo2   -0.874116
dtype: float64



Recommendations for user Usuario3
Non normalized
Articulo3    5.0
Articulo5    0.0
dtype: float64
Normalized
Articulo3    0.424783
Articulo5    0.000000
dtype: float64



Recommendations for user Usuario4
Non normalized
Articulo3    5.000000
Articulo2    1.378824
Articulo5    0.000000
dtype: float64
Normalized
Articulo3    0.412636
Articulo5    0.000000
Articulo2   -1.319347
dtype: float64



Recommendations for user Usuario5
Non normalized
Articulo1    1.758611
Articulo5    0.000000
dtype: float64
Normalized
Articulo5    0.000000
Articulo1   -0.733254
dtype: float64



