In [None]:
%pip install spacy > /dev/null
%pip install scikit-learn > /dev/null
%pip install pandas > /dev/null

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.es.stop_words import STOP_WORDS as STOP_WORDS_ES

In [2]:
# Load the user-by-article rating matrix. The first CSV column supplies the
# row labels (user ids); empty cells are read as NaN.
ratings = pd.read_csv(
    "data/colab_ratings.csv",
    index_col=0,
)
ratings

Unnamed: 0,Articulo1,Articulo2,Articulo3,Articulo4,Articulo5
Usuario1,5.0,3.0,,1,
Usuario2,4.0,,,1,
Usuario3,1.0,1.0,,5,
Usuario4,1.0,,,4,
Usuario5,,1.0,5.0,4,


In [3]:
# Load the article catalogue and build the free-text field used by the
# content-based part: description and keywords joined by a single space.
products = pd.read_csv("data/content_data.csv")
# fillna("") keeps one missing field from turning the whole "Content" cell
# into NaN, which the TF-IDF step further down cannot vectorise.
products["Content"] = (
    products["Description"].fillna("") + " " + products["Keywords"].fillna("")
)
products

Unnamed: 0,ArticleID,Title,Description,Keywords,Content
0,Articulo1,Aprendiendo Python,"Este libro cubre los fundamentos de Python, in...",Python programación desarrollo,"Este libro cubre los fundamentos de Python, in..."
1,Articulo2,Introducción a Machine Learning,Un curso sobre los conceptos básicos de Machin...,Machine Learning IA algoritmos,Un curso sobre los conceptos básicos de Machin...
2,Articulo3,Guía de Data Science,Explora técnicas de Data Science y cómo trabaj...,Data Science análisis datos,Explora técnicas de Data Science y cómo trabaj...
3,Articulo4,Aprende programación en R,Un tutorial para comenzar a programar en R par...,R estadística programación,Un tutorial para comenzar a programar en R par...
4,Articulo5,Deep Learning avanzado,Profundiza en técnicas avanzadas de Deep Learn...,Deep Learning redes neuronales,Profundiza en técnicas avanzadas de Deep Learn...


In [4]:
# Info for the collaborative filtering part:
# centre every user's ratings on that user's own mean, replace the missing
# (unrated) entries with 0 after centring, then compare users pairwise with
# cosine similarity.
user_means = ratings.mean(axis="columns")
df_normalized = ratings.subtract(user_means, axis="index")
df_normalized_filled = df_normalized.fillna(value=0)
user_similarity = cosine_similarity(df_normalized_filled)
# Re-attach the user ids so similarities can be looked up by label.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=ratings.index,
    columns=ratings.index,
)

In [5]:
# Info for the content-based filtering part:
# turn each article's "Content" text into a TF-IDF vector (ignoring the
# Spanish stop words) and compute the article-to-article cosine similarity.
tfidf = TfidfVectorizer(
    stop_words=[*STOP_WORDS_ES],  # passed as a list, as in the original call
)
tfidf_matrix = tfidf.fit_transform(
    products["Content"],
)
cosine_sim = cosine_similarity(tfidf_matrix)

In [6]:
# Peek at one user's row: NaN marks the articles this user has not rated.
ratings.loc["Usuario1", :]

Articulo1    5.0
Articulo2    3.0
Articulo3    NaN
Articulo4    1.0
Articulo5    NaN
Name: Usuario1, dtype: float64

In [18]:
def predict_ratings(user_id, df_ratings, user_similarity_df):
    """Predict ratings for the items ``user_id`` has not rated yet.

    User-based collaborative filtering on mean-centred ratings: for each
    unrated item, the other users' deviations from their own mean rating are
    averaged using their similarity to ``user_id`` as weight (normalised by
    the sum of absolute similarities), and the target user's mean rating is
    added back.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``df_ratings`` and column label in
        ``user_similarity_df``.
    df_ratings : pandas.DataFrame
        Rating matrix with one row per user and one column per item; NaN
        means "not rated".
    user_similarity_df : pandas.DataFrame
        User-to-user similarity matrix labelled with the same user ids.

    Returns
    -------
    dict
        ``{item: predicted_rating}`` for every unrated item that can be
        predicted. Items nobody has rated, and items whose raters all have
        zero similarity to ``user_id``, are omitted.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``df_ratings`` or
        ``user_similarity_df``.
    """
    user_ratings = df_ratings.loc[user_id]
    user_mean = user_ratings.mean()
    items_to_predict = user_ratings[user_ratings.isna()].index
    user_similarities = user_similarity_df[user_id]

    # Loop-invariant: every user's mean rating is needed for each item, so
    # compute it once instead of once per item.
    all_user_means = df_ratings.mean(axis=1)

    predictions = {}
    for item in items_to_predict:
        item_ratings = df_ratings[item]
        valid_ratings = item_ratings[item_ratings.notna()]
        if valid_ratings.empty:
            # Nobody rated this item: nothing to base a prediction on.
            continue

        raters = valid_ratings.index
        valid_similarities = user_similarities[raters]
        total_weight = valid_similarities.abs().sum()
        if total_weight == 0:
            # All raters are unrelated to the target user; dividing would
            # yield 0/0 = NaN, so treat the item as unpredictable instead.
            continue

        deviations = valid_ratings - all_user_means[raters]
        weighted_deviation = np.dot(
            valid_similarities.to_numpy(), deviations.to_numpy()
        )
        predictions[item] = weighted_deviation / total_weight + user_mean
    return predictions

In [56]:
def content_based_recommendations(user_id, df_ratings, df_items, cosine_sim):
    """Score the articles a user has not rated by content similarity.

    Each unrated article's score is the mean of its similarities to the
    articles the user has already rated.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``df_ratings``.
    df_ratings : pandas.DataFrame
        Rating matrix (rows: users, columns: article ids); NaN means
        "not rated".
    df_items : pandas.DataFrame
        Article catalogue with an ``"ArticleID"`` column. Row *i* of
        ``df_items`` must correspond to row/column *i* of ``cosine_sim``.
    cosine_sim : array-like of shape (n_items, n_items)
        Article-to-article similarity matrix.

    Returns
    -------
    pandas.Series
        Float scores indexed by ``ArticleID`` for the catalogue articles the
        user has not rated. If the user has rated no catalogue article, every
        article is returned with a neutral score of ``0.0``.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``df_ratings``.
    """
    user_ratings = df_ratings.loc[user_id]
    rated_items = user_ratings[user_ratings.notna()].index

    catalog_index = pd.Index(df_items["ArticleID"])
    # Positional boolean mask: cosine_sim is addressed by row position, so
    # the (possibly non-default) index labels of df_items must not be used.
    rated_mask = df_items["ArticleID"].isin(rated_items).to_numpy()

    if not rated_mask.any():
        # Nothing to compare against: keep the return type consistent (scores
        # indexed by ArticleID) so callers can still do arithmetic on it.
        return pd.Series(0.0, index=catalog_index)

    mean_similarity = cosine_sim[rated_mask].mean(axis=0)
    sim_scores = pd.Series(mean_similarity, index=catalog_index)

    # Keep only unrated articles. Masking (rather than dropping by label)
    # also tolerates rated items that are missing from the catalogue.
    return sim_scores[~rated_mask]


def hybrid_recommendations(
    user_id,
    df_ratings,
    df_items,
    user_similarity_df,
    cosine_sim,
    alpha=0.5,
    n_recommendations=5,
):
    """Blend collaborative and content-based scores into a top-N list.

    The combined score of an article is
    ``alpha * cf_prediction + (1 - alpha) * content_score``; a component that
    is missing for an article contributes 0.

    Parameters
    ----------
    user_id : label
        Target user; forwarded to both scoring functions.
    df_ratings : pandas.DataFrame
        Rating matrix (rows: users, columns: article ids).
    df_items : pandas.DataFrame
        Article catalogue, forwarded to ``content_based_recommendations``.
    user_similarity_df : pandas.DataFrame
        User-to-user similarities, forwarded to ``predict_ratings``.
    cosine_sim : array-like
        Article-to-article similarities, forwarded to
        ``content_based_recommendations``.
    alpha : float, default 0.5
        Weight of the collaborative-filtering prediction; the content score
        gets ``1 - alpha``.
    n_recommendations : int, default 5
        Maximum number of articles to return.

    Returns
    -------
    pandas.Series
        Combined scores indexed by article id, sorted in descending order and
        truncated to ``n_recommendations`` entries.
    """
    cf_predictions = predict_ratings(user_id, df_ratings, user_similarity_df)
    # Explicit float dtype: with no predictions, pd.Series({}) would be an
    # object-dtype Series and taint the dtype of the combined scores.
    cf_series = pd.Series(cf_predictions, dtype=float)

    cb_scores = content_based_recommendations(user_id, df_ratings, df_items, cosine_sim)

    # NOTE(review): the CF term is a predicted rating while the content term
    # is an average similarity, so the two are not on a common scale and
    # `alpha` is not a pure mixing proportion -- confirm this is intended.
    weighted_cf = alpha * cf_series
    weighted_cb = (1 - alpha) * cb_scores

    # fill_value=0 lets an article that only has one of the two scores still
    # be ranked instead of becoming NaN.
    combined_scores = weighted_cb.add(weighted_cf, fill_value=0)

    return combined_scores.sort_values(ascending=False).head(n_recommendations)

In [57]:
# Print the top hybrid recommendations for every user in the rating matrix.
for user in ratings.index:
    recommendations = hybrid_recommendations(
        user_id=user,
        df_ratings=ratings,
        df_items=products,
        user_similarity_df=user_similarity_df,
        cosine_sim=cosine_sim,
        alpha=0.5,  # weight of the collaborative-filtering score
        n_recommendations=5,
    )

    print(f"Recommendations for {user}")
    print(recommendations)
    print("\n\n")

Recomendations for Usuario1
Articulo3    0.694845
Articulo5    0.029426
dtype: float64



Recomendations for Usuario2
Articulo2    1.627151
Articulo3    0.458934
Articulo5    0.000000
dtype: float64



Recomendations for Usuario3
Articulo3    2.028178
Articulo5    0.029426
dtype: float64



Recomendations for Usuario4
Articulo3    2.125601
Articulo2    0.872849
Articulo5    0.000000
dtype: float64



Recomendations for Usuario5
Articulo1    0.947944
Articulo5    0.036430
dtype: float64



