In [None]:
%run Preprocessing_v2.ipynb

# Create TF-IDF matrix from preprocessed tag columns
from sklearn.feature_extraction.text import TfidfTransformer

# Work on a private copy so the frame produced by the preprocessing notebook is never touched.
games_processed_all = games_processed.copy()

# Every column that is not listed here as metadata is treated as an encoded tag column.
metadata_columns = ["app_id", "title", "date_release", "rating", "user_reviews", "price_final"]
tag_columns = games_processed_all.columns.difference(metadata_columns)
content_matrix = games_processed_all.loc[:, tag_columns]

# Re-weight the encoded tag matrix with TF-IDF.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(content_matrix)

# One dense TF-IDF row per game, indexed by app_id, with one column per tag.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=games_processed_all["app_id"],
    columns=content_matrix.columns,
)

print("✅ tfidf_df successfully created.")

In [None]:

# Build user profiles using the training set and the TF-IDF tag vectors.
# A profile is the mean TF-IDF vector of the games a user has in df_train,
# stored as a (1, n_tags) array so it can be passed straight to cosine_similarity.

user_profiles_tfidf = {}
# A single groupby pass replaces one full boolean scan of df_train per user.
# sort=False keeps users in order of first appearance, like iterating over unique().
for user_id, app_ids in df_train.groupby("user_id", sort=False)["app_id"]:
    # Ignore interactions with games that have no TF-IDF row.
    valid_ids = app_ids[app_ids.isin(tfidf_df.index)]
    vectors = tfidf_df.loc[valid_ids]
    if not vectors.empty:
        user_profiles_tfidf[user_id] = vectors.mean(axis=0).values.reshape(1, -1)

print(f"✅ Built TF-IDF profiles for {len(user_profiles_tfidf)} users.")


In [None]:

# ✅ recommend_tfidf_content with fallback for cold-start users

def recommend_tfidf_content(user_id, top_n=10):
    """Recommend games for a user from TF-IDF tag similarity.

    Parameters:
    - user_id: id of the user to recommend for
    - top_n: maximum number of games to return

    Returns:
    - DataFrame with columns ["user_id", "app_id", "title"].
      For users with a TF-IDF profile the rows are ordered from most to least
      similar and exclude games the user already has in df_train.
      For users without a profile (cold start) the rows are the top_n games
      with the highest "user_reviews" value.
    """
    if user_id not in user_profiles_tfidf:
        fallback = games_processed_all.sort_values(by="user_reviews", ascending=False).head(top_n)
        recs = fallback[["app_id", "title"]].copy()
        recs.insert(0, "user_id", user_id)
        return recs

    # Imported locally so the function does not rely on a later cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    user_vector = user_profiles_tfidf[user_id]
    similarities = cosine_similarity(user_vector, tfidf_df.values).flatten()

    played = set(df_train.loc[df_train["user_id"] == user_id, "app_id"])

    # Walk the games from most to least similar, skipping played games and
    # repeated ids, and stop as soon as top_n candidates are collected.
    recommended_ids = []
    seen = set(played)
    for position in similarities.argsort()[::-1]:
        if len(recommended_ids) >= top_n:
            break
        app_id = tfidf_df.index[position]
        if app_id in seen:
            continue
        seen.add(app_id)
        recommended_ids.append(app_id)

    # isin() returns rows in catalogue order, so restore the similarity ranking
    # explicitly; rank-sensitive metrics depend on the row order.
    candidates = games_processed_all[games_processed_all["app_id"].isin(recommended_ids)][["app_id", "title"]]
    rank_of = {app_id: rank for rank, app_id in enumerate(recommended_ids)}
    ordering = candidates["app_id"].map(rank_of).to_numpy().argsort(kind="stable")
    recs = candidates.iloc[ordering].copy()
    recs.insert(0, "user_id", user_id)
    return recs

print("✅ Recommender supports fallback for cold users and can be evaluated.")


In [None]:

# 🔍 Demo: view a user's TF-IDF profile and their top 5 content-based recommendations
# (users without a TF-IDF profile get the cold-start fallback instead)

sample_user_id = 20525  # Change this to test any user

# Both paths go through recommend_tfidf_content, so the demo shows exactly what
# the recommender returns instead of re-implementing its logic here.
recommendations = recommend_tfidf_content(sample_user_id, top_n=5)

if sample_user_id in user_profiles_tfidf:
    # ✅ TF-IDF path
    user_vector = user_profiles_tfidf[sample_user_id]
    recommended_games = recommendations[["app_id", "title"]].reset_index(drop=True)

    # Show user's top tags (highest mean TF-IDF weight first)
    user_tag_scores = pd.Series(user_vector.flatten(), index=tfidf_df.columns).sort_values(ascending=False)
    top_user_tags = user_tag_scores.head(10)

    print(f"🧑‍💻 TF-IDF Recommendation for User ID: {sample_user_id}")
    print("\n🔝 Top Tags for this User:")
    display(top_user_tags)

    print("\n🎮 Top 5 Game Recommendations:")
    display(recommended_games)

else:
    # 🧊 Fallback for cold start user
    print(f"🧊 Fallback Recommendation for Cold Start User ID: {sample_user_id}")
    display(recommendations)


In [None]:
# ✅ Autoencoder-based Content-Based Recommender (Tag Embedding)

from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Use tag matrix from already encoded tags
# NOTE(review): `tag_cols` is not defined anywhere in this notebook (the TF-IDF cell
# builds `tag_columns`); presumably it comes from Preprocessing_v2.ipynb — confirm.
tag_matrix = games_processed_all[tag_cols].values
app_ids = games_processed_all["app_id"].values

# Step 2: Train a shallow autoencoder (tags -> 50 hidden units -> tags).
# The activation is spelled out because Step 3 re-applies it by hand.
autoencoder = MLPRegressor(hidden_layer_sizes=(50,), activation="relu", max_iter=2000, random_state=42)
autoencoder.fit(tag_matrix, tag_matrix)

# Step 3: Get latent representations (encoded tag vectors).
# predict() returns the output layer, i.e. the reconstructed tag vector, not the
# bottleneck. The 50-dimensional code is the hidden layer's ReLU activation,
# computed from the first weight matrix and bias of the fitted network.
hidden_pre_activation = tag_matrix.astype(float) @ autoencoder.coefs_[0] + autoencoder.intercepts_[0]
encoded_vectors = hidden_pre_activation.clip(min=0)  # ReLU
latent_df = pd.DataFrame(encoded_vectors, index=app_ids)


# Step 4: Build user profiles in latent space (mean latent vector of the user's games).
user_profiles_autoenc = {}
# A single groupby pass replaces one full boolean scan of df_train per user.
# sort=False keeps users in order of first appearance, like iterating over unique().
for user_id, app_ids_user in df_train.groupby("user_id", sort=False)["app_id"]:
    valid_ids = app_ids_user[app_ids_user.isin(latent_df.index)]
    if valid_ids.empty:
        # None of this user's games has a latent vector, so no profile is built.
        continue
    user_vector = latent_df.loc[valid_ids].mean(axis=0)
    if not user_vector.isna().any():
        user_profiles_autoenc[user_id] = user_vector.values.reshape(1, -1)

# Step 5: Recommendation function

def recommend_autoencoder_content(user_id, top_n=10):
    """Recommend games for a user from similarity in the autoencoder vector space.

    Parameters:
    - user_id: id of the user to recommend for
    - top_n: maximum number of games to return

    Returns:
    - DataFrame with columns ["user_id", "app_id", "title"].
      For users with a profile in user_profiles_autoenc the rows are ordered
      from most to least similar and exclude games the user already has in
      df_train. For users without a profile (cold start) the rows are the
      top_n games with the highest "user_reviews" value.
    """
    if user_id not in user_profiles_autoenc:
        fallback = games_processed_all.sort_values(by="user_reviews", ascending=False).head(top_n)
        recs = fallback[["app_id", "title"]].copy()
        recs.insert(0, "user_id", user_id)
        return recs

    user_vector = user_profiles_autoenc[user_id]
    similarities = cosine_similarity(user_vector, latent_df.values).flatten()

    played = set(df_train.loc[df_train["user_id"] == user_id, "app_id"])

    # Walk the games from most to least similar, skipping played games and
    # repeated ids, and stop as soon as top_n candidates are collected.
    recommended_ids = []
    seen = set(played)
    for position in similarities.argsort()[::-1]:
        if len(recommended_ids) >= top_n:
            break
        app_id = latent_df.index[position]
        if app_id in seen:
            continue
        seen.add(app_id)
        recommended_ids.append(app_id)

    # isin() returns rows in catalogue order, so restore the similarity ranking
    # explicitly; rank-sensitive metrics depend on the row order.
    candidates = games_processed_all[games_processed_all["app_id"].isin(recommended_ids)][["app_id", "title"]]
    rank_of = {app_id: rank for rank, app_id in enumerate(recommended_ids)}
    ordering = candidates["app_id"].map(rank_of).to_numpy().argsort(kind="stable")
    recs = candidates.iloc[ordering].copy()
    recs.insert(0, "user_id", user_id)
    return recs



In [None]:
# 🔍 Demo: Recommend games for any user using Autoencoder (with fallback)

sample_user_id = 20525  # Change this to test another user

# The recommender handles both cases itself: profile-based ranking or the fallback.
recommendations = recommend_autoencoder_content(sample_user_id, top_n=5)

if sample_user_id not in user_profiles_autoenc:
    print(f"\U0001f9ca Fallback for Cold Start User ID: {sample_user_id}")
else:
    print(f"\U0001f9e0 Autoencoder-Based Recommendations for User ID: {sample_user_id}")

    # Label every component of the user's profile vector, then keep the 10 largest.
    user_vector = user_profiles_autoenc[sample_user_id]
    latent_labels = [f"latent_{i}" for i in range(user_vector.shape[1])]
    user_latent_scores = pd.Series(user_vector.flatten(), index=latent_labels)
    top_latent_features = user_latent_scores.sort_values(ascending=False).head(10)

    print("\n🔝 Top Latent Dimensions for this User (Autoencoder):")
    display(top_latent_features)

display(recommendations)

In [None]:



from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import defaultdict
import numpy as np

def evaluate_recommender(model_function, df_test, k=10):
    """
    Evaluate a recommender with per-user top-k metrics averaged over users.

    For every user in df_test the recommender is asked for k items. The first k
    distinct recommended app_ids are compared with the user's test items:

    - precision    = hits / number of recommended items
    - recall       = hits / number of the user's test items
    - f1           = harmonic mean of precision and recall (0 when there are no hits)
    - accuracy     = share of recommended items that are relevant; because every
                     recommended item counts as a positive prediction this is the
                     same number as precision (kept for backward compatibility)
    - precision@k  = hits / k
    - ndcg@k       = DCG of the recommended order / DCG of an ideal order

    Users with no test items, or for whom the recommender returns nothing, are skipped.

    Parameters:
    - model_function: recommendation function called as model_function(user_id, top_n=k);
      must return a DataFrame with an "app_id" column ordered best-first
    - df_test: test set with user_id and app_id columns
    - k: number of recommendations per user (positive integer)

    Returns:
    - metrics: dictionary with "Users Evaluated" and the averaged scores, rounded
      to 4 decimals. All averages are 0.0 when no user could be evaluated.

    Raises:
    - ValueError: if k is not positive
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    precision_scores = []
    recall_scores = []
    f1_scores = []
    precision_at_k_scores = []
    ndcg_scores = []

    for user_id, group in df_test.groupby("user_id"):
        true_items = set(group["app_id"].dropna())
        if not true_items:
            continue

        recs = model_function(user_id, top_n=k)
        # Keep the first k distinct ids in ranked order so that no metric can
        # exceed 1 when a recommender repeats an item or returns more than k rows.
        predicted_items = list(dict.fromkeys(recs["app_id"].dropna()))[:k]
        if not predicted_items:
            continue

        relevance = [1 if app_id in true_items else 0 for app_id in predicted_items]
        hits = sum(relevance)

        precision = hits / len(predicted_items)
        recall = hits / len(true_items)
        f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        precision_at_k_scores.append(hits / k)

        # NDCG@k: discount each hit by log2(rank + 1), with ranks starting at 1.
        dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance))
        idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(true_items), k)))
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0.0)

    def _average(values):
        # Mean rounded to 4 decimals; 0.0 instead of NaN when nothing was evaluated.
        return round(float(np.mean(values)), 4) if values else 0.0

    metrics = {
        "Users Evaluated": len(precision_scores),
        "Avg Precision": _average(precision_scores),
        "Avg Recall": _average(recall_scores),
        "Avg F1": _average(f1_scores),
        "Avg Accuracy": _average(precision_scores),
        "Avg Precision@k": _average(precision_at_k_scores),
        "Avg NDCG@k": _average(ndcg_scores),
    }

    return metrics
# Score the autoencoder recommender on the expanded test split and list every metric.
results = evaluate_recommender(recommend_autoencoder_content, df_test_expanded, k=10)
print("✅ Evaluation Results (Autoencoder Recommender):")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]}")