# 1. Import & Setup

In [40]:
import pandas as pd
import numpy as np

## Load Data & Recommendations

In [41]:
cbf_recommendations = pd.read_csv('recommendations_for_all_players.csv')
cbf_recommendations.head()

Unnamed: 0,playerid,gameid,title,similarity_score,cluster
0,76561198060698936,239200,Amnesia: A Machine for Pigs,0.714217,12
1,76561198060698936,231160,The Swapper,0.708787,12
2,76561198060698936,365590,Tom Clancy’s The Division™,0.654243,14
3,76561198060698936,285900,Gang Beasts,0.648161,14
4,76561198060698936,481110,The Bunker,0.69339,5


In [42]:
games_feature_cluster_df = pd.read_csv('game_features_and_clusters.csv')
games_feature_cluster_df.head()

Unnamed: 0,gameid,release_age_days,base_price,avg_discount,weighted_sentiment,360 Video,Accounting,Action,Adventure,Animation & Modeling,...,Simulation,Software Training,Sports,Strategy,Tutorial,Utilities,Video Production,Violent,Web Publishing,cluster
0,3281560,-1.185347,-0.513495,-0.772341,-0.473848,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
1,3280930,-1.181805,-0.513495,-0.772341,-0.473848,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
2,3280770,-1.178263,-0.513495,-0.772341,-0.473848,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
3,3279790,-1.178263,-0.513495,-0.772341,-0.473848,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
4,3278320,-1.177378,-0.513495,-0.772341,-0.473848,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15


# 2. Calulate Catalog Coverage 

- Measures the proportion of the total game catalog that the recommender is capable of recommending. This evaluates the breadth of the recommendation systems' reach.
- Interpretation: 
    - Higher coverage -> More games out of the entire catalog are recommended to players. 
- Formula is given by:
    $$
    \text{Catalog Coverage} = \frac{|\text{Unique Games Recommended}|}{|\text{Total Games in Catalog}|} \times 100\%
    $$

- Since our content-based filtering approach also makes use of clustering to make recommendations, the catalog coverage for each cluster is also calculated.
    $$
    \text{Catalog Coverage per Cluster} = \frac{|\text{Unique Items Recommended in Cluster}|}{|\text{Total Games in Cluster}|} \times 100\%
    $$

In [43]:
def calculate_catalog_coverage(recs_df, games_df):
    # Total games in catalog
    total_games = games_df['gameid'].nunique()
    
    # Number of Unique games recommended
    recommended_games = recs_df['gameid'].nunique()
    
    # Overall coverage
    coverage = (recommended_games / total_games) * 100
    
    return coverage

In [44]:
def calulate_cluster_coverage(recs_df, games_df):
    cluster_coverage = []
        
    for cluster in sorted(games_df['cluster'].unique()):
        # Games in this cluster
        cluster_games = games_df[games_df['cluster'] == cluster]['gameid'].unique()
        
        # Recommended games from this cluster
        cluster_recs = recs_df[recs_df['cluster'] == cluster]['gameid'].unique()

        total_in_cluster = len(cluster_games)
        recommended_in_cluster = len(cluster_recs)
        unrecommended_in_cluster = total_in_cluster - recommended_in_cluster
        cluster_cov = (recommended_in_cluster / total_in_cluster)*100 if total_in_cluster > 0 else 0

        cluster_coverage.append({
            'total_games': total_in_cluster,
            'recommended_games': recommended_in_cluster,
            'unrecommended_games': unrecommended_in_cluster,
            'coverage (in %)': round(cluster_cov, 2)
        })
    cluster_coverage_df = pd.DataFrame(cluster_coverage)
    # Sort by coverage in ascending order
    cluster_coverage_df = cluster_coverage_df.sort_values('coverage (in %)', ascending=False).reset_index(drop=True)

    return cluster_coverage_df

In [49]:
# Overall Catalog Coverage
overall_coverage = calculate_catalog_coverage(cbf_recommendations, games_feature_cluster_df)
print(f"Overall Catalog Coverage: {overall_coverage:.2f}%")

# Cluster-wise Coverage
print("\nCluster-wise Coverage:")
cluster_coverage_df = calulate_cluster_coverage(cbf_recommendations, games_feature_cluster_df)
cluster_coverage_df

Overall Catalog Coverage: 3.34%

Cluster-wise Coverage:


Unnamed: 0,total_games,recommended_games,unrecommended_games,coverage (in %)
0,964,538,426,55.81
1,1243,554,689,44.57
2,1411,337,1074,23.88
3,1868,255,1613,13.65
4,2534,333,2201,13.14
5,2264,284,1980,12.54
6,1008,89,919,8.83
7,2378,153,2225,6.43
8,1483,67,1416,4.52
9,555,23,532,4.14


# 3. Calulate Distributional Coverage

- Measures how widely a recommendation system spreads its recommendations across all available items. It evaluates whether the system focuses only on popular items or offers a diverse range from the entire catalog.
- Interpretation:
    - Higher distributional coverage → Recommendations are more diverse and evenly spread across items.
    - Lower distributional coverage → Recommendations are concentrated on a small subset of items (less variety).

- Formula is given by: 
    $$
    DC = -\sum_{i=1}^{N} p(i)\log_2 p(i)
    $$
    where **$p(i)$** is the probability of recommending item **$i$**, estimated from its frequency in the recommendation lists.

In [46]:
def calculate_distributional_coverage(recs_df, games_df):
    
    # Count how many times each game was recommended
    item_counts = recs_df["gameid"].value_counts()
    
    # Convert counts to probabilities
    p = item_counts / item_counts.sum()
    
    # Compute entropy
    entropy = -np.sum(p * np.log2(p))

    # Normalize entropy
    total_games_in_catalog = games_df["gameid"].nunique()
    entropy /= np.log2(total_games_in_catalog)

    return round(entropy,3)

In [47]:
# Distributional Coverage
print("\nDistributional Coverage:")
print(calculate_distributional_coverage(cbf_recommendations, games_feature_cluster_df))


Distributional Coverage:
0.485


# 4. Overall Evaluation
- Catalog coverage = 3.34% -> Only a very small fraction of the total catalog is being recommended.
- Cluster coverage = 0.09% to 55.81% -> Some clusters dominate, some are barely represented.
- Distributional coverage = 0.485 ->  Among the items that are being recommended, the system distributes recommendations somewhat evenly.

The model intentionally sacrifices catalog breadth to prioritize relevance and efficiency, achieving strong internal diversity within recommended clusters. While the “rich-get-richer” effect is mitigated within these clusters, it persists across the broader catalog. This low coverage is a deliberate trade-off, resulting from aggressive filtering that limits recommendations to a user’s Top 3 Preferred Clusters, ensuring focused and highly relevant suggestions aligned with established user interests.