In [None]:
import pandas as pd
import numpy as np

In [27]:
review_df=pd.read_csv("./Processed Data/sentiment_reviews_18oct.csv")
purchase_df=pd.read_csv("./Processed Data/3_purchase_features.csv")

In [31]:
#unique player ids
player_ids_collab = review_df['playerid'].unique().tolist()
player_ids_content = purchase_df['playerid'].unique().tolist()

# common player ids
common_player_ids = set(player_ids_collab).intersection(set(player_ids_content))
print(f"Number of common player ids: {len(common_player_ids)}")
only_in_collab = set(player_ids_collab) - common_player_ids
only_in_content = set(player_ids_content) - common_player_ids
print(f"Number of player ids only in collaborative df: {len(only_in_collab)}")
print(f"Number of player ids only in content df: {len(only_in_content)}")


Number of common player ids: 16241
Number of player ids only in collaborative df: 49416
Number of player ids only in content df: 86307


In [15]:
collaborative_df = pd.read_csv('all_players_top10_recommendations.csv')
content_df = pd.read_csv('recommendations_for_all_players.csv')

In [16]:
display(collaborative_df.head())

Unnamed: 0,Player ID,Rank,Game ID,Title
0,76561197960266642,1,400040,ShareX
1,76561197960266642,2,2609610,CONVRGENCE
2,76561197960266642,3,1919460,Seraph's Last Stand
3,76561197960266642,4,822240,Animal Jam
4,76561197960266642,5,900040,ELEX II


In [None]:
# Ensure similarity_score is numeric, then rank based on similarity_score
content_df['similarity_score'] = pd.to_numeric(content_df['similarity_score'])
content_df['rank'] = content_df.groupby('playerid')['similarity_score'].rank(ascending=False, method='first').astype(int)
ranked_content_df = content_df[['playerid', 'gameid', 'title', 'rank']]
print(ranked_content_df.head())


            playerid  gameid                        title  rank
0  76561198060698936  239200  Amnesia: A Machine for Pigs     1
1  76561198060698936  231160                  The Swapper     2
2  76561198060698936  365590   Tom Clancy’s The Division™     5
3  76561198060698936  285900                  Gang Beasts     6
4  76561198060698936  481110                   The Bunker     3


In [22]:
#unique player ids
player_ids_collab = collaborative_df['Player ID'].unique().tolist()
player_ids_content = ranked_content_df['playerid'].unique().tolist()

# common player ids
common_player_ids = set(player_ids_collab).intersection(set(player_ids_content))
print(f"Number of common player ids: {len(common_player_ids)}")
only_in_collab = set(player_ids_collab) - common_player_ids
only_in_content = set(player_ids_content) - common_player_ids
print(f"Number of player ids only in collaborative df: {len(only_in_collab)}")
print(f"Number of player ids only in content df: {len(only_in_content)}")


Number of common player ids: 9894
Number of player ids only in collaborative df: 55763
Number of player ids only in content df: 37038


In [None]:
def _pick_col(df, candidates, required=False):
    """Pick the first existing column name from candidates (case-insensitive)."""
    cols = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c.lower() in cols:
            return cols[c.lower()]
    if required:
        raise ValueError(f"None of the columns {candidates} found in DataFrame.")
    return None

def fuse_cf_cb(
    collaborative_df: pd.DataFrame,
    ranked_content_df: pd.DataFrame,
    topk: int = 10,
    w_cf: float = 0.6,
    w_cb: float = 0.4,
    rrf_k: int = 60
) -> pd.DataFrame:

    ucol  = _pick_col(collaborative_df, ["Player Id"], required=True)
    gcol  = _pick_col(collaborative_df, ["Game Id"], required=True)
    rcol  = _pick_col(collaborative_df, ["Rank", "cf_rank", "position"], required=True)
    tcol_cf = _pick_col(collaborative_df, ["Title"])

    ucol_cb = _pick_col(ranked_content_df, ["playerid"], required=True)
    gcol_cb = _pick_col(ranked_content_df, ["gameid"], required=True)
    scol_cb = _pick_col(ranked_content_df, ["rank"], required=True)
    tcol_cb = _pick_col(ranked_content_df, ["title"])

    # ---- CF: Reciprocal Rank Fusion score from ranks ----
    cf = collaborative_df[[ucol, gcol, rcol]].copy()
    cf = cf.dropna(subset=[ucol, gcol, rcol])
    cf["cf_rank"] = cf[rcol].astype(float)
    cf["cf_rrf"]  = 1.0 / (rrf_k + cf["cf_rank"])
    cf = cf[[ucol, gcol, "cf_rank", "cf_rrf"]]

    if tcol_cf:
        cf = cf.merge(
            collaborative_df[[ucol, gcol, tcol_cf]].drop_duplicates([ucol, gcol]),
            on=[ucol, gcol],
            how="left"
        ).rename(columns={tcol_cf: "title_cf"})

    # ---- CB: per-user min–max normalization of similarity ----
    cb = ranked_content_df[[ucol_cb, gcol_cb, scol_cb]].copy()
    cb = cb.dropna(subset=[ucol_cb, gcol_cb, scol_cb])
    cb = cb.rename(columns={ucol_cb: ucol, gcol_cb: gcol, scol_cb: "cb_sim"})

    # group-wise min–max
    cb["cb_norm"] = cb.groupby(ucol)["cb_sim"].transform(
        lambda s: (s.max() - s) / (s.max() - s.min()) if s.max() > s.min() else 1.0
    )
    cb = cb[[ucol, gcol, "cb_sim", "cb_norm"]]

    # keep CB titles if present
    if tcol_cb:
        cb = cb.merge(
            ranked_content_df[[ucol_cb, gcol_cb, tcol_cb]].rename(columns={ucol_cb: ucol, gcol_cb: gcol, tcol_cb: "title_cb"}).drop_duplicates([ucol, gcol]),
            on=[ucol, gcol],
            how="left"
        )

    # ---- merge candidates (union of CF & CB per user) ----
    fused = pd.merge(cf, cb, on=[ucol, gcol], how="outer")
    fused["title"] = fused.get("title_cf") if "title_cf" in fused else None
    if "title" in fused:
        if "title_cb" in fused:
            fused["title"] = fused["title"].fillna(fused["title_cb"])

    # missing scores → 0
    fused["cf_rrf"]  = fused["cf_rrf"].fillna(0.0)
    fused["cb_norm"] = fused["cb_norm"].fillna(0.0)

    # ---- weighted blend ----
    fused["hybrid_score"] = w_cf * fused["cf_rrf"] + w_cb * fused["cb_norm"]

    # ---- rank per user and take top-k ----
    fused["final_rank"] = fused.groupby(ucol)["hybrid_score"].rank(method="first", ascending=False)
    top = fused.loc[fused["final_rank"] <= topk].copy()

    # tidy columns
    keep_cols = [ucol, gcol, "title"] + [c for c in ["cf_rank", "cb_sim", "cf_rrf", "cb_norm", "hybrid_score", "final_rank"] if c in top]
    top = top[keep_cols].sort_values([ucol, "final_rank"])

    top = top.rename(columns={ucol: "playerid", gcol: "gameid"})
    return top.reset_index(drop=True)


final_top10 = fuse_cf_cb(collaborative_df, ranked_content_df, topk=10, w_cf=0.6, w_cb=0.4, rrf_k=60)


In [24]:
display(final_top10.head(20))

Unnamed: 0,playerid,gameid,title,cf_rank,cb_sim,cf_rrf,cb_norm,hybrid_score,final_rank
0,76561197960266642,400040,ShareX,1.0,,0.016393,0.0,0.009836,1.0
1,76561197960266642,2609610,CONVRGENCE,2.0,,0.016129,0.0,0.009677,2.0
2,76561197960266642,1919460,Seraph's Last Stand,3.0,,0.015873,0.0,0.009524,3.0
3,76561197960266642,822240,Animal Jam,4.0,,0.015625,0.0,0.009375,4.0
4,76561197960266642,900040,ELEX II,5.0,,0.015385,0.0,0.009231,5.0
5,76561197960266642,2539960,Orbo's Odyssey,6.0,,0.015152,0.0,0.009091,6.0
6,76561197960266642,736570,The Crooked Man,7.0,,0.014925,0.0,0.008955,7.0
7,76561197960266642,2511500,Dominions 6 - Rise of the Pantokrator,8.0,,0.014706,0.0,0.008824,8.0
8,76561197960266642,440540,Ara Fell: Enhanced Edition,9.0,,0.014493,0.0,0.008696,9.0
9,76561197960266642,1182020,WGT Golf,10.0,,0.014286,0.0,0.008571,10.0
