In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Path of the video-game sales CSV loaded below.
FILE_PATH = r"/Users/thatc/Downloads/archive/Video_Games_Sales_as_at_22_Dec_2016.csv"

df = pd.read_csv(FILE_PATH, encoding="latin-1")

# Coerce the numeric columns; entries that cannot be parsed as numbers
# (e.g. the literal 'tbd') become NaN and are dropped further down.
for col in ["User_Score", "Critic_Score", "Global_Sales"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Align the two score scales by dividing each by its own divisor.
df["User_Score"] = df["User_Score"] / 10.0
df["Critic_Score"] = df["Critic_Score"] / 100.0

# Keep only rows where every required field is present. The index is reset so
# that row labels are 0..n-1 and coincide with positions in the matrices below.
need = ["Name", "Genre", "Platform", "User_Score", "Critic_Score", "Global_Sales"]
df = df.dropna(subset=need).reset_index(drop=True)

# One-hot encode the categoricals. dtype=float is requested explicitly: newer
# pandas releases emit boolean dummy columns by default, and mixing bool with
# float columns turns `X.values` into an object array that has to be converted
# back to numbers downstream. Float dummies keep X homogeneous.
X_cat = pd.get_dummies(df[["Genre", "Platform"]], drop_first=False, dtype=float)

# Min-max scale the numeric columns, keeping df's index so the concat aligns.
X_num = pd.DataFrame(
    MinMaxScaler().fit_transform(df[["User_Score", "Critic_Score", "Global_Sales"]]),
    columns=["User_Score", "Critic_Score", "Global_Sales"],
    index=df.index,
)

X = pd.concat([X_cat, X_num], axis=1)

# Pairwise cosine similarity between all rows of X: S[i, j] compares row i
# with row j, so S is n x n for n remaining rows.
S = cosine_similarity(X.values)

# Map each title to its row position. If a title occurs on several rows, only
# the LAST position is kept, because later dict entries overwrite earlier ones.
name_to_idx = {name: i for i, name in enumerate(df["Name"])}

def top_k_similar(name: str, k: int = 10) -> pd.DataFrame:
    """Return the games most similar to `name`, ranked by cosine similarity.

    Relies on the module-level `name_to_idx` (title -> row position), `S`
    (pairwise similarity matrix) and `df` (the cleaned game table).

    Args:
        name: Exact title to look up in `name_to_idx`.
        k: Maximum number of neighbours to return. Fewer rows are returned
            when fewer than `k` other rows exist.

    Returns:
        A DataFrame with columns "Rank" (1-based), "Game" and "Similarity"
        (rounded to 4 decimals), ordered from most to least similar. The
        queried row itself is excluded.

    Raises:
        ValueError: If `name` is not a known title, or if `k` is negative.
    """
    try:
        i = name_to_idx[name]
    except KeyError:
        raise ValueError(f"'{name}' not found (CSV ends in 2016).") from None
    if k < 0:
        # A negative k would silently slice from the end of the ranking.
        raise ValueError(f"k must be non-negative, got {k}.")

    sims = S[i]
    # Negate so that argsort's ascending order yields highest similarity first.
    order = np.argsort(-sims)
    # Drop the query row itself, then keep at most k neighbours.
    idxs = order[order != i][:k]
    return pd.DataFrame({
        # Sized from the neighbours actually found, not from k, so the columns
        # always have matching lengths even when k exceeds the candidates.
        "Rank": range(1, len(idxs) + 1),
        "Game": df["Name"].iloc[idxs].to_list(),
        "Similarity": sims[idxs].round(4),
    })

# Demo: print the nearest neighbours for a handful of well-known titles.
# A title missing from the data prints the lookup error instead of a table.
for q in [
    "Grand Theft Auto V",
    "The Last of Us",
    "Mario Kart 8",
    "Mass Effect 3",
    "Batman: Arkham City",
]:
    print(f"\nTop similar to: {q}")
    try:
        neighbours = top_k_similar(q)
    except ValueError as err:
        print(err)
    else:
        print(neighbours)



Top similar to: Grand Theft Auto V
   Rank                                  Game  Similarity
0     1  Metal Gear Solid V: The Phantom Pain      0.9996
1     2                          Guild Wars 2      0.9994
2     3                  Grand Theft Auto III      0.9994
3     4         Assassin's Creed: Brotherhood      0.9987
4     5           Grand Theft Auto: Vice City      0.9987
5     6                   Batman: Arkham City      0.9987
6     7        Final Fantasy XIV: Heavensward      0.9986
7     8                         Saints Row IV      0.9985
8     9                 Batman: Arkham Asylum      0.9985
9    10                            Dead Space      0.9984

Top similar to: The Last of Us
   Rank                                      Game  Similarity
0     1                       Red Dead Redemption      0.9999
1     2                Uncharted 2: Among Thieves      0.9998
2     3  Metal Gear Solid 4: Guns of the Patriots      0.9998
3     4                            God of War 