## Notes

- Fasttext is case sensitive
- Try using k=-1
- Try keeping stop_words
- Try excluding unpolular tags and software tags

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import fasttext
from ast import literal_eval
from collections import Counter

In [2]:
# Load the data into pandas DataFrames
print("Loading datasets...")
reviews_df = pd.read_csv("./archive/dataset.csv")  # columns: app_id, app_name, review_text
# reviews_df = pd.read_csv("similarity_adjusted_reviews.csv")  # columns: app_id, app_name, review_text

Loading datasets...


In [5]:
import pandas as pd
from ast import literal_eval

# Load your main tags data
tags_df = pd.read_csv("output.csv")  # columns: APPID, tags
tags_df['tags'] = [literal_eval(x) for x in tags_df['tags']]

# Ensure all tags are lists (extract keys if dicts)
new_tags = []
for x in tags_df['tags']:    
    if isinstance(x, dict):
        new_tags.append(list(x.keys())) 
    else:
        new_tags.append(x if isinstance(x, list) else [])
tags_df['tags'] = new_tags

# Load compound tags mapping
compounds_df = pd.read_csv("compound_tags.csv")  # columns: 'Compound tag', 'Compounds'
compounds_df['Compounds'] = compounds_df['Compounds'].apply(literal_eval)

# Convert to dictionary for faster lookup
compound_map = dict(zip(compounds_df['Compound tag'], compounds_df['Compounds']))

# Decompound tags
decompounded_tags = []
for tags in tags_df['tags']:
    new_list = []
    for tag in tags:
        if tag in compound_map:
            # Replace compound tag with its components
            new_list.extend(compound_map[tag])
        else:
            new_list.append(tag)
    decompounded_tags.append(new_list)

tags_df['tags'] = decompounded_tags

print(tags_df.head())


    APPID                                               tags
0      10  [Action, First-person, Shooter, Multiplayer, S...
1    1002  [2D, Fighting, Martial Arts, Intentionally Awk...
2  100400                   [Animation & Modeling, Software]
3   10090  [Zombies, World War II, First-person, Shooter,...
4  100980  [Animation & Modeling, Utilities, Design & Ill...


In [58]:
tags_df = pd.read_csv("output.csv")        # columns: APPID, tags

# Map the string representation of lists/dicts back to actual lists/dicts using literal_eval
tags_df['tags'] = [literal_eval(x) for x in tags_df['tags']]

# Enable this line if you want to ignore the count of tags
new_tags = []
for x in tags_df['tags']:    
    if isinstance(x, dict):
        keys_list = list(x.keys())  
        new_tags.append(keys_list) 
    else:
        # If no tags
        new_tags.append([])
tags_df['tags'] = new_tags

print(tags_df.head())

    APPID                                               tags
0      10  [Action, FPS, Multiplayer, Shooter, Classic, T...
1    1002  [2D Fighter, Martial Arts, Intentionally Awkwa...
2  100400                   [Animation & Modeling, Software]
3   10090  [Zombies, World War II, FPS, Multiplayer, Acti...
4  100980  [Animation & Modeling, Utilities, Design & Ill...


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

exclude_tags = [
    "Software", "Utilities", "Design & Illustration", "Photo Editing", 
    "Education", "Game Development", "Programming", "Software Training",
    "Web Publishing", "Video Production", "Audio Production", "Tutorial",
    "Coding", "Hardware", "360 Video"
]

# nltk.download('stopwords')
# nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

print("Aggregating reviews per game...")

reviews_df['app_id'] = reviews_df['app_id'].astype(int)
tags_df['APPID'] = tags_df['APPID'].astype(int)

# Filter reviews to only include games present in tags_df
reviews_with_tags = reviews_df[reviews_df['app_id'].isin(tags_df['APPID'])]
print(reviews_with_tags.head(5))

# Function to clean review text: lowercase + remove URLs
def clean_review(text):
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r"<[^>]*>", " ", text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # Replace numbers with <num>
    text = re.sub(r"\d+", " <num> ", text)
    # Remove special characters (keep words and spaces)
    text = re.sub(r"[^a-z\s]", " ", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Optional: lemmatize and remove stopwords
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(words)
    return text

# Apply cleaning with tqdm progress bar
tqdm.pandas(desc="Cleaning reviews")
reviews_with_tags['review_text'] = reviews_with_tags['review_text'].progress_apply(clean_review)

# Optional: filter out reviews shorter than 3 words
# reviews_with_tags = reviews_with_tags.loc[reviews_with_tags['review_text'].str.split().str.len() >= 3]

print("Filtered reviews shape:", reviews_with_tags.shape)

# Aggregate reviews per game and keep app_name
game_reviews = (
    reviews_with_tags.groupby('app_id')
    .agg({
        'review_text': lambda texts: " ".join(texts),
        'app_name': 'first'  # keep the first app_name per app_id
    })
    .reset_index()
)
print("Aggregated reviews shape:", game_reviews.shape)

# Merge with filtered tags
game_reviews = game_reviews.merge(tags_df, left_on='app_id', right_on='APPID')
print("Merged game_reviews shape:", game_reviews.shape)

print("Sample data:")
print(game_reviews.head())
game_reviews.to_csv("game_reviews.csv", index=False)


Aggregating reviews per game...
   app_id        app_name                                        review_text  \
0      10  Counter-Strike                                    Ruined my life.   
1      10  Counter-Strike  This will be more of a ''my experience with th...   
2      10  Counter-Strike                      This game saved my virginity.   
3      10  Counter-Strike  • Do you like original games? • Do you like ga...   
4      10  Counter-Strike           Easy to learn, hard to master.             

   review_score  review_votes  
0             1             0  
1             1             1  
2             1             0  
3             1             0  
4             1             1  


Cleaning reviews: 100%|██████████| 6417106/6417106 [12:37<00:00, 8466.83it/s] 


Filtered reviews shape: (6417106, 5)
Aggregated reviews shape: (9972, 3)
Merged game_reviews shape: (9972, 5)
Sample data:
   app_id                                        review_text  \
0      10  ruined life experience game type review saying...   
1      20  got christmas num along half life even know go...   
2      30  even though old better call duty ghost want ww...   
3      40  buy game join community one game great fast pa...   
4      50  first game created gearbox legendary developer...   

                    app_name  APPID  \
0             Counter-Strike     10   
1      Team Fortress Classic     20   
2              Day of Defeat     30   
3         Deathmatch Classic     40   
4  Half-Life: Opposing Force     50   

                                                tags  
0  [Action, First-person, Shooter, Multiplayer, S...  
1  [Action, First-person, Shooter, Multiplayer, C...  
2  [First-person, Shooter, World War II, Multipla...  
3  [Action, First-person, Shooter, Cl

In [None]:
# Process tags 
all_tags = [tag for tags in tags_df['tags'] for tag in tags]
tag_counts = Counter(all_tags)

allowed_tags = {tag for tag, count in tag_counts.items() if count > 20}
print(len(tag_counts), tag_counts)
tag_counts_df = pd.DataFrame(tag_counts.items(), columns=['tag', 'count'])
tag_counts_df.to_csv("tag_counts.csv", index=False)

446 Counter({'Indie': 6304, 'Action': 5048, 'Singleplayer': 4482, 'Adventure': 4335, 'Casual': 3422, 'Strategy': 2523, 'Multiplayer': 2068, 'Simulation': 2068, 'RPG': 1991, '2D': 1990, 'Atmospheric': 1737, 'Puzzle': 1722, 'Great Soundtrack': 1630, 'Story Rich': 1327, 'Shooter': 1198, 'Sci-fi': 1196, 'First-Person': 1134, 'Platformer': 1117, 'Funny': 1100, 'Fantasy': 1089, 'Difficult': 1086, 'Open World': 1061, 'Co-op': 1060, 'Pixel Graphics': 1049, 'Horror': 1044, 'Female Protagonist': 1041, 'Arcade': 953, 'Retro': 950, 'Free to Play': 887, 'Point & Click': 824, 'Anime': 813, 'Classic': 802, 'FPS': 776, 'Early Access': 774, 'Exploration': 757, 'Comedy': 755, 'Survival': 723, 'Family Friendly': 722, 'Sandbox': 712, 'Third Person': 690, 'Cute': 646, 'Turn-Based': 627, 'Space': 626, 'VR': 605, 'Gore': 579, 'Controller': 538, 'Tactical': 495, 'Local Co-Op': 484, 'Local Multiplayer': 483, 'Colorful': 480, 'Psychological Horror': 466, 'Sports': 453, 'Visual Novel': 440, 'Side Scroller': 435,

In [1]:
from sklearn.model_selection import train_test_split
import ast
game_reviews['tags'] = game_reviews['tags'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Build FastText training lines
def to_fasttext_format(row):
    # Convert spaces in tags to underscores to make them single tokens
    labels = " ".join([f"__label__{tag.replace(' ', '_')}" for tag in row['tags']])
    return f"{labels} {row['review_text']}"


game_reviews["fasttext_format"] = game_reviews.apply(to_fasttext_format, axis=1)

train_df, test_df = train_test_split(
    game_reviews, test_size=0.33, random_state=42
)
train_df["fasttext_format"].to_csv("fasttext_train.txt", index=False, header=False)
test_df["fasttext_format"].to_csv("fasttext_test.txt", index=False, header=False)

NameError: name 'game_reviews' is not defined

## Undersampling

In [None]:
# from collections import Counter
# import pandas as pd
# import random

# # Count how many times each tag appears
# all_tags = [tag for tags in train_df['tags'] for tag in tags]
# tag_counts = Counter(all_tags)
# max_count = max(tag_counts.values())

# undersampled_rows = []

# for _, row in train_df.iterrows():
#     if not row['tags']:  # skip empty tags
#         continue

#     # Compute weight based on the most frequent tag in this review
#     # Reviews with frequent tags are less likely to be kept
#     max_tag_count = max([tag_counts[t] for t in row['tags']])
#     keep_prob = min(max_count / max_tag_count, 1.0)  # probability to keep this review

#     if random.random() < keep_prob:
#         undersampled_rows.append(row)

# train_df_balanced = pd.DataFrame(undersampled_rows)
# train_df_balanced["fasttext_format"].to_csv(
#     "fasttext_train_balanced.txt", index=False, header=False
# )


## Oversampling 


In [8]:
# from collections import Counter

# # Count tag frequencies
# all_tags = [tag for tags in train_df['tags'] for tag in tags]
# tag_counts = Counter(all_tags)
# max_count = max(tag_counts.values())

# balanced_rows = []

# for _, row in train_df.iterrows():
#     if not row['tags']:  # skip or handle empty tag lists
#         continue  # skip this review
#     repeat_factor = max([max_count // tag_counts[t] for t in row['tags']])
#     balanced_rows.extend([row] * repeat_factor)

# train_df_balanced = pd.DataFrame(balanced_rows)
# train_df_balanced["fasttext_format"].to_csv("fasttext_train_balanced.txt", index=False, header=False)


In [16]:
model = fasttext.train_supervised(
    # input="fasttext_train.txt",
    input="fasttext_train.txt",
    lr=0.1,
    epoch=60,
    wordNgrams=3,
    dim=200,
    minCount = 2,
    bucket=2000000,
    ws = 25,
    loss="ova",
    verbose=2 
)

model.save_model("review_to_tag_model2.ftz")

def evaluate(model, test_df):
    result = model.test(test_df)
    print(f"Samples: {result[0]}")
    print(f"Precision@1: {result[1]:.4f}")
    print(f"Recall@1: {result[2]:.4f}")

In [8]:
model = fasttext.train_supervised(
    # input="fasttext_train.txt",
    input="fasttext_train.txt",
    lr=0.2,
    epoch=50,
    wordNgrams=3,
    dim=200,
    minCount = 7,
    ws = 10,
    loss="ova",
    verbose=2 
)

model.save_model("review_to_tag_model_decompounding.ftz")



In [9]:
import pandas as pd
from ast import literal_eval

model = fasttext.load_model("review_to_tag_model_decompounding.ftz")

# Ensure original tags are lists
tags_df['tags'] = tags_df['tags'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
tags_df = tags_df.rename(columns={'APPID': 'app_id'})

# Predict tags per game with dynamic k
predictions = []
for _, row in test_df.iterrows():
    # Determine k based on number of true tags for this game
    true_tags = tags_df.loc[tags_df['app_id'] == row['app_id'], 'tags'].values[0]
    k = len(true_tags) if len(true_tags) > 0 else 1  # ensure at least 1

    # Predict top-k tags
    labels, probs = model.predict(row['review_text'], k=k) 
    labels = [l.replace("__label__", "") for l in labels]

    predictions.append({
        'app_id': row['app_id'],
        'predicted_tags_with_probs': [f"{tag}:{p:.2f}" for tag, p in zip(labels, probs)]
    })

# Merge with original tags
merged_df = pd.DataFrame(predictions).merge(tags_df[['app_id', 'tags']], on='app_id')
display(merged_df)
merged_df.to_csv("merged_predictions.csv", index=False)


Unnamed: 0,app_id,predicted_tags_with_probs,tags
0,298160,"[RPG:0.92, Action:0.69, Adventure:0.69, Multip...","[Free to Play, RPG, Massively Multiplayer, Mas..."
1,261030,"[Adventure:0.93, Singleplayer:0.90, Story:0.89...","[Zombies, Adventure, Story Rich, Episodic, Poi..."
2,333380,"[Indie:0.82, Action:0.56, Early:0.40, Simulati...","[RPG, Indie, Early Access, Rogue-like]"
3,3450,"[Singleplayer:0.61, Casual:0.45, Action:0.44]","[Casual, Typing, Education]"
4,514240,"[Indie:0.92, Horror:0.89, Action:0.84, Adventu...","[Action, Adventure, Indie, Simulation, Horror,..."
...,...,...,...
3286,238240,"[Indie:0.90, Action:0.72, RPG:0.59, Adventure:...","[Open World, Survival, Crafting, Sandbox, Indi..."
3287,289840,"[Adventure:0.90, Point:0.76, Singleplayer:0.74...","[Adventure, Casual, Point & Click, Vampire, Ho..."
3288,269770,"[Indie:0.76, Action:0.73, Multiplayer:0.67, Ea...","[Adventure, Singleplayer, Multiplayer, Indie, ..."
3289,526780,"[Casual:0.96, Singleplayer:0.89, Indie:0.87, A...","[Precision Platformer, Platformer, Action, Adv..."


In [49]:
import pandas as pd
from ast import literal_eval

# Ensure original tags are lists
tags_df['tags'] = tags_df['tags'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
tags_df = tags_df.rename(columns={'APPID': 'app_id'})

# Predict tags per game
predictions = []
for _, row in test_df.iterrows():
    labels, probs = model.predict(row['review_text'], k=-1)  # get top 20 predictions
    labels = [l.replace("__label__", "") for l in labels]
    predictions.append({
        'app_id': row['app_id'],
        'predicted_tags_with_probs': [f"{tag}:{p:.2f}" for tag, p in zip(labels, probs)]
    })

# Merge with original tags
merged_df = pd.DataFrame(predictions).merge(tags_df[['app_id', 'tags']], on='app_id')
display(merged_df)
merged_df.to_csv("merged_predictions.csv", index=False)


Unnamed: 0,app_id,predicted_tags_with_probs,tags
0,298160,"[RPG:0.93, Adventure:0.70, Action:0.69, Multip...","[Free to Play, RPG, Massively Multiplayer, MMO..."
1,261030,"[Adventure:0.93, Singleplayer:0.90, Story:0.90...","[Zombies, Adventure, Story Rich, Episodic, Poi..."
2,333380,"[Indie:0.85, Action:0.49, Early:0.44, Strategy...","[RPG, Indie, Early Access, Rogue-like]"
3,3450,"[Singleplayer:0.61, Casual:0.40, Action:0.39, ...","[Casual, Typing, Education]"
4,514240,"[Indie:0.93, Horror:0.88, Action:0.83, Adventu...","[Action, Adventure, Indie, Simulation, Horror,..."
...,...,...,...
3286,238240,"[Indie:0.90, Action:0.72, RPG:0.62, Adventure:...","[Open World Survival Craft, Sandbox, Indie, Ad..."
3287,289840,"[Adventure:0.91, Point:0.76, Singleplayer:0.75...","[Adventure, Casual, Point & Click, Vampire, Ho..."
3288,269770,"[Indie:0.78, Action:0.67, Early:0.67, Multipla...","[Adventure, Singleplayer, Multiplayer, Indie, ..."
3289,526780,"[Casual:0.96, Action:0.90, Indie:0.90, Singlep...","[Precision Platformer, Platformer, Action, Adv..."


In [10]:
# ==============================
# Multi-label Evaluation Cell
# ==============================

from sklearn.metrics import f1_score, precision_score, recall_score, jaccard_score
import numpy as np

# Prepare true and predicted binary matrices
all_labels = sorted({l for tags in merged_df['tags'] for l in tags})
label_to_idx = {l: i for i, l in enumerate(all_labels)}

y_true = []
y_pred = []

for _, row in merged_df.iterrows():
    # Binary vector for true tags
    y_t = [0] * len(all_labels)
    for t in row['tags']:
        if t in label_to_idx:
            y_t[label_to_idx[t]] = 1

    # Binary vector for predicted tags
    pred_tags = [tp.split(":")[0] for tp in row['predicted_tags_with_probs']]
    y_p = [0] * len(all_labels)
    for p in pred_tags:
        if p in label_to_idx:
            y_p[label_to_idx[p]] = 1

    y_true.append(y_t)
    y_pred.append(y_p)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Compute metrics
subset_acc = np.mean(np.all(y_true == y_pred, axis=1))
jaccard_acc = jaccard_score(y_true, y_pred, average='samples')
micro_precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
micro_recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
micro_f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
macro_precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
macro_recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

# Print results
print("=== Multi-label Classification Metrics ===")
print(f"Subset Accuracy (exact match):       {subset_acc:.4f}")
print(f"Jaccard-based Accuracy (rank-free):  {jaccard_acc:.4f}")
print(f"Precision (micro):                   {micro_precision:.4f}")
print(f"Recall (micro):                      {micro_recall:.4f}")
print(f"F1 score (micro):                    {micro_f1:.4f}")
print(f"Precision (macro):                   {macro_precision:.4f}")
print(f"Recall (macro):                      {macro_recall:.4f}")
print(f"F1 score (macro):                    {macro_f1:.4f}")


=== Multi-label Classification Metrics ===
Subset Accuracy (exact match):       0.0273
Jaccard-based Accuracy (rank-free):  0.3797
Precision (micro):                   0.5576
Recall (micro):                      0.4868
F1 score (micro):                    0.5198
Precision (macro):                   0.1387
Recall (macro):                      0.1151
F1 score (macro):                    0.1134


In [None]:
# import pandas as pd
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# # Prepare true and predicted labels
# y_true = merged_df['tags'].tolist()  # list of lists
# y_pred = merged_df['predicted_tags_with_probs'].apply(
#     lambda tags_probs: [tp.split(':')[0] for tp in tags_probs]  # strip probabilities
# ).tolist()

# # Binarize labels
# mlb = MultiLabelBinarizer()
# y_true_bin = mlb.fit_transform(y_true)
# y_pred_bin = mlb.transform(y_pred)

# # --- Compute metrics ---

# # 1. Strict subset accuracy (exact match)
# subset_accuracy = accuracy_score(y_true_bin, y_pred_bin)

# # 2. Rank-agnostic Jaccard-based accuracy
# jaccard_accuracies = [
#     len(set(true) & set(pred)) / len(set(true) | set(pred)) if len(set(true) | set(pred)) > 0 else 1.0
#     for true, pred in zip(y_true, y_pred)
# ]
# jaccard_accuracy = sum(jaccard_accuracies) / len(jaccard_accuracies)

# # 3. Micro-averaged metrics
# precision_micro = precision_score(y_true_bin, y_pred_bin, average='micro')
# recall_micro = recall_score(y_true_bin, y_pred_bin, average='micro')
# f1_micro = f1_score(y_true_bin, y_pred_bin, average='micro')

# # 4. Macro-averaged metrics
# precision_macro = precision_score(y_true_bin, y_pred_bin, average='macro')
# recall_macro = recall_score(y_true_bin, y_pred_bin, average='macro')
# f1_macro = f1_score(y_true_bin, y_pred_bin, average='macro')


# # --- Print results ---
# print("=== Multi-label Classification Metrics ===")
# print(f"Subset Accuracy (exact match):       {subset_accuracy:.4f}")
# print(f"Jaccard-based Accuracy (rank-free):  {jaccard_accuracy:.4f}")
# print(f"Precision (micro):                   {precision_micro:.4f}")
# print(f"Recall (micro):                      {recall_micro:.4f}")
# print(f"F1 score (micro):                    {f1_micro:.4f}")
# print(f"Precision (macro):                   {precision_macro:.4f}")
# print(f"Recall (macro):                      {recall_macro:.4f}")
# print(f"F1 score (macro):                    {f1_macro:.4f}")




=== Multi-label Classification Metrics ===
Subset Accuracy (exact match):       0.0273
Jaccard-based Accuracy (rank-free):  0.3386
Precision (micro):                   0.5658
Recall (micro):                      0.4635
F1 score (micro):                    0.5096
Precision (macro):                   0.1315
Recall (macro):                      0.1061
F1 score (macro):                    0.1054


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

# --- Combine reviews per game ---
reviews_per_game = (
    reviews_df
    .groupby("app_id")["review_text"]
    .apply(lambda x: " ".join(x))
    .reset_index()
)

# --- TF-IDF encoding ---
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(reviews_per_game["review_text"])

# --- Compute pairwise similarity with progress bar ---
rows = X.shape[0]
sims = []

for i in tqdm(range(rows), desc="Computing similarities"):
    sim_row = cosine_similarity(X[i], X).flatten()
    sims.append(sim_row.mean())  # average similarity for this game

print(f"Mean similarity: {np.mean(sims):.4f}")
print(f"Std deviation: {np.std(sims):.4f}")


TypeError: sequence item 505: expected str instance, float found