In [11]:
import pandas as pd
import numpy as np
import pandas as pd
import fasttext
from ast import literal_eval
from collections import Counter

In [12]:
# Load the data into pandas DataFrames
print("Loading datasets...")
reviews_df = pd.read_csv("./archive/dataset.csv")  # columns: app_id, app_name, review_text


Loading datasets...


In [None]:
tags_df = pd.read_csv("output.csv")        # columns: APPID, tags

# Map the string representation of lists/dicts back to actual lists/dicts using literal_eval
tags_df['tags'] = [literal_eval(x) for x in tags_df['tags']]

# # Enable this line if you want to ignore the count of tags
new_tags = []
for x in tags_df['tags']:    
    if isinstance(x, dict):
        keys_list = list(x.keys())  
        new_tags.append(keys_list) 
    else:
        # If no tags
        new_tags.append([])
tags_df['tags'] = new_tags

print(tags_df.head())

    APPID                                               tags
0      10  [Action, FPS, Multiplayer, Shooter, Classic, T...
1    1002  [2D Fighter, Martial Arts, Intentionally Awkwa...
2  100400                   [Animation & Modeling, Software]
3   10090  [Zombies, World War II, FPS, Multiplayer, Acti...
4  100980  [Animation & Modeling, Utilities, Design & Ill...


In [42]:
def filter_rare_tags(tags_df, min_count=5):
    """
    Filters out tags that occur less than min_count times across all games.
    
    Args:
        tags_df (pd.DataFrame): columns ['APPID', 'tags'], where 'tags' is a list of tag names
        min_count (int): minimum number of occurrences required to keep a tag
    
    Returns:
        pd.DataFrame: filtered tags_df
    """
    # Flatten all tags into a single list
    all_tags = [tag for tags in tags_df['tags'] for tag in tags]
    tag_counts = Counter(all_tags)
    print(len(tag_counts), tag_counts)
    # Keep only tags above threshold
    allowed_tags = {tag for tag, count in tag_counts.items() if count >= min_count}
    # print(len(allowed_tags), allowed_tags)
    # Filter tags per game
    def keep_tags(tag_list):
        return [tag for tag in tag_list if tag in allowed_tags]
    
    tags_df['tags'] = tags_df['tags'].apply(keep_tags)
    
    # Optionally, drop games with no tags left after filtering
    tags_df = tags_df[tags_df['tags'].str.len() > 0].reset_index(drop=True)
    
    return tags_df

# Apply rare tag filtering
tags_df = filter_rare_tags(tags_df, min_count=500)
# print("Tags after rare tag filtering:", tags_df.shape)
print(tags_df)

446 Counter({'Indie': 6304, 'Action': 5048, 'Singleplayer': 4482, 'Adventure': 4335, 'Casual': 3422, 'Strategy': 2523, 'Multiplayer': 2068, 'Simulation': 2068, 'RPG': 1991, '2D': 1990, 'Atmospheric': 1737, 'Puzzle': 1722, 'Great Soundtrack': 1630, 'Story Rich': 1327, 'Shooter': 1198, 'Sci-fi': 1196, 'First-Person': 1134, 'Platformer': 1117, 'Funny': 1100, 'Fantasy': 1089, 'Difficult': 1086, 'Open World': 1061, 'Co-op': 1060, 'Pixel Graphics': 1049, 'Horror': 1044, 'Female Protagonist': 1041, 'Arcade': 953, 'Retro': 950, 'Free to Play': 887, 'Point & Click': 824, 'Anime': 813, 'Classic': 802, 'FPS': 776, 'Early Access': 774, 'Exploration': 757, 'Comedy': 755, 'Survival': 723, 'Family Friendly': 722, 'Sandbox': 712, 'Third Person': 690, 'Cute': 646, 'Turn-Based': 627, 'Space': 626, 'VR': 605, 'Gore': 579, 'Controller': 538, 'Tactical': 495, 'Local Co-Op': 484, 'Local Multiplayer': 483, 'Colorful': 480, 'Psychological Horror': 466, 'Sports': 453, 'Visual Novel': 440, 'Side Scroller': 435,

In [7]:
# -----------------------------
# 2. Aggregate reviews per game (only games with tags available)
# -----------------------------

print("Aggregating reviews per game...")

reviews_df['app_id'] = reviews_df['app_id'].astype(int)
tags_df['APPID'] = tags_df['APPID'].astype(int)

# Filter reviews to only include games present in tags_df
reviews_with_tags = reviews_df[reviews_df['app_id'].isin(tags_df['APPID'])]
print(reviews_with_tags.head(5))

# Convert review_text to string
reviews_with_tags['review_text'] = reviews_with_tags['review_text'].astype(str)

# Filter out reviews shorter than 3 words
reviews_with_tags = reviews_with_tags.loc[reviews_with_tags['review_text'].str.split().str.len() >= 3]

print("Filtered reviews shape:", reviews_with_tags.shape)

# Aggregate reviews per game and keep app_name
game_reviews = (
    reviews_with_tags.groupby('app_id')
    .agg({
        'review_text': lambda texts: " ".join(texts),
        'app_name': 'first'  # keep the first app_name per app_id
    })
    .reset_index()
)
print("Aggregated reviews shape:", game_reviews.shape)

# Merge with filtered tags
game_reviews = game_reviews.merge(tags_df, left_on='app_id', right_on='APPID')
print("Merged game_reviews shape:", game_reviews.shape)

print("Sample data:")
print(game_reviews.head())


Aggregating reviews per game...
   app_id        app_name                                        review_text  \
0      10  Counter-Strike                                    Ruined my life.   
1      10  Counter-Strike  This will be more of a ''my experience with th...   
2      10  Counter-Strike                      This game saved my virginity.   
3      10  Counter-Strike  • Do you like original games? • Do you like ga...   
4      10  Counter-Strike           Easy to learn, hard to master.             

   review_score  review_votes  
0             1             0  
1             1             1  
2             1             0  
3             1             0  
4             1             1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_with_tags['review_text'] = reviews_with_tags['review_text'].astype(str)


KeyboardInterrupt: 

In [None]:
# Rewrite in humman language

import ast
game_reviews['tags'] = game_reviews['tags'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Build FastText training lines
def to_fasttext_format(row):
    labels = " ".join([f"__label__{tag}" for tag in row['tags']])
    return f"{labels} {row['review_text']}"

game_reviews["fasttext_format"] = game_reviews.apply(to_fasttext_format, axis=1)

# Write to file for training
game_reviews["fasttext_format"].to_csv("fasttext_train.txt", index=False, header=False)

In [None]:
# Train FastText model
model = fasttext.train_supervised(
    input="fasttext_train.txt",
    lr=0.1,
    epoch=25,
    wordNgrams=2,
    dim=100,
    loss="ova"  # multi-label support
)

model.save_model("review_to_tag_model.ftz")

In [None]:
# game_reviews['embedding'] = game_reviews['review_text'].apply(model.get_sentence_vector)


In [None]:
import pandas as pd
from ast import literal_eval

# Ensure original tags are lists
tags_df['tags'] = tags_df['tags'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
tags_df = tags_df.rename(columns={'APPID': 'app_id'})

# Predict tags per game
predictions = []
for _, row in game_reviews.iterrows():
    labels, probs = model.predict(row['review_text'], k=10)
    labels = [l.replace("__label__", "") for l in labels]
    predictions.append({
        'app_id': row['app_id'],
        'predicted_tags_with_probs': [f"{tag}:{p:.2f}" for tag, p in zip(labels, probs)]
    })

# Merge with original tags
merged_df = pd.DataFrame(predictions).merge(tags_df[['app_id', 'tags']], on='app_id')
display(merged_df)


Unnamed: 0,app_id,predicted_tags_with_probs,tags
0,10,"[Multiplayer:0.91, Singleplayer:0.91, Classic:...","[Action, FPS, Multiplayer, Shooter, Classic, F..."
1,20,"[Multiplayer:0.77, Singleplayer:0.74, Action:0...","[Action, FPS, Multiplayer, Classic, Shooter, F..."
2,30,"[Multiplayer:0.85, Singleplayer:0.82, Action:0...","[FPS, Multiplayer, Shooter, Action, Classic, F..."
3,40,"[Multiplayer:0.84, Singleplayer:0.78, Action:0...","[Action, FPS, Classic, Multiplayer, Shooter, F..."
4,50,"[Singleplayer:0.84, Story:0.54, Adventure:0.52...","[FPS, Action, Classic, Sci-fi, Singleplayer, S..."
...,...,...,...
9827,563180,"[Indie:0.52, Singleplayer:0.42, Casual:0.37, P...","[Indie, Adventure, Difficult, Platformer, Grea..."
9828,563400,"[Indie:0.55, Casual:0.36, Adventure:0.34, Puzz...","[Free to Play, Adventure, Indie]"
9829,563430,"[Singleplayer:0.51, Indie:0.50, Action:0.37, C...","[RPG, Pixel Graphics, Sci-fi, Indie, Free to P..."
9830,563510,"[Indie:0.58, Casual:0.38, Action:0.35, Strateg...","[Action, VR]"


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Prepare true and predicted labels
# true labels
y_true = merged_df['tags'].tolist()  # list of lists

# predicted labels
y_pred = merged_df['predicted_tags_with_probs'].apply(
    lambda tags_probs: [tp.split(':')[0] for tp in tags_probs]  # strip probabilities
).tolist()

# Binarize labels
mlb = MultiLabelBinarizer()
y_true_bin = mlb.fit_transform(y_true)
y_pred_bin = mlb.transform(y_pred)  # use the same classes as true

# Compute metrics
precision = precision_score(y_true_bin, y_pred_bin, average='micro')
recall = recall_score(y_true_bin, y_pred_bin, average='micro')
f1 = f1_score(y_true_bin, y_pred_bin, average='micro')

print(f"Precision (micro): {precision:.4f}")
print(f"Recall (micro): {recall:.4f}")
print(f"F1 score (micro): {f1:.4f}")


Accuracy (exact match): 0.0000
Precision (micro): 0.4087
Recall (micro): 0.5233
F1 score (micro): 0.4589




In [None]:
import requests

def get_tags(APPID:int):
    url = f"https://steamspy.com/api.php?request=appdetails&appid={APPID}"

    response = requests.get(url)

    if response.status_code == 200:
        return response.json()['tags']
    else:
        print(f"API fail for {APPID}")
        
