In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
import random as r
import matplotlib.pyplot as plt
from collections import Counter
import tqdm
from sklearn.base import clone
import ast

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [None]:
# -----------------------------
# 1. Load your datasets
# -----------------------------
print("Loading datasets...")
reviews_df = pd.read_csv("./archive/dataset.csv")  # columns: app_id, app_name, review_text

Loading datasets...


In [25]:
tags_df = pd.read_csv("comp/tagGamesNoCompoundDF.csv")
tagCountsDF = pd.read_csv("comp/tagCountsDF.csv")
tags_df.head(10)

Unnamed: 0,app_id,tags
0,10,"['action', 'first-person', 'shooter', 'multipl..."
1,1002,"['2d', 'fighting', 'martial arts', 'intentiona..."
2,100400,"['animation & modeling', 'software']"
3,10090,"['zombies', 'world war ii', 'first-person', 's..."
4,100980,"['animation & modeling', 'utilities', 'design ..."
5,10100,"['adventure', 'point & click', 'classic', 'fan..."
6,10110,"['adventure', 'point & click', 'classic', 'com..."
7,10120,"['action', 'casual', 'space', 'on-rails shoote..."
8,10130,"['action', 'first-person', 'shooter', 'time ma..."
9,10140,"['sports', 'mini golf', 'golf', 'local multipl..."


In [26]:
def filter_rare_tags(filterDF, tagCountsDF, min_count=1, max_count=7000):
    # Keep tags within the occurrence thresholds
    allowed_tags = set(tagCountsDF.loc[
        (tagCountsDF['occurrences'] >= min_count) & (tagCountsDF['occurrences'] <= max_count),
        'tag'
    ])

    tags_df = filterDF.copy()

    # Filter tags per game
    tags_df['tags'] = tags_df['tags'].apply(lambda lst: [tag for tag in lst if tag in allowed_tags])
    
    # Drop games with no tags left
    tags_df = tags_df[tags_df['tags'].str.len() > 0]

    return tags_df


In [27]:
# -----------------------------
# 2. Aggregate reviews per game (only games with tags available)
# -----------------------------

def aggregate_reviews_per_game(reviews_df: pd.DataFrame, tags_df: pd.DataFrame) -> pd.DataFrame:
    print("Aggregating reviews per game...")

    # Standardize key column name
    reviews_df["app_id"] = reviews_df["app_id"].astype(int, copy=False)
    tags_df["app_id"] = tags_df["app_id"].astype(int, copy=False)

    # Filter reviews that have tags
    reviews_with_tags = (
        reviews_df.loc[reviews_df['app_id'].isin(tags_df['app_id']), ['app_id', 'app_name', 'review_text']]
        .assign(review_text=lambda df: df['review_text'].astype(str))
    )

    # Filter short reviews
    reviews_with_tags['review_text'] = reviews_with_tags['review_text'].astype(str)
    reviews_with_tags = reviews_with_tags.loc[
        reviews_with_tags['review_text'].str.count(r'\S+') >= 3
    ]
    print(f"Filtered reviews shape: {reviews_with_tags.shape}")

    # Aggregate per game
    game_reviews = (
        reviews_with_tags
        .groupby("app_id", as_index=False)
        .agg({
            'review_text': lambda texts: " ".join(texts),
            'app_name': 'first'
        })
    )
    print(f"Aggregated reviews shape: {game_reviews.shape}")

    # Merge and parse tags
    game_reviews = game_reviews.merge(tags_df, on="app_id", how='inner')
    game_reviews['tags'] = game_reviews['tags'].apply(ast.literal_eval)

    print(f"Merged game_reviews shape: {game_reviews.shape}")
    print("Sample of final aggregated data:")
    print(game_reviews.head(3))

    return game_reviews

game_reviews = aggregate_reviews_per_game(reviews_df, tags_df)

Aggregating reviews per game...
Filtered reviews shape: (8964327, 3)
Aggregated reviews shape: (10, 3)
Merged game_reviews shape: (10, 4)
Sample of final aggregated data:
   app_id                                        review_text  \
0      10  • Do you like original games? • Do you like ga...   
1      20  A great underrated classic that still has popu...   
2      30  One of the best skill-based and highly technic...   

                app_name                                               tags  
0         Counter-Strike  [action, first-person, shooter, multiplayer, s...  
1  Team Fortress Classic  [action, first-person, shooter, multiplayer, c...  
2          Day of Defeat  [first-person, shooter, world war ii, multipla...  


In [28]:

game_reviews_sample = game_reviews.head(10).copy()

tags_dfNew = filter_rare_tags(game_reviews, tagCountsDF, min_count=1, max_count=7000)

print(tags_df['tags'])
# Build the vocabulary of unique tags
import ast

# Convert string representation of lists to actual lists
tags_df['tags'] = tags_df['tags'].apply(ast.literal_eval)

# Now extract unique tags
all_tags = sorted(set(tag for sublist in tags_df['tags'] for tag in sublist))


excludedTags = [
    "Software", "Utilities", "Design & Illustration", "Photo Editing", 
    "Education", "Game Development", "Programming", "Software Training",
    "Web Publishing", "Video Production", "Audio Production", "Tutorial",
    "Coding", "Hardware", "360 Video"
]
excludedTags = {t.lower() for t in excludedTags}

all_tags = list(set(all_tags) - excludedTags)

print(all_tags)

0       ['action', 'first-person', 'shooter', 'multipl...
1       ['2d', 'fighting', 'martial arts', 'intentiona...
2                    ['animation & modeling', 'software']
3       ['zombies', 'world war ii', 'first-person', 's...
4       ['animation & modeling', 'utilities', 'design ...
                              ...                        
9957    ['first-person', 'shooter', 'action', 'comedy'...
9958    ['action', 'first-person', 'shooter', 'sci-fi'...
9959               ['adventure', 'action', 'local co-op']
9960    ['free to play', 'massively multiplayer', 'rpg...
9961    ['free to play', 'massively multiplayer', 'puz...
Name: tags, Length: 9962, dtype: object
['female protagonist', 'violent', 'procedural generation', 'ninja', 'linear', 'electronic music', 'transhumanism', 'dinosaurs', 'star wars', 'trivia', 'pool', 'escape room', 'old school', 'party game', 'archery', 'parkour', 'vr only', 'futuristic', 'fmv', 'hentai', 'pirates', 'western', 'comic book', 'narration', 'realis

In [29]:

def getCounts(vocab=all_tags, sents=game_reviews_sample['review_text'].fillna("").tolist()):

    pipe = Pipeline([('count', CountVectorizer(vocabulary=vocab)),

                    ('tfid', TfidfTransformer())], verbose=True).fit(sents)

    tdMatrix = pipe['count'].transform(sents).toarray().transpose()
    return tdMatrix


In [30]:
findTagOccurances = True

if findTagOccurances:
    tdMatrix = getCounts(sents=game_reviews['review_text'].fillna("").tolist())
    print('The shape of the term-document matrix is', tdMatrix.shape)
    tdMatrix_pd = pd.DataFrame(tdMatrix, index=all_tags, columns=list(game_reviews['app_id']))
    tdMatrix_pd.to_csv("TD_Matrix.csv", index=False)

else:
    tdMatrix_pd = pd.read_csv("TD_Matrix.csv")
    tdMatrix_pd.index = all_tags
    tdMatrix_pd.columns = list((game_reviews['app_id']).astype(int))

tdMatrix_pd


[Pipeline] ............. (step 1 of 2) Processing count, total= 5.3min
[Pipeline] .............. (step 2 of 2) Processing tfid, total=   0.1s
The shape of the term-document matrix is (402, 10)


Unnamed: 0,10,20,30,40,50,60,70,80,130,220
female protagonist,0,0,0,0,0,0,0,0,0,0
violent,0,0,0,0,0,0,0,0,0,0
procedural generation,0,0,0,0,0,0,0,0,0,0
ninja,12353,0,0,0,0,1237,0,0,0,0
linear,0,0,0,0,0,0,39580,4244,0,51864
...,...,...,...,...,...,...,...,...,...,...
mature,12353,0,1030,0,0,0,0,0,0,0
3d vision,0,0,0,0,0,0,0,0,0,0
sci-fi,0,0,0,0,0,0,0,0,0,0
thriller,0,0,0,0,0,0,0,0,0,0


In [31]:
def get_top_n_tags(app_id, n=5, td_matrix=tdMatrix_pd):
    """
    Returns a list of top n tags for a given app_id, sorted by count descending.
    """
    if app_id not in td_matrix.columns:
        raise ValueError(f"App ID {app_id} not found in the matrix.")
    
    # Extract the column for this app_id
    col = td_matrix[app_id]
    
    # Create list of (tag, count)
    tag_counts = list(zip(td_matrix.index, col))
    
    # Sort descending by count
    tag_counts.sort(key=lambda x: x[1], reverse=True)
    
    # Keep only tags with count > 0
    tag_counts = [tc for tc in tag_counts if tc[1] > 0]
    
    # Return only top n tags
    return tag_counts[:n]


app_id_example = 10
top_tags = get_top_n_tags(app_id_example, n=5)
print(top_tags)


[('shooter', 1432948), ('multiplayer', 889416), ('classic', 827651), ('action', 457061), ('competitive', 259413)]


In [32]:
actual_tags_df = game_reviews[['app_id', 'tags']].rename(columns={'tags': 'actual_tags'})
actual_tags_df['actual_tags'] = actual_tags_df['actual_tags'].apply(lambda tag_list: [t.lower() for t in tag_list])
print(actual_tags_df.shape)
print(actual_tags_df.head())

(10, 2)
   app_id                                        actual_tags
0      10  [action, first-person, shooter, multiplayer, s...
1      20  [action, first-person, shooter, multiplayer, c...
2      30  [first-person, shooter, world war ii, multipla...
3      40  [action, first-person, shooter, classic, multi...
4      50  [first-person, shooter, action, classic, sci-f...


In [33]:
def count_actual_tags(app_id):
    return len(actual_tags_df.loc[actual_tags_df['app_id'] == app_id, 'actual_tags'].values[0])

In [34]:
def get_top_tags_df(app_ids=None, n=5, td_matrix=tdMatrix_pd):
    """
    Returns a DataFrame with columns:
    - app_id
    - top_tags: list of top n tags for each app_id
    """
    if app_ids is None:
        app_ids = td_matrix.columns  # all app_ids by default
    
    data = []
    for app_id in app_ids:
        top_tags = get_top_n_tags(app_id, n=268)
        # Keep only the tag names, not counts
        top_tags_only = [tag for tag, count in top_tags]
        data.append({"app_id": app_id, "top_tags": top_tags_only})
    
    return pd.DataFrame(data)

# Just first 10 app_ids for demonstration
top_tags_df = get_top_tags_df(tdMatrix_pd.columns, n=20)
print(top_tags_df.shape)
print(top_tags_df)

(10, 2)
   app_id                                           top_tags
0      10  [shooter, multiplayer, classic, action, compet...
1      20  [classic, multiplayer, action, shooter, mod, r...
2      30  [war, shooter, classic, multiplayer, action, m...
3      40  [classic, shooter, multiplayer, arcade, mod, g...
4      50  [multiplayer, classic, shooter, immersive, act...
5      60  [realistic, masterpiece, competitive, multipla...
6      70  [shooter, classic, action, multiplayer, aliens...
7      80  [multiplayer, singleplayer, shooter, classic, ...
8     130               [modern, short, aliens, multiplayer]
9     220  [action, shooter, physics, adventure, zombies,...


In [35]:
# Assume your DataFrames
pred_df = top_tags_df        # columns: 'app_id', 'top_tags'
true_df = actual_tags_df     # columns: 'app_id', 'actual_tags'

# Merge on app_id to align
df = pred_df.merge(true_df, on='app_id')

mlb = MultiLabelBinarizer(classes=sorted(all_tags))

# Transform predicted and actual tags into binary indicator matrices
y_pred = mlb.fit_transform(df['top_tags'])
y_true = mlb.fit_transform(df['actual_tags']) 


In [36]:
precision = precision_score(y_true, y_pred, average='micro')  # micro: aggregate TP/FP/FN
recall = recall_score(y_true, y_pred, average='micro')
f1 = f1_score(y_true, y_pred, average='micro')
accuracy = accuracy_score(y_true, y_pred)  # exact match for all tags per row

print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")


Precision: 0.277
Recall:    0.394
F1-score:  0.325
Accuracy:  0.000


In [37]:
def find_top_x(li, x=10, print_output=True):
    flat = np.array([item for sublist in li for item in sublist])

    # Get unique tags and counts
    unique, counts = np.unique(flat, return_counts=True)

    # Sort tags by descending frequency
    sorted_indices = np.argsort(-counts)
    most_common_items = unique[sorted_indices]
    most_common_counts = counts[sorted_indices]

    # Print top 10 tags
    if print_output:
        print("Top tags:")
        for item, count in zip(most_common_items[:x], most_common_counts[:x]):
            print(f"{item}: {count}")
    return most_common_items[:x], most_common_counts[:x]

X=35
top_pred = find_top_x(top_tags_df['top_tags'], x=X, print_output=False)
top_act = find_top_x(actual_tags_df['actual_tags'], x=X, print_output=False)

print(len(top_tags_df['top_tags']), len(actual_tags_df['actual_tags']))
df_compare = pd.DataFrame({
    'pred_Tag': [t for t in top_pred[0]],
    'pred_count': [s for s in top_pred[1]],
    'act_Tag': [t for t in top_act[0]],
    'act_count': [s for s in top_act[1]]
})

music_counts = df_compare.loc[df_compare['act_Tag'] == 'music', 'act_count'].values
total_music = music_counts.sum()  
print(total_music)

for i in actual_tags_df['actual_tags']:
    if i == 'music':
        print(i)

df_compare.head(20)

10 10
0


Unnamed: 0,pred_Tag,pred_count,act_Tag,act_count
0,multiplayer,10,shooter,20
1,mod,9,first-person,20
2,classic,9,action,11
3,shooter,9,classic,10
4,action,8,multiplayer,8
5,realistic,7,singleplayer,7
6,short,7,retro,7
7,addictive,7,sci-fi,6
8,singleplayer,6,old school,6
9,modern,6,adventure,6


In [38]:
count=0

for i in zip(pred_df['top_tags'], true_df['actual_tags']):
    if len (i[0]) != len(i[1]):
        if len(i[0]) > len(i[1]):
            count+=1

print(count)

4
