In [3]:
# Imports
import pandas as pd
import ast
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report
import numpy as np
from sklearn.linear_model import LogisticRegression


In [4]:
games = pd.read_csv('data/games.csv')

In [5]:
games.columns

Index(['appid', 'name', 'detailed_description', 'about_the_game',
       'short_description', 'recommendations', 'supported_languages',
       'categories', 'genres', 'positive', 'negative', 'tags'],
      dtype='object')

In [None]:


# genre strings into real lists
def parse_genres(genre_str):
    try:
        return ast.literal_eval(genre_str)
    except:
        return []

games['genres_list'] = games['genres'].apply(parse_genres)

# find unique genres
all_genres = [
    genre
    for sublist in games['genres_list']
    if isinstance(sublist, list)
    for genre in sublist
]

unique_genres = sorted(set(all_genres))
print("Number of unique genres:", len(unique_genres))
print("Some unique genres:", unique_genres[:10])

# full numbered list of genres
print("\nFull list of unique genres:")
for i, genre in enumerate(unique_genres, start=1):
    print(f"{i}. {genre}")


Number of unique genres: 24
Some unique genres: ['Action', 'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual', 'Design & Illustration', 'Early Access', 'Education', 'Free To Play', 'Game Development']

Full list of unique genres:
1. Action
2. Adventure
3. Animation & Modeling
4. Audio Production
5. Casual
6. Design & Illustration
7. Early Access
8. Education
9. Free To Play
10. Game Development
11. Gore
12. Indie
13. Massively Multiplayer
14. Photo Editing
15. RPG
16. Racing
17. Simulation
18. Software Training
19. Sports
20. Strategy
21. Utilities
22. Video Production
23. Violent
24. Web Publishing


In [None]:

# combine description cols
text_columns = ['detailed_description', 'about_the_game', 'short_description']
games['combined_text'] = games[text_columns].fillna('').agg(' '.join, axis=1)

# clean genre col
def safe_parse(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return []
    elif isinstance(x, list):
        return x
    else:
        return []

games['genres'] = games['genres'].apply(safe_parse)


# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(games['combined_text'])

# label encoding
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(games['genres'])

# split data into test/train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# multi-label model
base_model = LogisticRegression(max_iter=1000)
model = OneVsRestClassifier(base_model)
model.fit(X_train, y_train)

# evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


                       precision    recall  f1-score   support

               Action       0.80      0.75      0.77      6061
            Adventure       0.72      0.66      0.69      5430
 Animation & Modeling       1.00      0.11      0.20        36
     Audio Production       0.00      0.00      0.00        16
               Casual       0.70      0.60      0.65      5578
Design & Illustration       1.00      0.05      0.10        39
         Early Access       0.67      0.13      0.22      1451
            Education       0.00      0.00      0.00        30
         Free To Play       0.74      0.07      0.12       660
     Game Development       0.00      0.00      0.00        16
                 Gore       0.00      0.00      0.00        39
                Indie       0.79      0.94      0.86      9963
Massively Multiplayer       0.81      0.12      0.21       317
        Photo Editing       0.00      0.00      0.00         6
                  RPG       0.81      0.51      0.63  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# parse genres
import ast

def parse_genres(genre_str):
    try:
        if isinstance(genre_str, list):
            return genre_str
        return ast.literal_eval(genre_str)
    except Exception:
        return []

games['genres'] = games['genres'].apply(parse_genres)

# combine text fields
text_columns = ['name', 'short_description', 'about_the_game', 'detailed_description']
games['combined_text'] = games[text_columns].fillna('').agg(' '.join, axis=1)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(games['combined_text'])

# multilabelbinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(games['genres'])

# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# One-vs-Rest logistic regression
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, Y_train)

# evaluate
Y_pred = model.predict(X_test)

print("Micro F1 Score:", f1_score(Y_test, Y_pred, average='micro'))
print("Macro F1 Score:", f1_score(Y_test, Y_pred, average='macro'))
print("\nClassification Report:\n")
print(classification_report(Y_test, Y_pred, target_names=mlb.classes_))


Micro F1 Score: 0.7089011629520663
Macro F1 Score: 0.3022123555536979

Classification Report:

                       precision    recall  f1-score   support

               Action       0.80      0.75      0.77      6061
            Adventure       0.72      0.66      0.69      5430
 Animation & Modeling       1.00      0.11      0.20        36
     Audio Production       0.00      0.00      0.00        16
               Casual       0.70      0.60      0.65      5578
Design & Illustration       1.00      0.05      0.10        39
         Early Access       0.67      0.13      0.21      1451
            Education       0.00      0.00      0.00        30
         Free To Play       0.72      0.06      0.12       660
     Game Development       0.00      0.00      0.00        16
                 Gore       0.00      0.00      0.00        39
                Indie       0.79      0.94      0.86      9963
Massively Multiplayer       0.82      0.13      0.22       317
        Photo Editing 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from tensorflow import keras
from keras.layers import Dense, Dropout
from keras import Sequential
from keras.optimizers import Adam
import numpy as np
import ast


In [None]:
# parse genres
def parse_genres(genre_str):
    try:
        if isinstance(genre_str, list):
            return genre_str
        return ast.literal_eval(genre_str)
    except:
        return []

games['genres'] = games['genres'].apply(parse_genres)

# combine text fields
text_columns = ['name', 'short_description', 'about_the_game', 'detailed_description']
games['combined_text'] = games[text_columns].fillna('').agg(' '.join, axis=1)

# TF-IDF
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X = tfidf.fit_transform(games['combined_text']).toarray()

# multilabelbinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(games['genres'])

# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# neural network
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(Y_train.shape[1], activation='sigmoid')  # sigmoid = multi-label output
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# =train
history = model.fit(
    X_train, Y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    verbose=1
)

# analysis
Y_pred_probs = model.predict(X_test)
Y_pred = (Y_pred_probs > 0.5).astype(int)

print("Micro F1:", f1_score(Y_test, Y_pred, average='micro'))
print("Macro F1:", f1_score(Y_test, Y_pred, average='macro'))
print("\nClassification Report:\n")
print(classification_report(Y_test, Y_pred, target_names=mlb.classes_))


Epoch 1/10


2025-10-23 11:38:05.378670: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Micro F1: 0.6915680670922251
Macro F1: 0.37021062674682165

Classification Report:

                       precision    recall  f1-score   support

               Action       0.78      0.74      0.75      6061
            Adventure       0.68      0.70      0.69      5430
 Animation & Modeling       0.78      0.39      0.52        36
     Audio Production       0.00      0.00      0.00        16
               Casual       0.66      0.61      0.63      5578
Design & Illustration       0.47      0.46      0.47        39
         Early Access       0.45      0.22      0.29      1451
            Education       0.13      0.07      0.09        30
         Free To Play       0.43      0.15      0.22       660
     Game Development       1.00      0.06      0.12        16
                 Gore       0.00      0.00      0.00        39
                Indie       0.80      0.86      0.83      9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
