In [1]:
# Imports
import ast
import numpy as np
import pandas as pd
from collections import Counter

# Scikit-learn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam


In [2]:
games = pd.read_csv('../games.csv')

In [17]:


# genre strings into real lists
def parse_genres(genre_str):
    try:
        return ast.literal_eval(genre_str)
    except:
        return []

games['genres_list'] = games['genres'].apply(parse_genres)

# find unique genres
all_genres = [
    genre
    for sublist in games['genres_list']
    if isinstance(sublist, list)
    for genre in sublist
]

unique_genres = sorted(set(all_genres))
print("Number of unique genres:", len(unique_genres))
print("Some unique genres:", unique_genres[:10])

# full numbered list of genres
print("\nFull list of unique genres:")
for i, genre in enumerate(unique_genres, start=1):
    print(f"{i}. {genre}")


Number of unique genres: 16
Some unique genres: ['Action', 'Adventure', 'Animation & Modeling', 'Casual', 'Design & Illustration', 'Early Access', 'Free To Play', 'Indie', 'Massively Multiplayer', 'RPG']

Full list of unique genres:
1. Action
2. Adventure
3. Animation & Modeling
4. Casual
5. Design & Illustration
6. Early Access
7. Free To Play
8. Indie
9. Massively Multiplayer
10. RPG
11. Racing
12. Simulation
13. Sports
14. Strategy
15. Utilities
16. Video Production


In [18]:
# parse genres
import ast

def parse_genres(genre_str):
    try:
        if isinstance(genre_str, list):
            return genre_str
        return ast.literal_eval(genre_str)
    except Exception:
        return []

games['genres'] = games['genres'].apply(parse_genres)

# combine text fields
text_columns = ['name', 'short_description', 'about_the_game', 'detailed_description']
games['combined_text'] = games[text_columns].fillna('').agg(' '.join, axis=1)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(games['combined_text'])

# multilabelbinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(games['genres'])

# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# One-vs-Rest logistic regression
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, Y_train)

# evaluate
Y_pred = model.predict(X_test)

print("Micro F1 Score:", f1_score(Y_test, Y_pred, average='micro'))
print("Macro F1 Score:", f1_score(Y_test, Y_pred, average='macro'))
print("\nClassification Report:\n")
print(classification_report(Y_test, Y_pred, target_names=mlb.classes_))


Micro F1 Score: 0.7120410002252759
Macro F1 Score: 0.4335677536868626

Classification Report:

                       precision    recall  f1-score   support

               Action       0.80      0.75      0.77      5987
            Adventure       0.73      0.66      0.69      5454
 Animation & Modeling       0.00      0.00      0.00        17
               Casual       0.71      0.60      0.65      5574
Design & Illustration       1.00      0.06      0.12        16
         Early Access       0.70      0.14      0.23      1408
         Free To Play       0.58      0.04      0.08       669
                Indie       0.79      0.94      0.86      9860
Massively Multiplayer       0.74      0.13      0.23       277
                  RPG       0.81      0.48      0.61      2266
               Racing       0.89      0.51      0.65       557
           Simulation       0.77      0.46      0.58      2804
               Sports       0.86      0.39      0.54       657
             Strategy 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from tensorflow import keras
#from keras.models import Model
from keras.layers import Dense, Dropout
#from keras.optimizers import Adam
from keras import Sequential
#from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import numpy as np
import ast
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier


### NEURAL NETWORK

In [20]:
# parse genres
def parse_genres(genre_str):
    try:
        if isinstance(genre_str, list):
            return genre_str
        return ast.literal_eval(genre_str)
    except:
        return []

games['genres'] = games['genres'].apply(parse_genres)

# combine text fields
text_columns = ['name', 'short_description', 'about_the_game', 'detailed_description']
games['combined_text'] = games[text_columns].fillna('').agg(' '.join, axis=1)

# TF-IDF
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X = tfidf.fit_transform(games['combined_text']).toarray()

# multilabelbinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(games['genres'])

# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# neural network
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(Y_train.shape[1], activation='sigmoid')  # sigmoid = multi-label output
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# =train
history = model.fit(
    X_train, Y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    verbose=1
)

# analysis
Y_pred_probs = model.predict(X_test)
Y_pred = (Y_pred_probs > 0.5).astype(int)

print("Micro F1:", f1_score(Y_test, Y_pred, average='micro'))
print("Macro F1:", f1_score(Y_test, Y_pred, average='macro'))
print("\nClassification Report:\n")
print(classification_report(Y_test, Y_pred, target_names=mlb.classes_))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Micro F1: 0.7015938606847698
Macro F1: 0.505821287136677

Classification Report:

                       precision    recall  f1-score   support

               Action       0.75      0.79      0.77      5987
            Adventure       0.70      0.67      0.68      5454
 Animation & Modeling       0.67      0.24      0.35        17
               Casual       0.68      0.60      0.64      5574
Design & Illustration       0.50      0.19      0.27        16
         Early Access       0.41      0.24      0.30      1408
         Free To Play       0.37      0.11      0.17       669
                Indie       0.80      0.90      0.84      9860
Massively Multiplayer       0.62      0.20      0.30       277
                  RPG       0.71      0.57      0.63      2266
               Racing       0.80      0.59      0.68       557
           Simulation       0.62      0.57      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### MODEL - SVM - CURRENT BEST MODEL

In [21]:
svm_model = OneVsRestClassifier(LinearSVC())
svm_model.fit(X_train, Y_train)

svm_pred = svm_model.predict(X_test)

print("SVM Micro F1:", f1_score(Y_test, svm_pred, average='micro'))
print("SVM Macro F1:", f1_score(Y_test, svm_pred, average='macro'))
print("\nSVM Classification Report:\n")
print(classification_report(Y_test, svm_pred, target_names=mlb.classes_))


SVM Micro F1: 0.7085465675679444
SVM Macro F1: 0.5321544780828762

SVM Classification Report:

                       precision    recall  f1-score   support

               Action       0.79      0.75      0.77      5987
            Adventure       0.72      0.66      0.69      5454
 Animation & Modeling       0.78      0.41      0.54        17
               Casual       0.69      0.60      0.64      5574
Design & Illustration       0.60      0.19      0.29        16
         Early Access       0.67      0.15      0.24      1408
         Free To Play       0.54      0.04      0.08       669
                Indie       0.79      0.93      0.85      9860
Massively Multiplayer       0.59      0.19      0.29       277
                  RPG       0.79      0.50      0.62      2266
               Racing       0.87      0.55      0.67       557
           Simulation       0.74      0.48      0.58      2804
               Sports       0.81      0.42      0.55       657
             Strategy 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### MODEL - DECISION TREES

In [22]:
# Less Depth
fast_tree = OneVsRestClassifier(
    DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
)
fast_tree.fit(X_train, Y_train)

fast_tree_pred = fast_tree.predict(X_test)

print("Decision Tree (fast) Micro F1:", f1_score(Y_test, fast_tree_pred, average='micro'))
print("Decision Tree (fast) Macro F1:", f1_score(Y_test, fast_tree_pred, average='macro'))


Decision Tree (fast) Micro F1: 0.6548783439399763
Decision Tree (fast) Macro F1: 0.412658780156853


In [23]:
# More trees
extra_trees = OneVsRestClassifier(
    ExtraTreesClassifier(n_estimators=100, max_depth=15, n_jobs=-1, random_state=42)
)
extra_trees.fit(X_train, Y_train)

et_pred = extra_trees.predict(X_test)

print("Extra Trees Micro F1:", f1_score(Y_test, et_pred, average='micro'))
print("Extra Trees Macro F1:", f1_score(Y_test, et_pred, average='macro'))


Extra Trees Micro F1: 0.5038216336162868
Extra Trees Macro F1: 0.1358067857681201


### MODEL - RANDOM FORESTS

In [24]:
fast_rf = OneVsRestClassifier(
    RandomForestClassifier(
        n_estimators=100,       # fewer trees (for speed)
        max_depth=15,           # limit depth (for speed)
        min_samples_split=10,
        n_jobs=-1,
        random_state=42
    )
)
fast_rf.fit(X_train, Y_train)

rf_pred = fast_rf.predict(X_test)

print("Fast Random Forest Micro F1:", f1_score(Y_test, rf_pred, average='micro'))
print("Fast Random Forest Macro F1:", f1_score(Y_test, rf_pred, average='macro'))


Fast Random Forest Micro F1: 0.6195503390550513
Fast Random Forest Macro F1: 0.2550785583024753
