Imports

In [None]:
import ast
import os
import re
import string
import time

import nltk
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, TFBertForSequenceClassification, BertTokenizer
from transformers import create_optimizer

# Fetch the NLTK resources used by word_tokenize, WordNetLemmatizer and stopwords.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' rather than 'punkt'.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Steam Web API key. Read from the STEAM_API_KEY environment variable so the
# real key never has to be stored in the notebook; the placeholder below is
# used when the variable is not set (put your Steam API key there if needed).
API_KEY = os.environ.get("STEAM_API_KEY", "API_KEY")
# Upper bound on the number of games collected when the dataset is built.
MAX_GAMES = 10000


Functions to get Data from Steam API

In [2]:
# Shared WordNet lemmatizer instance used by enhanced_preprocess_description.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop words, stored as a set for membership tests.
custom_stop_words = set(stopwords.words('english'))
# NOTE(review): not referenced anywhere else in the visible notebook — possibly a leftover.
app_ids_collected = set()

def get_all_game_ids(timeout=30):
    """Return the app ids listed by Steam's ISteamApps/GetAppList endpoint.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with the ``appid`` of every entry in the response, or an empty
        list when the request fails, the status code is not 200, or the
        payload does not have the expected ``applist -> apps`` structure.
    """
    url = "http://api.steampowered.com/ISteamApps/GetAppList/v2"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Same "no ids" result the function already uses for a bad status code.
        return []
    if response.status_code != 200:
        return []
    try:
        apps = response.json()['applist']['apps']
        return [app['appid'] for app in apps]
    except (ValueError, KeyError, TypeError):
        # Body was not JSON or did not have the expected layout.
        return []

# Genres of interest: a game is only kept when it has at least one of them.
genres_choice = {'Action', 'Casual', 'Adventure', 'RPG', 'Simulation', 'Strategy', 'Sports'}

def get_game_details(app_id, timeout=30):
    """Fetch the detailed description and genres of one Steam app.

    Args:
        app_id: Steam application id to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(description, genres)`` where ``genres`` is the set of all genre
        descriptions reported for the app, provided at least one of them is in
        ``genres_choice``. ``('', set())`` is returned when the request fails,
        the status code is not 200, the app has no ``data`` entry, or none of
        its genres is in ``genres_choice``.
    """
    # The API key travels in the query string, so use HTTPS rather than plain
    # HTTP, and let requests build/escape the query parameters.
    url = 'https://store.steampowered.com/api/appdetails'
    try:
        response = requests.get(
            url,
            params={'appids': app_id, 'key': API_KEY},
            timeout=timeout,
        )
        if response.status_code != 200:
            return '', set()
        payload = response.json()
        key = str(app_id)
        if key not in payload or 'data' not in payload[key]:
            return '', set()
        details = payload[key]['data']
        genres = {genre['description'] for genre in details.get('genres', [])}
        if not genres.intersection(genres_choice):
            return '', set()
        return details.get('detailed_description', ''), genres
    except Exception:
        # Deliberate best effort: one bad app must not abort the whole crawl.
        return '', set()


Functions to preprocess data

In [3]:
def clean_description(description):
    """Strip HTML markup from ``description`` and return its text content.

    The input is parsed with BeautifulSoup's ``html.parser``; the extracted
    text fragments are joined with a single space.
    """
    plain_text = BeautifulSoup(description, 'html.parser').get_text(separator=' ')
    return plain_text
def enhanced_preprocess_description(description):
    """Normalise a raw description into a space-separated string of lemmas.

    Steps, in order: lowercase; drop HTML tags, the stand-alone tokens
    "rt"/"cc", URLs, hashtags and @mentions; strip punctuation; replace every
    remaining non-letter with a space; tokenise; remove English stop words;
    lemmatise the remaining tokens.

    Args:
        description: Text to clean. A list is joined with spaces first.

    Returns:
        The cleaned text, or ``""`` when ``description`` is neither a string
        nor a list (for example a NaN from a DataFrame).
    """
    # Handle lists before the string guard; checked afterwards, this branch
    # could never be reached.
    if isinstance(description, list):
        description = ' '.join(str(part) for part in description)
    if not isinstance(description, str):
        return ""
    description = description.lower()
    description = re.sub(r'<[^<]+?>', '', description)
    # The text is already lowercase, so match "rt"/"cc" in lowercase and only
    # as whole words; an unanchored "cc" would cut words such as "success".
    description = re.sub(r'\b(?:rt|cc)\b', ' ', description)
    description = re.sub(r'http\S+\s*', ' ', description)
    description = re.sub(r'#\S+', '', description)
    description = re.sub(r'@\S+', '  ', description)
    description = re.sub(r'\s+', ' ', description).strip()
    # Punctuation is deleted (not replaced by a space), so "don't" -> "dont".
    description = description.translate(str.maketrans('', '', string.punctuation))
    # Only ASCII letters and spaces remain after this substitution.
    description = re.sub(r"[^a-zA-Z]", " ", description)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(description)
        if word not in custom_stop_words
    ]
    return re.sub(r'\s+', ' ', ' '.join(tokens)).strip()

Creation of the dataframe

In [4]:
# Load the cached dataset when it exists; otherwise crawl the Steam API
# (one request per second) and cache the result for later runs.
csv_filename = 'game_data.csv'
if os.path.isfile(csv_filename):
    df = pd.read_csv(csv_filename, engine="python")
else:
    app_ids = get_all_game_ids()
    game_data = {'Description': [], 'Genres': []}
    count = 0

    for app_id in app_ids:
        if count >= MAX_GAMES:
            break
        description, genres = get_game_details(app_id)
        if description:
            game_data['Description'].append(clean_description(description))
            game_data['Genres'].append(list(genres))
            count += 1
            if count % 100 == 0:
                print(f"{count} jeux recup")
        # Throttle between requests.
        time.sleep(1)

    df = pd.DataFrame(game_data)
    if df.empty:
        # Do not cache an empty result: the file would make every later run
        # skip the download and work on an empty dataset.
        print(f"No game data retrieved; '{csv_filename}' was not written.")
    else:
        df.to_csv(csv_filename, index=False)

# NOTE(review): df_bis is a second name for the same DataFrame object, not a
# copy, and is not used elsewhere in the visible notebook.
df_bis = df

Filter genres

In [None]:
# Genres kept as classification targets.
genres_choice = ['Action', 'Casual', 'Adventure', 'RPG', 'Simulation', 'Strategy', 'Sports']

def filtrer_genres(genres):
    """Keep only the genres that appear in ``genres_choice``.

    Args:
        genres: Either a list of genre names, or the string form of such a
            list (e.g. ``"['Action', 'Indie']"``), which is split on ``', '``
            after removing the brackets and single quotes.

    Returns:
        The retained genre names in their original order; an empty list for
        any other input type.
    """
    if isinstance(genres, str):
        candidates = genres.strip("[]").replace("'", "").split(', ')
    elif isinstance(genres, list):
        candidates = genres
    else:
        return []
    return [name for name in candidates if name in genres_choice]
    
# Replace each game's genres with the subset found in genres_choice.
df['Genres'] = df['Genres'].apply(filtrer_genres)

Random Forest Classifier

In [None]:
# Multi-hot encode the genre lists and preprocess every description.
mlbRF = MultiLabelBinarizer()
yRF = mlbRF.fit_transform(df['Genres'])
xRF = [enhanced_preprocess_description(desc) for desc in df['Description']]

X_trainRF, X_testRF, y_trainRF, y_testRF = train_test_split(xRF, yRF, test_size=0.2, random_state=42)
# The TF-IDF vocabulary is fitted on the training texts only and then applied
# to the test texts.
vectorizerRF = TfidfVectorizer()
X_train_tfidfRF = vectorizerRF.fit_transform(X_trainRF)
X_test_tfidfRF = vectorizerRF.transform(X_testRF)
# Seed the forest with the same value as the split so that re-running the
# notebook reproduces the reported scores.
clfRF = RandomForestClassifier(random_state=42)
clfRF.fit(X_train_tfidfRF, y_trainRF)

y_predRF = clfRF.predict(X_test_tfidfRF)
accuracy_rf = accuracy_score(y_testRF, y_predRF)
recall_rf = recall_score(y_testRF, y_predRF, average='weighted')
f1_rf = f1_score(y_testRF, y_predRF, average='weighted')

print("Accuracy:", accuracy_rf)
print("Recall:", recall_rf)
print("F1-Score:", f1_rf)
class_namesRF = mlbRF.classes_
# Dict form is kept in rf_report; the text form is printed.
rf_report = classification_report(y_testRF, y_predRF, target_names=class_namesRF, output_dict=True)
print("Classification Report:\n", classification_report(y_testRF, y_predRF, target_names=class_namesRF))

BERT with Transformers library

In [None]:
# Multi-hot genre targets and preprocessed descriptions for the BERT model.
mlbBERT = MultiLabelBinarizer()
yBERT = mlbBERT.fit_transform(df['Genres'])
xBERT = list(map(enhanced_preprocess_description, df['Description']))
# 80/20 split with a fixed seed.
X_trainBERT, X_testBERT, y_trainBERT, y_testBERT = train_test_split(
    xBERT,
    yBERT,
    test_size=0.2,
    random_state=42,
)
tokenizerBERT = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_textsBERT(tokenizer, texts, max_length=256):
    """Tokenise ``texts`` one by one into padded/truncated id and mask tensors.

    Args:
        tokenizer: Object providing ``encode_plus`` (here a BERT tokenizer).
        texts: Iterable of strings to encode.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``(input_ids, attention_masks)`` as two tensors built from the
        per-text encodings, in the order of ``texts``.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf',
        )
        for text in texts
    ]
    # Each encoding carries a leading batch dimension; keep its first row.
    ids = [encoding['input_ids'][0] for encoding in encodings]
    masks = [encoding['attention_mask'][0] for encoding in encodings]
    return tf.convert_to_tensor(ids), tf.convert_to_tensor(masks)

# Tokenise both splits and load BERT with one output logit per genre.
X_train_idsBERT, X_train_masksBERT = encode_textsBERT(tokenizerBERT, X_trainBERT)
X_test_idsBERT, X_test_masksBERT = encode_textsBERT(tokenizerBERT, X_testBERT)
modelBERT = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=yBERT.shape[1])

# Single source of truth for the values shared by the LR schedule and fit().
batch_sizeBERT = 32
epochsBERT = 3
# Round the steps per epoch up: Keras also trains on the final partial batch,
# so floor division would make the decay schedule end before training does.
steps_per_epochBERT = (len(X_trainBERT) + batch_sizeBERT - 1) // batch_sizeBERT
num_train_stepsBERT = steps_per_epochBERT * epochsBERT
optimizerBERT, lr_scheduleBERT = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_stepsBERT,
    num_warmup_steps=0,
    weight_decay_rate=0.01
)
# Multi-label setup: independent binary cross-entropy on raw logits.
lossBERT = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# NOTE(review): the 'accuracy' shown by Keras here may not be a meaningful
# multi-label score on raw logits — confirm which metric it resolves to.
modelBERT.compile(optimizer=optimizerBERT, loss=lossBERT, metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data during training.
modelBERT.fit(
    [X_train_idsBERT, X_train_masksBERT],
    y_trainBERT,
    batch_size=batch_sizeBERT,
    epochs=epochsBERT,
    validation_data=([X_test_idsBERT, X_test_masksBERT], y_testBERT)
)

modelBERT.evaluate([X_test_idsBERT, X_test_masksBERT], y_testBERT)
# The model and the tokenizer are saved to two different directories.
model_save_path = 'bert_model'
modelBERT.save_pretrained(model_save_path)
tokenizer_save_path = 'bert_tokenizer'
tokenizerBERT.save_pretrained(tokenizer_save_path)


In [None]:
# Predict on the test split; a genre is assigned when sigmoid(logit) > 0.5.
y_predBERT = modelBERT.predict([X_test_idsBERT, X_test_masksBERT])
y_pred_bert = (tf.sigmoid(y_predBERT.logits) > 0.5).numpy().astype(int)
class_namesBERT = mlbBERT.classes_
# Dict form is kept in bert_report; the text form is printed.
bert_report = classification_report(
    y_testBERT, y_pred_bert, target_names=class_namesBERT, output_dict=True
)
print("Classification Report:\n", classification_report(y_testBERT, y_pred_bert, target_names=class_namesBERT))

# Same thresholded predictions under the second name used below.
y_pred_labelsBERT = y_pred_bert.copy()
# Map the multi-hot rows back to tuples of genre names.
real_genresBERT = mlbBERT.inverse_transform(y_testBERT)
predicted_genresBERT = mlbBERT.inverse_transform(y_pred_labelsBERT)
results_dfBERT = pd.DataFrame(
    {
        'Description': X_testBERT,
        'Real_Genres': real_genresBERT,
        'Predicted_Genres': predicted_genresBERT,
    }
)

def tuple_to_list(column):
    """Return a new list in which every item of ``column`` is converted to a list."""
    return list(map(list, column))

# Store the genre tuples as lists, then write the comparison table to disk.
for genre_column in ('Real_Genres', 'Predicted_Genres'):
    results_dfBERT[genre_column] = tuple_to_list(results_dfBERT[genre_column])
results_dfBERT.to_csv('predicted_genres.csv', index=False)

In [None]:

# Reload the saved BERT predictions. The genre columns hold the text form of
# Python lists; ast.literal_eval parses such literals without executing
# arbitrary code, unlike eval.
results_df_testbert = pd.read_csv(
    'predicted_genres.csv',
    converters={'Real_Genres': ast.literal_eval, 'Predicted_Genres': ast.literal_eval},
)

# The binarizer is fitted on the real genres and then applied to the predictions.
mlb_testbert = MultiLabelBinarizer()
real_labels_testbert = mlb_testbert.fit_transform(results_df_testbert['Real_Genres'])
predicted_labels_testbert = mlb_testbert.transform(results_df_testbert['Predicted_Genres'])

accuracy_bert = accuracy_score(real_labels_testbert, predicted_labels_testbert)
recall_bert = recall_score(real_labels_testbert, predicted_labels_testbert, average='weighted')
f1_bert = f1_score(real_labels_testbert, predicted_labels_testbert, average='weighted')

print("Accuracy:", accuracy_bert)
print("Recall:", recall_bert)
print("F1 Score:", f1_bert)

RoBERTa

In [None]:
# Multi-hot genre targets and preprocessed descriptions for the RoBERTa model.
mlbROBERTA = MultiLabelBinarizer()
yROBERTA = mlbROBERTA.fit_transform(df['Genres'])
xROBERTA = list(map(enhanced_preprocess_description, df['Description']))
# 80/20 split with a fixed seed.
X_trainROBERTA, X_testROBERTA, y_trainROBERTA, y_testROBERTA = train_test_split(
    xROBERTA,
    yROBERTA,
    test_size=0.2,
    random_state=42,
)
tokenizerROBERTA = RobertaTokenizer.from_pretrained('roberta-base')

def encode_textsROBERTA(tokenizer, texts, max_length=256):
    """Tokenise ``texts`` one by one into padded/truncated id and mask tensors.

    Args:
        tokenizer: Object providing ``encode_plus`` (here a RoBERTa tokenizer).
        texts: Iterable of strings to encode.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``(input_ids, attention_masks)`` as two tensors built from the
        per-text encodings, in the order of ``texts``.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',
    )
    ids, masks = [], []
    for text in texts:
        encoding = tokenizer.encode_plus(text, **encode_options)
        # Each encoding carries a leading batch dimension; keep its first row.
        ids.append(encoding['input_ids'][0])
        masks.append(encoding['attention_mask'][0])
    return tf.convert_to_tensor(ids), tf.convert_to_tensor(masks)

# Tokenise both splits and load RoBERTa with one output logit per genre.
X_train_idsROBERTA, X_train_masksROBERTA = encode_textsROBERTA(tokenizerROBERTA, X_trainROBERTA)
X_test_idsROBERTA, X_test_masksROBERTA = encode_textsROBERTA(tokenizerROBERTA, X_testROBERTA)

modelROBERTA = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=yROBERTA.shape[1])

# Single source of truth for the values shared by the LR schedule and fit().
batch_sizeROBERTA = 32
epochsROBERTA = 3
# Round the steps per epoch up: Keras also trains on the final partial batch,
# so floor division would make the decay schedule end before training does.
steps_per_epochROBERTA = (len(X_trainROBERTA) + batch_sizeROBERTA - 1) // batch_sizeROBERTA

optimizerROBERTA, lr_scheduleROBERTA = create_optimizer(
    init_lr=5e-5,
    num_train_steps=steps_per_epochROBERTA * epochsROBERTA,
    num_warmup_steps=0,
    weight_decay_rate=0.01
)

# Multi-label setup: independent binary cross-entropy on raw logits.
lossROBERTA = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# NOTE(review): the 'accuracy' shown by Keras here may not be a meaningful
# multi-label score on raw logits — confirm which metric it resolves to.
modelROBERTA.compile(optimizer=optimizerROBERTA, loss=lossROBERTA, metrics=['accuracy'])


# NOTE(review): the test split doubles as validation data during training.
modelROBERTA.fit(
    [X_train_idsROBERTA, X_train_masksROBERTA],
    y_trainROBERTA,
    batch_size=batch_sizeROBERTA,
    epochs=epochsROBERTA,
    validation_data=([X_test_idsROBERTA, X_test_masksROBERTA], y_testROBERTA)
)

modelROBERTA.evaluate([X_test_idsROBERTA, X_test_masksROBERTA], y_testROBERTA)
# Predict on the test split; a genre is assigned when sigmoid(logit) > 0.5.
y_predROBERTA = modelROBERTA.predict([X_test_idsROBERTA, X_test_masksROBERTA])
y_pred_labelsROBERTA = (tf.sigmoid(y_predROBERTA.logits) > 0.5).numpy().astype(int)
# Map the multi-hot rows back to tuples of genre names.
real_genresROBERTA = mlbROBERTA.inverse_transform(y_testROBERTA)
predicted_genresROBERTA = mlbROBERTA.inverse_transform(y_pred_labelsROBERTA)


# Side-by-side table of real and predicted genres for each test description.
results_dfROBERTA = pd.DataFrame(
    {
        'Description': X_testROBERTA,
        'Real_Genres': real_genresROBERTA,
        'Predicted_Genres': predicted_genresROBERTA,
    }
)


def tuple_to_list(column):
    """Return a new list in which every item of ``column`` is converted to a list."""
    converted = []
    for entry in column:
        converted.append(list(entry))
    return converted

# Store the genre tuples as lists, then write the comparison table to disk.
for genre_column in ('Real_Genres', 'Predicted_Genres'):
    results_dfROBERTA[genre_column] = tuple_to_list(results_dfROBERTA[genre_column])
results_dfROBERTA.to_csv('predicted_genres_Roberta.csv', index=False)
# The model and its tokenizer are saved into the same directory.
save_directory = "roberta_model"
modelROBERTA.save_pretrained(save_directory)
tokenizerROBERTA.save_pretrained(save_directory)


Neural Network

In [None]:
# Make sure every description is a string (missing values become '').
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is a chained in-place update that does not reach df when
# pandas Copy-on-Write is active.
df['Description'] = df['Description'].fillna('').apply(str)

mlbNN = MultiLabelBinarizer()
yNN= mlbNN.fit_transform(df['Genres'])
# NOTE(review): TF-IDF is fitted on all rows before the train/test split
# below, so the test descriptions also contribute to the vocabulary/weights.
vectorizerNN = TfidfVectorizer(max_features=5000)
XNN = vectorizerNN.fit_transform(df['Description']).toarray()

X_trainNN, X_testNN, y_trainNN, y_testNN = train_test_split(XNN, yNN, test_size=0.2, random_state=42)

# Two hidden layers with dropout; one sigmoid output per genre (multi-label).
modelNN = Sequential()
modelNN.add(Dense(512, input_shape=(X_trainNN.shape[1],), activation='relu'))
modelNN.add(Dropout(0.5))
modelNN.add(Dense(256, activation='relu'))
modelNN.add(Dropout(0.5))
modelNN.add(Dense(y_trainNN.shape[1], activation='sigmoid'))

# `learning_rate` is the supported argument name; `lr` is deprecated and is
# rejected by recent Keras optimizers.
modelNN.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
modelNN.fit(X_trainNN, y_trainNN, epochs=5, batch_size=64, validation_data=(X_testNN, y_testNN))
scoreNN = modelNN.evaluate(X_testNN, y_testNN, verbose=0)

# A genre is predicted when its probability exceeds 0.5.
y_pred_probNN = modelNN.predict(X_testNN)
y_predNN = np.where(y_pred_probNN > 0.5, 1, 0)

accuracy_nn = accuracy_score(y_testNN, y_predNN)
recall_nn = recall_score(y_testNN, y_predNN, average='weighted')
f1_nn = f1_score(y_testNN, y_predNN, average='weighted')

print("Accuracy:", accuracy_nn)
print("Recall:", recall_nn)
print("F1 Score:", f1_nn)

# Dict form is kept in nn_report; the text form is printed.
nn_report = classification_report(y_testNN, y_predNN, target_names=mlbNN.classes_, output_dict=True)
print("Classification Report:\n", classification_report(y_testNN, y_predNN, target_names=mlbNN.classes_))

BERT and RoBERTa with Kaggle Dataset

In [9]:
# Path to the Kaggle Steam games CSV, which must be downloaded manually
# beforehand (URL in the trailing comment below).
file_path = 'games.csv' #Download from https://www.kaggle.com/datasets/fronkongames/steam-games-dataset/data
# Load the whole file with pandas' default parser settings.
df_kaggle = pd.read_csv(file_path)
# Genres kept as targets for the Kaggle data.
genres_choice_kaggle = ['Action', 'Casual', 'Adventure', 'RPG', 'Simulation', 'Strategy', 'Sports']
def filtrer_genres(genres):
    """Keep the genres of a comma-separated string that are in ``genres_choice_kaggle``.

    NOTE: this redefines the earlier ``filtrer_genres`` (which parsed the
    text form of a list) for the comma-separated Kaggle format.

    Args:
        genres: Comma-separated genre names; surrounding whitespace of each
            name is ignored.

    Returns:
        The retained, stripped genre names in their original order; an empty
        list when ``genres`` is not a string or is the empty string.
    """
    if not isinstance(genres, str) or genres == "":
        return []
    stripped_names = (part.strip() for part in genres.split(','))
    return [name for name in stripped_names if name in genres_choice_kaggle]
    
# Preprocess the 'About the game' text into 'Description', restrict the
# genres to genres_choice_kaggle, and keep only those two columns.
df_kaggle['Description'] = df_kaggle['About the game'].apply(enhanced_preprocess_description)
df_kaggle['Genres'] = df_kaggle['Genres'].apply(filtrer_genres)
df_kaggle = df_kaggle[['Description', 'Genres']]

In [None]:
# Reload the fine-tuned models and tokenizers saved by the training cells.
# RoBERTa: model and tokenizer were both saved to 'roberta_model'.
loaded_tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta_model')
loaded_model_roberta = TFRobertaForSequenceClassification.from_pretrained('roberta_model')
# BERT: the model was saved to 'bert_model' but the tokenizer was saved to
# 'bert_tokenizer', so the tokenizer must be loaded from that directory.
loaded_model_bert = TFBertForSequenceClassification.from_pretrained('bert_model')
loaded_tokenizer_bert = BertTokenizer.from_pretrained('bert_tokenizer')

In [None]:
# df_kaggle['Description'] was already passed through
# enhanced_preprocess_description when the Kaggle frame was prepared, so it
# is used as-is; a second pass would be redundant work and would differ from
# the single pass applied to the training texts.
X_new_roberta = df_kaggle['Description'].tolist()
X_new_encoded_roberta = loaded_tokenizer_roberta(X_new_roberta, padding=True, truncation=True, max_length=256, return_tensors='tf')
predictions_roberta = loaded_model_roberta.predict({'input_ids': X_new_encoded_roberta['input_ids'], 'attention_mask': X_new_encoded_roberta['attention_mask']})
# A genre is predicted when sigmoid(logit) > 0.5.
y_pred_roberta = (tf.sigmoid(predictions_roberta.logits) > 0.5).numpy().astype(int)

In [None]:
# Pin the label columns to the seven target genres in sorted order (the order
# MultiLabelBinarizer uses by default), so y_true_kaggle always has one
# column per genre even if a genre never occurs in the Kaggle data.
# NOTE(review): assumes the saved models were trained with all seven genres
# present, so their outputs follow the same column order — TODO confirm.
mlb_kaggle = MultiLabelBinarizer(classes=sorted(genres_choice_kaggle))
y_true_kaggle = mlb_kaggle.fit_transform(df_kaggle['Genres'])
accuracy_roberta = accuracy_score(y_true_kaggle, y_pred_roberta)
precision_roberta = precision_score(y_true_kaggle, y_pred_roberta, average='weighted')
recall_roberta = recall_score(y_true_kaggle, y_pred_roberta, average='weighted')
f1_roberta = f1_score(y_true_kaggle, y_pred_roberta, average='weighted')
print(f"Accuracy: {accuracy_roberta}")
print(f"Precision: {precision_roberta}")
print(f"Recall: {recall_roberta}")
print(f"F1-Score: {f1_roberta}")

In [None]:
# df_kaggle['Description'] was already passed through
# enhanced_preprocess_description when the Kaggle frame was prepared, so it
# is used as-is; a second pass would be redundant work and would differ from
# the single pass applied to the training texts.
X_new_bert = df_kaggle['Description'].tolist()
X_new_encoded_bert = loaded_tokenizer_bert(X_new_bert, padding=True, truncation=True, max_length=256, return_tensors='tf')
predictions_bert = loaded_model_bert.predict({'input_ids': X_new_encoded_bert['input_ids'], 'attention_mask': X_new_encoded_bert['attention_mask']})
# A genre is predicted when sigmoid(logit) > 0.5.
y_pred_bert = (tf.sigmoid(predictions_bert.logits) > 0.5).numpy().astype(int)

In [None]:
# Score the BERT predictions on the Kaggle data.
# NOTE: accuracy_bert, recall_bert and f1_bert reuse the names of the
# test-split metrics computed earlier in the notebook and overwrite them.
accuracy_bert = accuracy_score(y_true_kaggle, y_pred_bert)
precision_bert, recall_bert, f1_bert = (
    scorer(y_true_kaggle, y_pred_bert, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy", accuracy_bert),
    ("Precision", precision_bert),
    ("Recall", recall_bert),
    ("F1-Score", f1_bert),
):
    print(f"{metric_label}: {metric_value}")
