# Machine Learning 1

1. Import packages

In [135]:
#Used some code with the website: https://blog.jaysinha.me/
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from ast import literal_eval
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

2. Load csv

In [136]:
df = pd.read_csv('data_clean.csv')
df.drop(columns="Unnamed: 0", axis=1, inplace=True)
df["Genre"] = df["Genre"].apply(lambda x: x.strip("[]").replace("'","").split(", ")) # Genre Array
df.head()

Unnamed: 0,GameName,Description,MetaScore,UserScore,ReleaseYear,Multiplayer,Rating,Developer,Genre,Action,...,Horror,Open-World,Puzzle,Racing,Role-Playing,Sci-Fi,Shooter,Sports,Strategy,Survival
0,Disco Elysium: The Final Cut,Disco Elysium - The Final Cut is the definitiv...,97,82,2021,False,M,ZA/UM,[Role-Playing],0,...,0,0,0,0,1,0,0,0,0,0
1,Half-Life 2,[Metacritic's 2004 PC Game of the Year] By ta...,96,92,2004,False,M,Valve Software,"[Sci-Fi, Shooter, Action, Arcade]",1,...,0,0,0,0,0,1,1,0,0,0
2,Grand Theft Auto V,Los Santos: a sprawling sun-soaked metropolis ...,96,78,2015,True,M,Rockstar North,[Open-World],0,...,0,1,0,0,0,0,0,0,0,0
3,Out of the Park Baseball 2007,[Metacritic's 2007 PC Game of the Year] OOTP ...,96,26,2006,True,E,Sports Interactive,[Sports],0,...,0,0,0,0,0,0,0,1,0,0
4,The Orange Box,Games included in The Orange Box compilation: ...,96,91,2007,True,M,Valve Software,"[Sci-Fi, Shooter, Action]",1,...,0,0,0,0,0,1,1,0,0,0


3. Preprocessing

In [137]:
test_split = 0.1
train_df, test_df = train_test_split(df,test_size=test_split)
val_df = test_df.sample(frac=0.5)
print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 3753
Number of rows in validation set: 208
Number of rows in test set: 417


In [140]:
train_df["Description"].apply(lambda x: len(x.split(" "))).describe()

count    3753.000000
mean       84.121503
std        65.564509
min         3.000000
25%        40.000000
50%        63.000000
75%       112.000000
max       943.000000
Name: Description, dtype: float64

In [138]:
terms = tf.ragged.constant(train_df["Genre"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()

def invert_multi_hot(encoded_labels):
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)

print(vocab)

['[UNK]', 'Action', 'Strategy', 'Adventure', 'Role-Playing', 'Shooter', 'Arcade', 'Sci-Fi', 'Fantasy', 'Sports', 'Racing', 'Open-World', 'Puzzle', 'Survival', 'Horror', 'Fighting']


In [139]:
sample_label = train_df["Genre"].iloc[0]
print(f"Original representation: {sample_label}")

label_binarized = lookup([sample_label])
print(f"Binarized representation: {label_binarized}")

Original representation: ['Strategy']
Binarized representation: [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


4. Dataset Generator Function

In [141]:
max_seqlen = 150 #length of sequence
batch_size = 128 #number of training
padding_token = "<pad>"
auto = tf.data.AUTOTUNE
def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["Genre"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["Description"].values, label_binarized))
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

In [142]:
train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

In [143]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Description: {text}")
    print(f"Genre: {invert_multi_hot(label[0])}")
    print(" ")

Description: b"In Episode 5, on a tip from Ivor, Jesse and friends head to an abandoned temple holding mysterious treasures. Ambushed by former Ocelot Aiden and his crew, our heroes find themselves in an entirely new world: Sky City! As they explore, the Order finds that resources are at a premium, and nothing but 'The Void' exists below the known world. When Aiden convinces Sky City's ruler that the New Order of the Stone are up to no good, you'll need to make some tough choices to clear your name, and keep Aiden from destroying an innocent world."
Genre: ['Adventure']
 
Description: b'We call Wildermyth a myth-making tactical RPG. It empowers you to craft iconic characters who grow through deep, rewarding battles and interactive storytelling.\r'
Genre: ['Strategy' 'Role-Playing']
 
Description: b'Doggone tired of the typical brawler or tower defense that is all bark and no bite? Then join Alpha and T. Juan on a dynamic adventure that has you battling hordes of baddies and bosses on t

In [144]:
vocabulary = set()
train_df["Description"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

36297


5. The Model

In [146]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]
    )
    return shallow_mlp_model

6. Training

In [145]:
text_vectorizer = layers.TextVectorization(max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf")
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))
train_dataset = train_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
validation_dataset = validation_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

In [147]:
epochs = 20

shallow_mlp_model = make_model()
shallow_mlp_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"])

history = shallow_mlp_model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)
print(shallow_mlp_model.summary())

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 512)               18584576  
                                                                 
 dense_19 (Dense)            (None, 256)               131328    
                                                                 
 dense_20 (Dense)            (None, 16)                4112      
                                                                 
Total params: 18,720,016
Trainable params: 18,720,016
Non-trainable params: 0
_________________________________________________________________
None


7. Evaluation

In [148]:
_, categorical_acc = shallow_mlp_model.evaluate(test_dataset)
print(f"Accuracy: {round(categorical_acc * 100, 2)}%.")

Accuracy: 68.11%.


 8. Examples

In [153]:
model_for_inference = keras.Sequential([text_vectorizer, shallow_mlp_model])
inference_dataset = make_dataset(test_df.sample(100), is_train=False)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = model_for_inference.predict(text_batch)
for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Description: {text}")
    print(f"Genres: {invert_multi_hot(label[0])}")
    predicted_proba = [proba for proba in predicted_probabilities[i]]
    top_3_labels = [
        x
        for _, x in sorted(
            zip(predicted_probabilities[i], lookup.get_vocabulary()),
            key=lambda pair: pair[0],
            reverse=True,
        )
    ][:3]
    print(f"Predicted Genres: ({', '.join([label for label in top_3_labels])})")
    print(" ")

Description: b'GoD Factory: Wingmen is a multiplayer space combat game played through fast-paced and intensely tactical 4 vs 4 player matches.'
Genres: ['Sci-Fi']
Predicted Genres: (Action, Strategy, Sci-Fi)
 
Description: b'In 2013, the U.S. Army will implement the Integrated Warfighter System (IWS), evolving what we know as the modern soldier. IWS combines advanced weapon systems, satellite communication devices and enhanced survivability into one fully integrated combat system. The IWS program has been developed to meet these new threats head on. Now, it can be tested on the battlefield. Following an insurgence in the heart of Mexico City, the U.S. Army\'s most elite Special Forces team is deployed to the center of the conflict to regain control of the city. Greatly outnumbered but fully equipped with the IWS, this elite team is the first and last line of defense on the battlefield. They are the "Quiet" professionals. They are the Ghosts. Gain access to the future of military techno