# Text Classification Huggingface Transformers

## Setting Up Necessary Things

In [1]:
# Jupyter Notebook Magic Command - Auto Reloading
%reload_ext autoreload
%autoreload 2

# Jupyter Notebook Magic Command - Inline Plotting
%matplotlib inline

In [2]:
# Ignore All Warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# NVIDIA Status
! nvidia-smi

Thu Aug 17 06:19:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/Colab Notebooks/TC.Video.Game

/content/drive/MyDrive/Colab Notebooks/TC.Video.Game


## Necessary Imports

In [6]:
! pip install -q transformers[sentencepiece]==4.28.0 datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [52]:
import pandas as pd
import transformers

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import evaluate
import numpy as np

In [8]:
bs = 32
epochs = 7
lr = 3.75e-4

*URL to the model: [distilroberta-base](https://huggingface.co/distilroberta-base)*

In [9]:
model_name = "distilroberta-base"

## Data Cleaning

In [10]:
# Load Data Into DataFrame
df = pd.read_csv("data/raw_data/game_details.csv")
df.head()

Unnamed: 0,Name,Summary,Genres
0,The Legend of Zelda: Ocarina of Time,"As a young boy, Link is tricked by Ganondorf, ...","['Action Adventure', 'Fantasy']"
1,Tony Hawk's Pro Skater 2,As most major publishers' development efforts ...,"['Sports', 'Alternative', 'Skateboarding']"
2,Grand Theft Auto IV,[Metacritic's 2008 PS3 Game of the Year; Also ...,"['Action Adventure', 'Modern', 'Modern', 'Open..."
3,SoulCalibur,"This is a tale of souls and swords, transcendi...","['Action', 'Fighting', '3D']"
4,Grand Theft Auto IV,[Metacritic's 2008 Xbox 360 Game of the Year; ...,"['Action Adventure', 'Modern', 'Modern', 'Open..."


In [11]:
# Shape of the DataFrame
df.shape

(20406, 3)

### Total Number of Genres

In [12]:
genres_list = df["Genres"].to_list()    # list of all the genres
genres_count = {}                       # count of all the genres

for genres in genres_list:
    genre_list = eval(genres)

    for genre in genre_list:
        if genre in genres_count.keys():
            genres_count[genre] += 1
        else:
            genres_count[genre] = 1

In [13]:
print("Total Number of Genres: ", len(genres_count))

sorted_genres_count = dict(sorted(genres_count.items(), key=lambda item: item[1], reverse=True))
print("Individual Genres Count: \n", sorted_genres_count)

Total Number of Genres:  178
Individual Genres Count: 
 {'Action': 7097, 'General': 7035, 'Shooter': 3549, 'Action Adventure': 3424, 'Platformer': 2371, 'Arcade': 2355, 'Role-Playing': 2341, '3D': 2127, 'Adventure': 1995, '2D': 1979, 'Miscellaneous': 1927, 'Strategy': 1905, 'First-Person': 1875, 'Sci-Fi': 1779, 'Fantasy': 1729, 'Sports': 1727, 'Modern': 1569, 'Puzzle': 1376, 'Action RPG': 1300, 'Racing': 1239, 'Sim': 1162, 'Simulation': 1160, 'Third-Person': 1040, 'Traditional': 1015, 'Driving': 971, 'Historic': 915, 'Fighting': 838, 'Automobile': 782, 'Real-Time': 772, 'Turn-Based': 740, 'Open-World': 736, "Beat-'Em-Up": 713, 'Team': 619, 'Compilation': 546, 'Survival': 544, 'Individual': 450, "Shoot-'Em-Up": 433, 'Point-and-Click': 430, 'Horror': 419, 'Japanese-Style': 415, 'Music': 411, 'Tactical': 405, 'Tactics': 392, 'Combat': 382, 'Linear': 353, 'Console-style RPG': 346, 'Rhythm': 340, 'Management': 334, 'GT / Street': 283, 'Western-Style': 279, 'Soccer': 276, 'Scrolling': 264, '

### Using Only 1st Category of the Genres: Single-Label Classification

In [14]:
primary_genre_list = []    # only first genre
indices_to_drop = []       # empty genre indices drop

for idx, genres in enumerate(df["Genres"].to_list()):
    genres_list = eval(genres)

    if len(genres_list) != 0:
        primary_genre_list.append(genres_list[0])
    else:
        indices_to_drop.append(idx)

In [15]:
# Total Primary Genres
print("Total Primary Genres: ", len(primary_genre_list))
print(primary_genre_list)

# Total Indices to Drop
print("Total Indices to Drop: ", len(indices_to_drop))

Total Primary Genres:  19894
['Action Adventure', 'Sports', 'Action Adventure', 'Action', 'Action Adventure', 'Action Adventure', 'Sports', 'Action Adventure', 'Action', 'Action Adventure', 'Action Adventure', 'Sports', 'Action Adventure', 'Action', 'Action Adventure', 'Action Adventure', 'Sports', 'Action Adventure', 'Action', 'Action Adventure', 'Action', 'Action', 'Action Adventure', 'Action Adventure', 'Modern', 'Role-Playing', 'Modern', 'Sports', 'Action Adventure', 'Role-Playing', 'Sports', 'Action', 'Action Adventure', 'Action Adventure', 'Action', 'Action Adventure', 'Action', 'Action', 'Sports', 'Action', 'Modern', 'Fantasy', 'Action', 'Action', 'Action Adventure', 'Action Adventure', 'Action', 'Action', 'Action Adventure', 'Action', 'Role-Playing', 'Role-Playing', 'Adventure', 'Action Adventure', 'Role-Playing', 'Action', 'Role-Playing', 'Action Adventure', 'Action Adventure', 'Driving', 'Action Adventure', 'Action', 'Action Adventure', 'Miscellaneous', 'Miscellaneous', 'Role

In [16]:
# Drop Empty Genre Data
df = df.drop(indices_to_drop).reset_index(drop=True)

In [17]:
# Create a Column Named Primary Genre
df["Primary Genre"] = primary_genre_list

In [18]:
# Individual Primary Genre Values Count
print("Primary Genre Value Count: ")
df["Primary Genre"].value_counts()

Primary Genre Value Count: 


Action                6436
Action Adventure      2579
Role-Playing          2081
Sports                1620
Strategy              1608
                      ... 
Party / Minigame         1
Survival                 1
Street                   1
Dancing                  1
Online Multiplayer       1
Name: Primary Genre, Length: 70, dtype: int64

In [19]:
# After Inserting Primary Genre
df.head()

Unnamed: 0,Name,Summary,Genres,Primary Genre
0,The Legend of Zelda: Ocarina of Time,"As a young boy, Link is tricked by Ganondorf, ...","['Action Adventure', 'Fantasy']",Action Adventure
1,Tony Hawk's Pro Skater 2,As most major publishers' development efforts ...,"['Sports', 'Alternative', 'Skateboarding']",Sports
2,Grand Theft Auto IV,[Metacritic's 2008 PS3 Game of the Year; Also ...,"['Action Adventure', 'Modern', 'Modern', 'Open...",Action Adventure
3,SoulCalibur,"This is a tale of souls and swords, transcendi...","['Action', 'Fighting', '3D']",Action
4,Grand Theft Auto IV,[Metacritic's 2008 Xbox 360 Game of the Year; ...,"['Action Adventure', 'Modern', 'Modern', 'Open...",Action Adventure


### Removing Rare Genres

In [20]:
genre_count = df["Primary Genre"].value_counts()
threshold = int(len(df) * 0.004)

rare_genres = [item for item, count in genre_count.items() if count < threshold]

print("Total Rare Genres: ", len(rare_genres))

Total Rare Genres:  58


In [21]:
# Rare indices to drop
rare_indices_to_drop = [index for index, row in df.iterrows() if row["Primary Genre"] in rare_genres]

print("Total Rare Indices to Drop: ", len(rare_indices_to_drop))

Total Rare Indices to Drop:  493


In [22]:
# Drop Rare Genres
df = df.drop(rare_indices_to_drop).reset_index(drop=True)

In [23]:
# Shape After Dropping Rare Genres
df.shape

(19401, 4)

In [24]:
# Show Remaining Primary Genres
print("Total Remaining Genres Count: ", len(df["Primary Genre"].value_counts()))

print("Individual Genres Count: ")
df["Primary Genre"].value_counts()

Total Remaining Genres Count:  12
Individual Genres Count: 


Action              6436
Action Adventure    2579
Role-Playing        2081
Sports              1620
Strategy            1608
Miscellaneous       1366
Adventure           1346
Driving              921
Simulation           640
Racing               355
General              249
Puzzle               200
Name: Primary Genre, dtype: int64

In [25]:
# Drop Genres Column
df.drop(["Genres"], axis = 1, inplace = True)

In [26]:
# Drop Duplicate Summary from the DataFrame
df = df.drop_duplicates(subset = "Summary", keep='first')

In [27]:
# Get Labels
labels = list(set(df["Primary Genre"].to_list()))
label_count = len(labels)

print("Total Label Count: ", label_count)
print("Labels: ", labels)

Total Label Count:  12
Labels:  ['Puzzle', 'General', 'Racing', 'Simulation', 'Strategy', 'Action Adventure', 'Role-Playing', 'Sports', 'Driving', 'Action', 'Miscellaneous', 'Adventure']


In [28]:
# Total Null Values in the Columns
df.isna().sum()

Name             0
Summary          1
Primary Genre    0
dtype: int64

In [29]:
# Drop Null Values
df = df.dropna()

In [30]:
# Description of The DataFrame
df.describe(include="object")

Unnamed: 0,Name,Summary,Primary Genre
count,15037,15037,15037
unique,12411,15037,12
top,MX vs. ATV Untamed,"As a young boy, Link is tricked by Ganondorf, ...",Action
freq,6,1,4915


## Data Processing

In [31]:
ds = Dataset.from_pandas(df)

In [32]:
ds

Dataset({
    features: ['Name', 'Summary', 'Primary Genre', '__index_level_0__'],
    num_rows: 15037
})

In [33]:
ds[0]

{'Name': 'The Legend of Zelda: Ocarina of Time',
 'Summary': 'As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to gain access to the Sacred Realm, where he places his tainted hands on Triforce and transforms the beautiful Hyrulean landscape into a barren wasteland. Link is determined to fix the problems he helped to create, so with the help of Rauru he travels through time gathering the powers of the Seven Sages.',
 'Primary Genre': 'Action Adventure',
 '__index_level_0__': 0}

### Tokenization

#### Dummy Tokenization

In [34]:
# Pretrain Tokenizer
tokenizer = AutoTokenizer.from_pretrained((model_name))

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [35]:
tokenizer.tokenize(ds[0]["Summary"][:110])

['As',
 'Ġa',
 'Ġyoung',
 'Ġboy',
 ',',
 'ĠLink',
 'Ġis',
 'Ġtricked',
 'Ġby',
 'ĠGan',
 'ond',
 'orf',
 ',',
 'Ġthe',
 'ĠKing',
 'Ġof',
 'Ġthe',
 'ĠGer',
 'udo',
 'ĠThieves',
 '.',
 'ĠThe',
 'Ġevil',
 'Ġhuman',
 'Ġuses',
 'ĠLink',
 'Ġto',
 'Ġgain']

In [36]:
tokenizer(ds[0]["Summary"][:110])

{'input_ids': [0, 1620, 10, 664, 2143, 6, 4341, 16, 36985, 30, 16929, 2832, 16385, 6, 5, 1745, 9, 5, 7965, 23259, 40036, 4, 20, 9247, 1050, 2939, 4341, 7, 2364, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

#### Full DataFrame Tokenization and Numericalization

In [37]:
def tokenize_summary(dataframe):
  return tokenizer(dataframe["Summary"], truncation=True, padding=True)

In [38]:
tokenized_ds = ds.map(tokenize_summary, batched=True)

Map:   0%|          | 0/15037 [00:00<?, ? examples/s]

In [None]:
tokenized_ds[0]

In [40]:
row = tokenized_ds[0]

print("Summary: ", row["Summary"])
print("Input IDs: ", row["input_ids"])

Summary:  As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to gain access to the Sacred Realm, where he places his tainted hands on Triforce and transforms the beautiful Hyrulean landscape into a barren wasteland. Link is determined to fix the problems he helped to create, so with the help of Rauru he travels through time gathering the powers of the Seven Sages.
Input IDs:  [0, 1620, 10, 664, 2143, 6, 4341, 16, 36985, 30, 16929, 2832, 16385, 6, 5, 1745, 9, 5, 7965, 23259, 40036, 4, 20, 9247, 1050, 2939, 4341, 7, 2364, 899, 7, 5, 25095, 38814, 6, 147, 37, 2127, 39, 25686, 1420, 15, 2393, 1594, 34260, 8, 33622, 5, 2721, 8851, 2070, 21926, 5252, 88, 10, 36175, 43689, 4, 4341, 16, 3030, 7, 4190, 5, 1272, 37, 1147, 7, 1045, 6, 98, 19, 5, 244, 9, 248, 8616, 257, 37, 12524, 149, 86, 5660, 5, 4361, 9, 5, 7732, 208, 3443, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [41]:
# Vocabulary index, Numericalization like we did in ULMFit
tokenizer.vocab["Ġyoung"]

664

#### Categorize

In [42]:
print("All the Labels: ")
labels

All the Labels: 


['Puzzle',
 'General',
 'Racing',
 'Simulation',
 'Strategy',
 'Action Adventure',
 'Role-Playing',
 'Sports',
 'Driving',
 'Action',
 'Miscellaneous',
 'Adventure']

In [43]:
# Index of a label
labels.index("Action")

9

In [44]:
def categorize(dataframe):
  return {"labels": [labels.index(primary_genre) for primary_genre in dataframe["Primary Genre"]]}

In [45]:
categorized_ds = tokenized_ds.map(categorize, batched=True)

Map:   0%|          | 0/15037 [00:00<?, ? examples/s]

In [46]:
categorized_ds

Dataset({
    features: ['Name', 'Summary', 'Primary Genre', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 15037
})

In [47]:
row = categorized_ds[0]
row["labels"]

5

In [None]:
categorized_ds[0]

## Data Splitting

In [49]:
split_ds = categorized_ds.train_test_split(0.1, seed=42)
split_ds

DatasetDict({
    train: Dataset({
        features: ['Name', 'Summary', 'Primary Genre', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13533
    })
    test: Dataset({
        features: ['Name', 'Summary', 'Primary Genre', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1504
    })
})

## Modeling

In [50]:
args = TrainingArguments(
    "models",
    learning_rate = lr,
    warmup_ratio = 0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none'
)

In [51]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)
model

Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [53]:
def accuracy(eval_preds):
  metric = evaluate.load("accuracy")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [54]:
trainer = Trainer(
    model,
    args,
    train_dataset = split_ds["train"],
    eval_dataset = split_ds["test"],
    tokenizer = tokenizer,
    compute_metrics = accuracy
)

In [55]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.108583,0.34109
2,2.151200,2.096765,0.34109
3,2.126300,2.098485,0.34109
4,2.114800,2.092949,0.34109
5,2.115900,2.091029,0.34109
6,2.101500,2.09144,0.34109
7,2.101500,2.090294,0.34109


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=2961, training_loss=2.119608870393237, metrics={'train_runtime': 1752.4009, 'train_samples_per_second': 54.058, 'train_steps_per_second': 1.69, 'total_flos': 1.2551007036911616e+16, 'train_loss': 2.119608870393237, 'epoch': 7.0})

In [56]:
trainer.save_model('models/game-classifier')