# Transformers approach for text classification tasks

In [1]:
# General purposes required imports
import numpy as np
import pandas as pd
import os
import re
import json
import torch
from pathlib import Path

# Pre-processing tools
from sklearn.preprocessing import LabelEncoder

# Transformers (Hugging Face) imports
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load the data

In [2]:
def remove_emojis(text: str) -> str:
    """ Removes emojis from the given text """
    return re.sub(r"[^\w\s,!?@#áéíóúÁÉÍÓÚñÑ]", "", text)

def read_target(target_path: str) -> tuple:
    """ Reads the target variable data and returns it as a Python dictionary (hashmap) """
    label_encoder = LabelEncoder()
    target        = pd.read_csv(filepath_or_buffer=target_path, delimiter=',').to_numpy()
    target[:, 1]  = label_encoder.fit_transform(target[:, 1])
    user_labels = { re.findall(r"[0-9]+", target[i][0])[0]: target[i][1] for i in range(len(target)) }
    return user_labels, { i: label for i, label in enumerate(label_encoder.classes_) }

def prepare_data(data_path: str, 
                 target_path: str,
                 treat_emojis: bool=False,
                 verbose: bool=False) -> tuple:
    """ Builds a Dataset object of messages where each entry represents a user """
    path                = Path(data_path)
    all_users_data      = []
    user_labels, labels = read_target(target_path=target_path)

    # Iterate through the JSON data folder
    for element in path.iterdir():
        if element.is_file():
            user_id = re.findall(pattern=r"[0-9]+", string=element.name)[0]

            # Some execution verbose
            if verbose:
                print(f"Preparing messages from user: {user_id}")

            # Open and read the current JSON file
            with open(data_path + f'/{element.name}') as json_file:
                json_data = json.load(fp=json_file)

                if treat_emojis:
                    # Emojis must be deleted from the messages
                    messages = [ remove_emojis(str(record['message'])) for record in json_data ]

                else:
                    # Preserve emojis in the messages
                    messages = [ str(record['message']) for record in json_data ]
            
                # Create the current document (all user's messages in one string)
                user_document = messages[0]
                for i in range(1, len(messages)):
                    user_document += "" + messages[i]

                # Merge the previous data with the current user's
                all_users_data.append({ "text": user_document, "label": user_labels[user_id] })

    # Build the Dataset object and return it
    return Dataset.from_list(all_users_data), labels
    

In [3]:
# Input data paths
base_path       = os.getcwd()
task1_data_path = base_path + '/data/task1/complete/subjects/'
task2_data_path = base_path + '/data/task2/complete/subjects/'

# Target labels path
task1_target_path = base_path + '/data/task1/complete/gold_task1.txt'
task2_target_path = base_path + '/data/task2/complete/gold_task2.txt'

# Build the datasets
task1_dataset, task1_labels = prepare_data(data_path=task1_data_path,
                       target_path=task1_target_path,
                       treat_emojis=True,
                       verbose=False)

task2_dataset, task2_labels = prepare_data(data_path=task2_data_path,
                       target_path=task2_target_path,
                       treat_emojis=True,
                       verbose=False)

# Print the dataset objects
print(task1_dataset)
print(task2_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 357
})
Dataset({
    features: ['text', 'label'],
    num_rows: 357
})


## Model fine tuning - Task 1

### Small-BERT

In [4]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """ Evaluate and compute the accuracy for the given predictions """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Tokenización
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

#model_name = "prajjwal1/bert-tiny"
model_name  = "prajjwal1/bert-small"
tokenizer   = AutoTokenizer.from_pretrained(model_name)
model       = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset built before
tokenized_dataset = task1_dataset.map(tokenize_function, batched=True)

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset     = tokenized_dataset["train"]
eval_dataset      = tokenized_dataset["test"]

# Training
training_args = TrainingArguments(
    output_dir="./results_train_task1",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=25,
    learning_rate=7e-7,
    weight_decay=0.001,
    logging_steps=10,
    logging_dir='./logs',
    load_best_model_at_end=True,
    save_strategy="epoch",
    seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 357/357 [00:00<00:00, 2999.87 examples/s]
[codecarbon ERROR @ 20:50:03] Error: Another instance of codecarbon is probably running as we find `C:\Users\SIMBAT\AppData\Local\Temp\.codecarbon.lock`. Turn off the other instance to be able to run this one or use `allow_multiple_runs` or delete the file. Exiting.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7525,0.692363,0.541667
2,0.6911,0.697463,0.597222
3,0.7072,0.699851,0.527778
4,0.6906,0.704446,0.444444
5,0.7396,0.705409,0.444444
6,0.6929,0.703331,0.5
7,0.711,0.705341,0.472222
8,0.6761,0.702239,0.5
9,0.6559,0.699499,0.541667
10,0.6413,0.696654,0.555556




TrainOutput(global_step=1800, training_loss=0.6792877446280585, metrics={'train_runtime': 40.5763, 'train_samples_per_second': 175.595, 'train_steps_per_second': 44.361, 'total_flos': 140895704448000.0, 'train_loss': 0.6792877446280585, 'epoch': 25.0})

### Train with all the data

In [5]:
# Training with all the data
full_dataset = task1_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results_final_task1",
    per_device_train_batch_size=4,
    num_train_epochs=25,
    learning_rate=7e-7,
    weight_decay=0.001,
    logging_steps=10,
    logging_dir='./logs',
    save_strategy="epoch",
    seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Map: 100%|██████████| 357/357 [00:00<00:00, 3336.45 examples/s]
[codecarbon ERROR @ 20:50:43] Error: Another instance of codecarbon is probably running as we find `C:\Users\SIMBAT\AppData\Local\Temp\.codecarbon.lock`. Turn off the other instance to be able to run this one or use `allow_multiple_runs` or delete the file. Exiting.


Step,Training Loss
10,0.6679
20,0.7013
30,0.709
40,0.6943
50,0.7155
60,0.7125
70,0.6961
80,0.7076
90,0.7128
100,0.6611




TrainOutput(global_step=2250, training_loss=0.6707141017913818, metrics={'train_runtime': 51.0449, 'train_samples_per_second': 174.846, 'train_steps_per_second': 44.079, 'total_flos': 176490408729600.0, 'train_loss': 0.6707141017913818, 'epoch': 25.0})

### Save the fine-tuned model

In [6]:
# Path to store the model
output_dir = "./task1_small_bert"

# Save the model and the tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Some verbose
print("Model and tokenizer stored in disk!", output_dir)

Model and tokenizer stored in disk! ./task1_small_bert


## Model fine tuning - Task 2

### Small-BERT

In [7]:
#model_name = "prajjwal1/bert-tiny"
model_name = "prajjwal1/bert-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Tokenize the dataset built before
tokenized_dataset = task2_dataset.map(tokenize_function, batched=True)

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset     = tokenized_dataset["train"]
eval_dataset      = tokenized_dataset["test"]

# Training
training_args = TrainingArguments(
    output_dir="./results_train_task2",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    learning_rate=3.5e-5,
    weight_decay=0.001,
    logging_steps=10,
    logging_dir='./logs',
    load_best_model_at_end=True,
    save_strategy="epoch",
    seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 357/357 [00:00<00:00, 3367.90 examples/s]
[codecarbon ERROR @ 20:51:36] Error: Another instance of codecarbon is probably running as we find `C:\Users\SIMBAT\AppData\Local\Temp\.codecarbon.lock`. Turn off the other instance to be able to run this one or use `allow_multiple_runs` or delete the file. Exiting.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8312,0.601333,0.888889
2,0.2675,0.327868,0.861111
3,0.1388,0.277281,0.902778
4,0.0791,0.57541,0.861111
5,0.0265,0.300103,0.930556
6,0.0041,0.245891,0.944444
7,0.0033,0.222251,0.944444
8,0.003,0.155158,0.958333
9,0.0024,0.287564,0.930556
10,0.0021,0.183443,0.958333




TrainOutput(global_step=1080, training_loss=0.1199103068517245, metrics={'train_runtime': 26.5335, 'train_samples_per_second': 161.117, 'train_steps_per_second': 40.703, 'total_flos': 84544159795200.0, 'train_loss': 0.1199103068517245, 'epoch': 15.0})

### Train with all the data

In [8]:
# Training with all the data
full_dataset = task2_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results_final_task2",
    per_device_train_batch_size=4,
    num_train_epochs=15,
    learning_rate=3.5e-5,
    weight_decay=0.001,
    logging_steps=10,
    logging_dir='./logs',
    save_strategy="epoch",
    seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Map: 100%|██████████| 357/357 [00:00<00:00, 3534.69 examples/s]
[codecarbon ERROR @ 20:52:03] Error: Another instance of codecarbon is probably running as we find `C:\Users\SIMBAT\AppData\Local\Temp\.codecarbon.lock`. Turn off the other instance to be able to run this one or use `allow_multiple_runs` or delete the file. Exiting.


Step,Training Loss
10,0.149
20,0.2027
30,0.0011
40,0.0694
50,0.0113
60,0.1433
70,0.001
80,0.0195
90,0.01
100,0.0006




TrainOutput(global_step=1350, training_loss=0.019448940579313786, metrics={'train_runtime': 27.5239, 'train_samples_per_second': 194.558, 'train_steps_per_second': 49.048, 'total_flos': 105902684375040.0, 'train_loss': 0.019448940579313786, 'epoch': 15.0})

### Save the fine-tuned model

In [9]:
# Path to store the model
output_dir = "./task2_small_bert"

# Save the model and the tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Some verbose
print("Model and tokenizer stored in disk!", output_dir)

Model and tokenizer stored in disk! ./task2_small_bert


## Make individual predictions

In [10]:
def predict_text(model, tokenizer, text: str, labels: dict):
    model.eval()
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_label = torch.argmax(logits, dim=1).item()
    print(labels)
    return labels[pred_label]

In [11]:
task = 2
model_load = AutoModelForSequenceClassification.from_pretrained(f"./task{task}_small_bert")
tokenizer = AutoTokenizer.from_pretrained(f"./task{task}_small_bert")

text = 'voy a intercambiar con trading los valores'

pred = predict_text(model_load, tokenizer, text, task2_labels)
print(pred)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{0: 'betting', 1: 'lootboxes', 2: 'onlinegaming', 3: 'trading'}
lootboxes
