# Alternative dataset test

__Objective:__ test the training pipeline with another dataset.

In [None]:
import os
import sys
import pandas as pd
import torch
import datasets
import transformers

sys.path.append('../../modules/')

from model_utils import get_deberta_model
from training_metrics import compute_metrics_sklearn

# Prefer the second GPU (index 1), as before. Fall back to the first GPU when
# fewer than two are visible, and to the CPU when CUDA is unavailable, so that
# single-GPU machines do not fail later with an invalid device ordinal.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

%load_ext autoreload
%autoreload 2

Load data.

In [None]:
# Folder containing the `hateval2019_<language>_<partition>.csv` files.
DATA_DIR = '/data1/moscato/personalised-hate-boundaries-data/data/hateval2019/'


def _read_partition(language, partition):
    """Read one partition CSV, keeping `text` and the `HS` column renamed to `label`."""
    csv_path = os.path.join(DATA_DIR, f'hateval2019_{language}_{partition}.csv')
    raw_df = pd.read_csv(csv_path)
    return raw_df[['text', 'HS']].rename(columns={'HS': 'label'})


language = 'en'  # 'en' or 'es'.

# Training split.
partition = 'train'  # 'dev', 'test' or 'train'.
training_data_df = _read_partition(language, partition)

# Test split (same language).
partition = 'test'  # 'dev', 'test' or 'train'.
test_data_df = _read_partition(language, partition)

# Display the training dataframe as the cell output.
training_data_df

In [None]:
# Turn each dataframe into a {column name: list of values} mapping, then wrap
# that mapping in a `datasets.Dataset`.
train_columns = training_data_df.to_dict(orient='list')
test_columns = test_data_df.to_dict(orient='list')

train_ds = datasets.Dataset.from_dict(train_columns)
test_ds = datasets.Dataset.from_dict(test_columns)

# Display the number of rows in each split as the cell output.
len(train_ds), len(test_ds)

Load model.

In [None]:
# One output class per distinct value in the training `label` column.
num_labels = len(training_data_df['label'].unique())

# Keyword options forwarded unchanged to the project helper `get_deberta_model`.
model_options = dict(
    use_custom_head=False,
    pooler_out_features=768,
    pooler_drop_prob=0.,
    classifier_drop_prob=0.1,
    use_fast_tokenizer=True,
)

tokenizer, classifier = get_deberta_model(
    num_labels,
    '/data1/shared_models/',
    device,
    **model_options
)

# Display the tokenizer as the cell output.
tokenizer

Tokenize text.

In [None]:
def tokenize_function(examples):
    """Tokenize the `text` field of `examples` with the notebook-level `tokenizer`.

    The tokenizer is asked to pad every sequence to `max_length` and to
    truncate longer ones, with `max_length` set to 512. Its output is returned
    as is (no `return_tensors` conversion is requested).
    """
    texts = examples["text"]
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(texts, **encoding_options)

In [None]:
# Preview only: tokenize the training split in batches and show the result
# without the raw `text` column. The displayed dataset is not stored.
preview_ds = train_ds.map(tokenize_function, batched=True)
preview_ds.remove_columns('text')

In [None]:
# Tokenize both splits in batches, then drop the raw `text` column from each.
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_train_ds = tokenized_train_ds.remove_columns('text')

tokenized_test_ds = test_ds.map(tokenize_function, batched=True)
tokenized_test_ds = tokenized_test_ds.remove_columns('text')

In [None]:
# Count all parameter elements of the classifier and, separately, those that
# require gradients, in a single pass over its parameters.
n_params_total = 0
n_params_trainable = 0
for param in classifier.parameters():
    n_elements = param.numel()
    n_params_total += n_elements
    if param.requires_grad:
        n_params_trainable += n_elements

print(f'N params: {n_params_total} | N trainable params: {n_params_trainable}')

# Switch the model to training mode; the trailing semicolon suppresses the
# cell output in the notebook.
classifier.train();

In [None]:
# Experiment identity, checkpoint location and training length.
EXPERIMENT_ID = 'hateval_data_model_test'
MODEL_OUTPUT_DIR = f'/data1/moscato/personalised-hate-boundaries-data/models/{EXPERIMENT_ID}/'
N_EPOCHS = 10

training_args = transformers.TrainingArguments(
    # Checkpointing and model selection.
    output_dir=MODEL_OUTPUT_DIR,
    save_strategy="epoch",  # Options: 'no', 'epoch', 'steps' (requires the `save_steps` argument to be set though).
    save_total_limit=2,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=1e-4,
    warmup_ratio=0.0,  # For linear warmup of learning rate.
    per_device_train_batch_size=32,  # Default: 8.
    per_device_eval_batch_size=32,  # Default: 8.
    gradient_accumulation_steps=1,  # Default: 1.
    # Logging and progress reporting.
    logging_strategy='no',
    logging_first_step=True,
    logging_dir=f'../tensorboard_logs/{EXPERIMENT_ID}/',
    disable_tqdm=False
)

# NOTE(review): the evaluation dataset is the tokenized test split, and
# `load_best_model_at_end=True` selects the checkpoint on it, so model
# selection uses test data. Confirm this is intended for this pipeline test.
trainer = transformers.Trainer(
    args=training_args,
    model=classifier,
    compute_metrics=compute_metrics_sklearn,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
)

In [None]:
# Run training with the trainer configured above and keep the value returned
# by `trainer.train()` for later inspection.
training_output = trainer.train()