# Test the PyTorch training routine

In [1]:
import sys
import torch
from sklearn.model_selection import train_test_split
import datasets

sys.path.append('../modules/')

from custom_logger import get_logger
from data_utils import generate_aggregated_labels_dataset
from model_utils import get_deberta_model
from training import train_model

# Logger used by the cells below to report progress.
logger = get_logger('majority_vote_fine_tuning')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


Load data.

In [2]:
# Read data.

# Which dataset to load and how its labels are post-processed.
DATASET_NAME = 'kumar'
BINARIZE_LABELS = True
SUBSAMPLE_MAJORITY_CLASS = False

# Known dataset locations, keyed by dataset name.
# Alternative location of the Kumar file, kept for reference:
# '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv'
DATASET_PATHS = dict(
    popquorn='../data/samples/POPQUORN_offensiveness.csv',
    kumar='/data/milanlp/moscato/personal_hate_bounds_data/kumar_processed_with_ID_and_full_perspective_clean.csv',
)

# Resolve the file for the selected dataset, then build the aggregated-labels
# dataframe from it.
selected_dataset_path = DATASET_PATHS[DATASET_NAME]

majority_vote_data_df = generate_aggregated_labels_dataset(
    DATASET_NAME,
    selected_dataset_path,
    binarize_labels=BINARIZE_LABELS,
    subsample_majority_class=SUBSAMPLE_MAJORITY_CLASS,
)

2025-01-09 17:42:39,511 - majority_vote_fine_tuning - INFO - Reading kumar dataset from: /data/milanlp/moscato/personal_hate_bounds_data/kumar_processed_with_ID_and_full_perspective_clean.csv
  data_df = pd.read_csv(dataset_path)
  data_df = pd.read_csv(dataset_path)[['comment_id', 'source', 'comment', 'toxic_score']]
2025-01-09 17:42:47,472 - majority_vote_fine_tuning - INFO - Binarizing labels


In [3]:
# Train-test split.
test_frac = 0.25
# Fixed seed: without it the shuffled split differs on every kernel restart,
# so the recorded outputs below could not be reproduced.
SPLIT_SEED = 42
# Number of rows kept from each split. For testing!
N_DEBUG_SAMPLES = 100

logger.info(f'Splitting training and test dataset (test_frac: {test_frac})')

training_data, test_data, training_labels, test_labels = train_test_split(
    majority_vote_data_df[['instance_id', 'text']],
    majority_vote_data_df['label'],
    test_size=test_frac,
    shuffle=True,
    stratify=majority_vote_data_df['label'],
    random_state=SPLIT_SEED
)

# Re-attach the labels (aligned on the original index) on a fresh frame rather
# than writing into the frames returned by the splitter, then renumber rows.
training_data = training_data.assign(label=training_labels).reset_index(drop=True)
test_data = test_data.assign(label=test_labels).reset_index(drop=True)

del training_labels, test_labels

# Check: no instance may appear in both splits.
assert len(set(test_data['instance_id']) & set(training_data['instance_id'])) == 0

# Build Hugging Face datasets from the first N_DEBUG_SAMPLES rows of each
# split, keeping only the 'text' and 'label' columns.
train_ds = datasets.Dataset.from_dict(
    training_data.drop(columns=['instance_id'])
    .iloc[:N_DEBUG_SAMPLES]
    .to_dict(orient='list')
)
test_ds = datasets.Dataset.from_dict(
    test_data.drop(columns=['instance_id'])
    .iloc[:N_DEBUG_SAMPLES]
    .to_dict(orient='list')
)

len(train_ds), len(test_ds)

2025-01-09 17:43:18,202 - majority_vote_fine_tuning - INFO - Splitting training and test dataset (test_frac: 0.25)


(100, 100)

Load model.

In [37]:
# Number of distinct label values present in the aggregated dataset.
num_labels = len(majority_vote_data_df['label'].unique())

logger.info(f'Number of labels found: {num_labels}')

# Keyword options forwarded unchanged to the model factory.
model_options = dict(
    use_custom_head=False,
    pooler_out_features=768,
    pooler_drop_prob=0.,
    classifier_drop_prob=0.1,
)

# The second positional argument is presumably the Hugging Face cache
# directory — confirm against `get_deberta_model`.
tokenizer, classifier = get_deberta_model(
    num_labels,
    '/data/milanlp/huggingface/hub/',
    device,
    **model_options
)

2025-01-09 18:00:08,809 - majority_vote_fine_tuning - INFO - Number of labels found: 2
2025-01-09 18:00:08,810 - majority_vote_fine_tuning - INFO - Instantiating DeBERTa tokenizer
2025-01-09 18:00:09,913 - majority_vote_fine_tuning - INFO - Instantiating DeBERTa model with default classification head
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Test: one gradient-free forward pass on the first four rows, passing the
# labels alongside the tokenized inputs.
n_probe = 4
probe_texts = majority_vote_data_df['text'].iloc[:n_probe].tolist()
probe_labels = majority_vote_data_df['label'].iloc[:n_probe]

with torch.no_grad():
    encoded_probe = tokenizer(
        probe_texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512
    ).to(device=device)

    output = classifier(
        **encoded_probe,
        labels=torch.LongTensor(probe_labels).to(device=device)
    )

# Show the raw model output together with the predicted class per row.
output, torch.argmax(output.logits, dim=-1)

(SequenceClassifierOutput(loss=tensor(0.6971, device='cuda:0'), logits=tensor([[-0.0830,  0.0846],
         [-0.0841,  0.0908],
         [-0.0787,  0.0867],
         [-0.0822,  0.0925]], device='cuda:0'), hidden_states=None, attentions=None),
 tensor([1, 1, 1, 1], device='cuda:0'))

Explore the Hugging Face `Dataset` object: tokenize the training and test datasets.

In [6]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the notebook-level `tokenizer`. Every text is padded or truncated
    to 512 tokens and PyTorch tensors are requested.

    Args:
        examples: Mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    encoding_options = {
        'return_tensors': 'pt',
        'max_length': 512,
        'truncation': True,
        'padding': 'max_length',
    }
    return tokenizer(texts, **encoding_options)

In [7]:
# Tokenize datasets.
logger.info('Tokenizing datasets')

# Columns the model consumes; everything else (e.g. the raw text) is dropped.
MODEL_COLUMNS = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

# `select_columns` returns a new dataset instead of modifying in place, so its
# result must be kept; calling it without assignment has no effect.
tokenized_train_ds = (
    train_ds
    .map(tokenize_function, batched=True)
    .select_columns(MODEL_COLUMNS)
)
tokenized_test_ds = (
    test_ds
    .map(tokenize_function, batched=True)
    .select_columns(MODEL_COLUMNS)
)

# Have the datasets return torch tensors for the model columns.
tokenized_train_ds.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
tokenized_test_ds.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])

logger.info(f'Training dataset size: {len(train_ds)} | Test dataset size: {len(test_ds)}')

2025-01-09 17:43:32,796 - majority_vote_fine_tuning - INFO - Tokenizing datasets
Map: 100%|██████████| 100/100 [00:00<00:00, 2907.68 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3328.10 examples/s]
2025-01-09 17:43:32,894 - majority_vote_fine_tuning - INFO - Training dataset size: 100 | Test dataset size: 100


Test the data loader and a forward pass on a single batch.

In [39]:
# Unshuffled loader over the model columns of the tokenized training set,
# in batches of 16; a trailing incomplete batch is dropped.
loader_columns = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

training_loader = torch.utils.data.DataLoader(
    dataset=tokenized_train_ds.select_columns(loader_columns),
    batch_size=16,
    shuffle=False,
    drop_last=True,
)

In [40]:
# Grab the first batch. Unlike `for ...: break`, this raises StopIteration on
# an empty loader instead of silently leaving `batch` unbound or stale.
batch = next(iter(training_loader))

In [10]:
# Shape of the token-id tensor in the fetched batch.
batch['input_ids'].shape

torch.Size([16, 512])

In [11]:
# Remove the 'label' entry from the fetched batch (modifies `batch` in place).
del batch['label']

In [12]:
# Length of the first token-id sequence in the batch.
len(batch['input_ids'][0])

512

In [13]:
# Simulation of the training loop.

# NOTE(review): import kept local to this cell; consider moving it to the
# imports cell at the top of the notebook.
from pytorch_utils import send_batch_to_device

for i, batch in enumerate(training_loader):
    # Return value is not used — presumably the batch is moved in place; verify
    # against `send_batch_to_device`.
    send_batch_to_device(batch, device)

    # Split the targets off so that only model inputs remain in the batch.
    training_targets = batch.pop('label')
    training_batch = batch

    # Gradient-free forward pass; keep only the logits.
    with torch.no_grad():
        pred = classifier(**training_batch).logits

    # A single batch is enough for this check.
    break

pred

tensor([[-5.0253e-03,  8.5493e-02],
        [ 9.4846e-03,  9.8303e-02],
        [ 2.7217e-05,  8.6337e-02],
        [ 1.2943e-02,  8.1998e-02],
        [ 1.6069e-02,  8.9428e-02],
        [-8.3315e-03,  9.3504e-02],
        [ 5.9937e-03,  9.0166e-02],
        [-2.8528e-03,  9.1676e-02],
        [ 1.0890e-02,  8.7718e-02],
        [-6.4909e-03,  8.9493e-02],
        [ 1.4984e-03,  9.1422e-02],
        [-2.3769e-03,  9.3704e-02],
        [ 3.4288e-03,  9.0062e-02],
        [-2.8969e-03,  8.6417e-02],
        [ 1.3292e-02,  8.3348e-02],
        [ 9.5658e-03,  9.1117e-02]], device='cuda:0')

Train model.

In [41]:
# Optimisation hyper-parameters.
learning_rate = 1e-4
batch_size = 32
n_epochs = 50

# Adam over all classifier parameters, with a cross-entropy objective.
optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)
loss_function = torch.nn.CrossEntropyLoss()

# Put the model in training mode before handing it to the training routine.
classifier.train()

# NOTE(review): in the recorded run the training accuracy stays around
# 0.22-0.27 for every logged epoch — worth checking before trusting the routine.
training_history = train_model(
    classifier,
    tokenized_train_ds,
    None,  # tokenized_test_ds is commented out, so None is passed in its place.
    optimizer,
    n_epochs,
    batch_size,
    device,
    loss_fn=loss_function,
    # Checkpointing, final-model saving and TensorBoard logging all get
    # None/False here.
    checkpointing_period_epochs=None,
    model_dir=None,
    checkpoint_id=None,
    save_final_model=False,
    tensorboard_log_dir=None,
)

2025-01-09 18:00:32,951 - majority_vote_fine_tuning - INFO - Training model


Computing initial metrics


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025-01-09 18:00:33,996 - majority_vote_fine_tuning - INFO - Epoch: 0 | training_loss: 0.733799397945404 | training_accuracy: 0.2708333333333333 | training_precision: 0.46068548387096775 | training_recall: 0.5133547008547009 | training_f1: 0.22747421414088084 | learning_rate: 0.0001
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is",

In [46]:
# Inspect the 'training_accuracy' entry of the history returned by `train_model`.
training_history['training_accuracy']

[np.float64(0.2708333333333333),
 np.float64(0.23958333333333334),
 np.float64(0.23958333333333334),
 np.float64(0.25),
 np.float64(0.21875),
 np.float64(0.23958333333333334),
 np.float64(0.21875),
 np.float64(0.25),
 np.float64(0.25),
 np.float64(0.22916666666666666),
 np.float64(0.25),
 np.float64(0.25),
 np.float64(0.25),
 np.float64(0.22916666666666666),
 np.float64(0.23958333333333334),
 np.float64(0.23958333333333334),
 np.float64(0.22916666666666666),
 np.float64(0.21875),
 np.float64(0.23958333333333334),
 np.float64(0.25),
 np.float64(0.23958333333333334),
 np.float64(0.23958333333333334),
 np.float64(0.25),
 np.float64(0.22916666666666666),
 np.float64(0.25),
 np.float64(0.22916666666666666),
 np.float64(0.25),
 np.float64(0.22916666666666666),
 np.float64(0.23958333333333334),
 np.float64(0.22916666666666666),
 np.float64(0.22916666666666666),
 np.float64(0.25),
 np.float64(0.21875),
 np.float64(0.25),
 np.float64(0.23958333333333334),
 np.float64(0.23958333333333334),
 np.f