# Fine-tuning the model (20 points)

In [None]:
import torch

# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Column names assigned to the CSV files, which are read with header=None.
columns = [
    'topic',
    'question_title',
    'question_content',
    'best_answer',
]

In [None]:
import pandas as pd


def _read_split(csv_path):
    """Read one header-less CSV split and drop rows whose ``best_answer`` is missing."""
    frame = pd.read_csv(csv_path, header=None, names=columns)
    return frame.dropna(subset=['best_answer'])


df_train = _read_split('../input/yahoo-answers-dataset/yahoo_answers_csv/train.csv')
df_test = _read_split('../input/yahoo-answers-dataset/yahoo_answers_csv/test.csv')

In [None]:
# Draw a class-balanced subsample: the same number of rows from every topic.
# random_state pins the draw; without it the subset changes on every run,
# because the notebook only seeds NumPy later, right before training.
df_train = df_train.groupby('topic').sample(n=1000, random_state=42)
df_test = df_test.groupby('topic').sample(n=300, random_state=42)

In [None]:
# Shift every topic id down by one.
# NOTE(review): presumably the raw ids are 1-based and the classifier expects
# 0-based class indices — confirm against the dataset.
# Re-running this cell shifts the ids again.
df_train['topic'] = df_train['topic'] - 1
df_test['topic'] = df_test['topic'] - 1

In [None]:
# Shuffle the rows so the topics are interleaved rather than grouped.
# random_state makes the order repeatable; NumPy's global seed is only set
# later in the notebook, so an unseeded shuffle differs between runs.
df_train = df_train.sample(frac=1, random_state=42)
df_test = df_test.sample(frac=1, random_state=42)

* Load the tokenizer and the model.
* Do not forget to set the `num_labels` parameter when initializing the model.

In [None]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification,AdamW

# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# num_labels=10 sizes the classification head.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=10)
# Move the model to the device selected earlier instead of calling
# model.cuda(), which raises on machines without a GPU.
model.to(device)

In [None]:
# The answer text is the model input; the topic id is the target.
train_texts, train_labels = df_train['best_answer'], df_train['topic']
test_texts, test_labels = df_test['best_answer'], df_test['topic']

* convert best_answer to the input tokens (supporting function for dataset is provided below)

In [None]:
def _encode_texts(texts, max_length=128):
    """Tokenize ``texts`` into fixed-length token-id and attention-mask lists.

    Args:
        texts: iterable of strings to encode.
        max_length: every sequence is truncated or padded to this many tokens.

    Returns:
        The tokenizer's batch encoding; its ``'input_ids'`` and
        ``'attention_mask'`` entries are read below.
    """
    return tokenizer.batch_encode_plus(
        list(texts),  # hand the tokenizer a plain list rather than a pandas Series
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )


train_tokenized = _encode_texts(train_texts)
train_input_ids = train_tokenized['input_ids']
train_attention_masks = train_tokenized['attention_mask']

test_tokenized = _encode_texts(test_texts)
test_input_ids = test_tokenized['input_ids']
test_attention_masks = test_tokenized['attention_mask']

In [None]:
# Build the label tensors straight from the dataframe column instead of from
# train_labels / test_labels themselves: rebinding those names from Series to
# tensor made this cell fail on a second run (a tensor has no .to_numpy()).
train_labels = torch.tensor(df_train['topic'].to_numpy(), dtype=torch.long)
test_labels = torch.tensor(df_test['topic'].to_numpy(), dtype=torch.long)

In [None]:
# Turn the tokenizer's attention-mask lists into tensors.
train_masks, test_masks = (
    torch.tensor(mask_rows)
    for mask_rows in (train_attention_masks, test_attention_masks)
)

In [None]:
# Turn the tokenizer's token-id lists into tensors.
train_inputs, test_inputs = map(torch.tensor, (train_input_ids, test_input_ids))

In [None]:
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# No shuffle argument is passed, so batches follow the dataset order.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

* define optimizer, scheduler (optional)

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of this one. weight_decay=0.0 is passed explicitly because
# torch.optim.AdamW otherwise defaults to 0.01, whereas the transformers
# implementation defaulted to no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
epochs = 5

# The training loop steps the scheduler once per batch, so the schedule has to
# span every batch of every epoch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

* fine-tune the model (write the training loop), plot the loss changes and measure results in terms of weighted F1 score

In [None]:
def evluating():
    """Evaluate ``model`` on ``test_dataloader`` and print the weighted F1 score.

    The score is computed once over the predictions for the whole test set.
    Averaging per-batch weighted F1 values would not give the same metric:
    each batch's class weights come from that batch alone, and a short final
    batch would count as much as a full one.

    Returns:
        The weighted F1 score, or ``None`` when the dataloader yields no
        batches.
    """
    model.eval()
    predictions, targets = [], []

    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        # No labels are passed, so the first output element holds the logits.
        logits = outputs[0]
        predictions.append(logits.argmax(dim=1).cpu().numpy())
        targets.append(b_labels.cpu().numpy())

    if not targets:
        # Nothing was evaluated; avoid scoring empty arrays.
        print('F1-score: undefined (no evaluation batches)')
        return None

    score = f1_score(np.concatenate(targets),
                     np.concatenate(predictions),
                     average='weighted')
    print(f'F1-score: {score}')
    return score

In [None]:
def training():
    """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

    After each epoch the mean batch loss is printed and recorded, and
    ``evluating()`` is run.

    Returns:
        A list with the mean training loss of each epoch, in order.
    """
    epoch_losses = []
    n_batches = len(train_dataloader)

    for epoch in range(epochs):
        print(f'Epoch {epoch}')
        model.train()
        running_loss = 0

        for batch in train_dataloader:
            input_ids, input_mask, labels = (t.to(device) for t in batch)

            # Clear gradients left over from the previous step.
            model.zero_grad()
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask,
                            labels=labels)
            # With labels supplied, the first output element is the loss.
            loss = outputs[0]
            running_loss += loss.item()

            loss.backward()
            # Cap the gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        mean_loss = running_loss / n_batches
        epoch_losses.append(mean_loss)
        print(f'Average training loss: {mean_loss}')
        evluating()

    return epoch_losses

In [None]:
import random

seed = 42

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
# Seeding here only influences code that runs from this point on.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed)

losses = training()

In [None]:
import plotly.express as px

# One point per epoch: the mean training loss returned by training().
# The labels mapping replaces plotly's default "x" / "y" axis titles so the
# figure can be read on its own.
fig = px.line(
    x=list(range(epochs)),
    y=losses,
    title='Loss',
    labels={'x': 'Epoch', 'y': 'Average training loss'},
)
fig.show()

|Train set size | Test set size | Learning rate | Epsilon | Batch size | # Epochs | Max token length | # warmup steps | F1-score |
|---------------|---------------|---------------|---------|------------|----------|------------------|----------------|----------|
|10000|3000|2e-5|1e-8|32|5|128|0|0.617|
|10000|3000|1e-5|1e-8|32|5|128|0|0.603|
|10000|3000|2e-5|1e-8|32|4|128|100|0.624|
|10000|3000|2e-3|1e-8|32|5|128|300|0.023|
|10000|3000|2e-5|1e-8|64|5|128|100|0.607|
|10000|3000|2e-5|1e-8|32|10|128|100|0.603|
|10000|3000|2e-5|1e-8|32|5|256|100|0.586|
|20000|6000|2e-5|1e-8|32|5|256|100|0.618|