# Fine-tune a Natural Language Inference (NLI) model

This notebook contains code to convert a classification dataset into an NLI dataset and fine-tune a model on it.

In [None]:
!pip install transformers datasets evaluate accelerate sentencepiece -U -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m307.2/521.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import random
import numpy as np
import evaluate

# Choose where the data and the saved model live: Google Drive when the
# notebook runs on Colab, otherwise paths relative to the working directory.
# Only a missing `google.colab` package triggers the local fallback; any other
# failure (e.g. a Drive mount error or an interrupt during the auth prompt)
# is allowed to surface instead of being silently swallowed.
try:
    from google.colab import drive
except ImportError:
    data_dir = 'data'
    eval_model_path = 'fine-tuned-nli-model'
else:
    drive.mount('/content/gdrive')

    project_dir = '/content/gdrive/MyDrive/advanced-ml-project'
    data_dir = f'{project_dir}/data'
    eval_model_path = f'{project_dir}/fine-tuned-nli-model'

train_path = f'{data_dir}/train.tsv'
test_path = f'{data_dir}/test.tsv'
dev_path = f'{data_dir}/dev.tsv'

Mounted at /content/gdrive


## Load data

In [None]:
def _load_split(path, seed=42):
    """Read one tab-separated split file and return its rows in shuffled order.

    Args:
        path: location of the TSV file; its first line is used as the header.
        seed: random state for the shuffle, fixed so that re-running the
            notebook yields the same row order.

    Returns:
        A DataFrame holding every row of the file, shuffled, with a fresh
        0..n-1 index.
    """
    frame = pd.read_csv(path, sep='\t', header=0)
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)


train = _load_split(train_path)
test = _load_split(test_path)
dev = _load_split(dev_path)

# Class balance of the training split, followed by a preview of its first rows.
print(train.label.value_counts())
train.head(10)

moderate          6019
not depression    1971
severe             901
Name: label, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_5589,Too numb to create the emotional response that...,moderate
1,train_pid_4537,"i cant do this anymore, ending it all : im a g...",moderate
2,train_pid_999,Start of the decade: Alive. End of the decade:...,moderate
3,train_pid_2923,I’m so depressed that I disconnected with my f...,moderate
4,train_pid_1691,I dont think I have much time left. : Im hopin...,moderate
5,train_pid_4081,I’m feeling so down today. : I don’t know what...,moderate
6,train_pid_7847,NM claimed she's so tired of all us she wants ...,not depression
7,train_pid_7551,I wish that I was never born : [removed],not depression
8,train_pid_4821,"theres 335million ppl in USA : and i'm alone, ...",moderate
9,train_pid_7230,anyone wanna talk? : hey y’all. is anyone not ...,not depression


In [None]:
# Wrap each pandas split in a Dataset, shuffle it with a fixed seed, and
# collect the results under their split names.
split_frames = {'train': train, 'test': test, 'dev': dev}
data = DatasetDict({
    split_name: Dataset.from_pandas(frame).shuffle(seed=42)
    for split_name, frame in split_frames.items()
})
data

DatasetDict({
    train: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 8891
    })
    test: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 3245
    })
    dev: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 4496
    })
})

## Process data

In [None]:
# Class names in index order; both lookup tables are derived from this list.
id2labels = ['moderate', 'not depression', 'severe']
idx2label = dict(enumerate(id2labels))
label2idx = {name: index for index, name in idx2label.items()}

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def process_func(sample):
    """Turn a batch of labelled texts into NLI premise/hypothesis pairs.

    Every text in the batch yields two rows:
      * the text paired with the hypothesis built from its true label
        (target 2), and
      * the text paired with a hypothesis built from a randomly chosen other
        label (target 0).
    Target 1 is never produced here. The 2/0 values are presumably the
    entailment / contradiction ids of a 3-way NLI head.

    Works for any batch size: each text is paired with the hypotheses derived
    from its own label, not from the first label in the batch.

    Args:
        sample: batch dict as passed by ``map(..., batched=True)``; the "text"
            and "label" columns are read as equal-length lists.

    Returns:
        The tokenizer output extended with a "labels" column and an
        "input_sentence" column holding the decoded ``input_ids``.
    """
    template = "The level of depression in this tweet is {}."

    premises, hypotheses, targets = [], [], []
    for text, label in zip(sample["text"], sample["label"]):
        contradiction_label = random.choice([x for x in id2labels if x != label])
        premises += [text, text]
        hypotheses += [template.format(label), template.format(contradiction_label)]
        targets += [2, 0]

    encoded_sequence = tokenizer(
        premises,
        hypotheses,
        # Truncate only the premise so the hypothesis is always kept intact.
        truncation='only_first',
        padding='max_length',
        max_length=512
    )
    encoded_sequence["labels"] = targets
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

# Seed the contradiction-label sampling so the generated NLI pairs are the same
# on every run.
random.seed(42)
processed_data = data.map(process_func, batched=True, batch_size=1, remove_columns=["label", "text", "PID"])
processed_data

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/8891 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/4496 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 17782
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 6490
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 8992
    })
})

In [None]:
# Collator handed to both Trainer instances below. It is bound to the
# roberta-base tokenizer loaded above and is not rebuilt when `tokenizer`
# is reassigned in the Evaluate section.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Fine-tuning

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: prediction object exposing ``predictions`` (per-class scores, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    # Some models return a tuple of outputs; the scores come first.
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_metric.compute(predictions=predicted, references=p.label_ids)
    macro_f1 = f1_metric.compute(predictions=predicted, references=p.label_ids, average='macro')
    return {"accuracy": accuracy["accuracy"], "f1": macro_f1["f1"]}

In [None]:
# Start from the pretrained roberta-base weights with a fresh classification
# head sized to the number of entries in `id2labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = len(id2labels),
    ignore_mismatched_sizes=True
)

# Evaluate and checkpoint once per epoch; the best checkpoint is restored
# when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=2e-5,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=14,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    warmup_steps = 500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data["train"],
    # Validate on the dev split: the per-epoch evaluation drives
    # load_best_model_at_end, so using the test split here would leak it into
    # checkpoint selection. The test split is kept for the final
    # trainer.predict call.
    eval_dataset=processed_data["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4237,0.594344,0.74114,0.740954
2,0.241,1.020748,0.668875,0.666593
3,0.1702,1.250742,0.690909,0.690078


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=3813, training_loss=0.3083254556283218, metrics={'train_runtime': 5449.1544, 'train_samples_per_second': 9.79, 'train_steps_per_second': 0.7, 'total_flos': 1.4036048382117888e+16, 'train_loss': 0.3083254556283218, 'epoch': 3.0})

In [None]:
# Save to the location chosen in the setup cell: the Drive project folder on
# Colab (so the model outlives the runtime), 'fine-tuned-nli-model' locally.
trainer.save_model(eval_model_path)

In [None]:
# Run the trainer's current model over the tokenized test split; the displayed
# result carries the raw predictions, the label ids and the test_* metrics.
trainer.predict(processed_data['test'])

PredictionOutput(predictions=array([[ 1.4586536, -4.7660913,  3.138733 ],
       [ 2.541217 , -4.5007076,  1.428854 ],
       [ 1.2060326, -4.5628567,  3.3446462],
       ...,
       [ 2.7736938, -5.144325 ,  1.7572695],
       [ 1.2836802, -4.6957726,  3.3107257],
       [ 2.5593607, -4.639618 ,  1.6659696]], dtype=float32), label_ids=array([2, 0, 2, ..., 0, 2, 0]), metrics={'test_loss': 0.5943436026573181, 'test_accuracy': 0.7411402157164869, 'test_f1': 0.7409540130174932, 'test_runtime': 199.1114, 'test_samples_per_second': 32.595, 'test_steps_per_second': 2.33})

## Evaluate

In [None]:
def compute_metrics(p):
    """Compute accuracy plus macro- and weighted-averaged F1.

    Replaces the earlier ``compute_metrics`` from the fine-tuning section and
    reports F1 under two averaging modes instead of one.

    Args:
        p: prediction object exposing ``predictions`` (per-class scores, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        dict with the keys "accuracy", "macro f1" and "weighted f1".
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    # Some models return a tuple of outputs; the scores come first.
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_metric.compute(predictions=predicted, references=p.label_ids)
    macro = f1_metric.compute(predictions=predicted, references=p.label_ids, average='macro')
    weighted = f1_metric.compute(predictions=predicted, references=p.label_ids, average='weighted')
    return {
        "accuracy": accuracy["accuracy"],
        "macro f1": macro["f1"],
        "weighted f1": weighted["f1"],
    }

In [None]:
# Reload tokenizer and classifier from the 'kwang123/roberta-base-nli'
# checkpoint and score it on the tokenized test split with the evaluation
# metrics defined above.
nli_checkpoint = 'kwang123/roberta-base-nli'

tokenizer = AutoTokenizer.from_pretrained(nli_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    nli_checkpoint,
    num_labels=len(id2labels),
    ignore_mismatched_sizes=True,
)

# No TrainingArguments are passed, so the Trainer falls back to its defaults.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=processed_data["test"],
)

trainer.predict(processed_data['test'])

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

PredictionOutput(predictions=array([[ 1.4586538, -4.766092 ,  3.1387322],
       [ 2.541217 , -4.500708 ,  1.4288546],
       [ 1.2060324, -4.562857 ,  3.3446467],
       ...,
       [ 2.7736938, -5.144325 ,  1.7572695],
       [ 1.2836803, -4.695774 ,  3.3107255],
       [ 2.559361 , -4.6396184,  1.6659721]], dtype=float32), label_ids=array([2, 0, 2, ..., 0, 2, 0]), metrics={'test_loss': 0.5943436622619629, 'test_accuracy': 0.7411402157164869, 'test_macro f1': 0.7409540130174932, 'test_weighted f1': 0.7409540130174932, 'test_runtime': 190.8983, 'test_samples_per_second': 33.997, 'test_steps_per_second': 4.254})

## Inference

In [None]:
# Peek at the first raw (untokenized) test example: its PID, text and label.
data['test'][0]

{'PID': 'test_pid_908',
 'text': "I am in a dark place right now : As I am writing this, tears are coming out my eyes because I am so fed up with life.\nWhen do things get better?\nSo much drama and it is exhausting. \nI try to convince myself that I can get through this but deep down I'm sinking further and further into the cold abyss.\nThoughts about suicide are creeping in my mind (although I would never do it) and I am fantasizing about it. \nHow easy it would be to just end the suffering but then I remember my loved ones and I could never do that to them.\nEverything is exhausting. So exhausting.",
 'label': 'moderate'}

In [None]:
# Import the Transformers pipeline library
from transformers import pipeline

# Initializing Zero-Shot Classifier
classifier = pipeline(
    "zero-shot-classification",
    model="kwang123/roberta-base-nli",
    batch_size=64,
)
# Running the classifier
predictions = classifier(
    data['test'][0]['text'],
    candidate_labels=['moderate', 'not depression', 'severe'],
    # Must match the hypothesis template used in process_func character for
    # character. The previous value wrapped the sentence in literal double
    # quotes, so inference hypotheses differed from the ones seen in training.
    hypothesis_template='The level of depression in this tweet is {}.',
    # NOTE(review): confirm that the zero-shot pipeline actually forwards
    # `tokenizer_kwargs`; it may apply its own padding/truncation settings.
    tokenizer_kwargs={'padding': 'max_length', 'truncation': 'only_first', 'max_length': 512}
)
predictions

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


{'sequence': "I am in a dark place right now : As I am writing this, tears are coming out my eyes because I am so fed up with life.\nWhen do things get better?\nSo much drama and it is exhausting. \nI try to convince myself that I can get through this but deep down I'm sinking further and further into the cold abyss.\nThoughts about suicide are creeping in my mind (although I would never do it) and I am fantasizing about it. \nHow easy it would be to just end the suffering but then I remember my loved ones and I could never do that to them.\nEverything is exhausting. So exhausting.",
 'labels': ['moderate', 'severe', 'not depression'],
 'scores': [0.7105392813682556, 0.14855943620204926, 0.14090123772621155]}