In [None]:
# Check which GPU this runtime exposes (model, driver/CUDA version, memory in use).
# `!` hands the command to the system shell.
!nvidia-smi

Wed Mar  2 02:02:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P4000        Off  | 00000000:00:05.0 Off |                  N/A |
| 47%   42C    P8     6W / 105W |      0MiB /  8119MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers
!pip install pandas

!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import torch

# Choose the compute device once: the first CUDA GPU when one is visible, else the CPU.
if torch.cuda.is_available():
    print("CUDA is available")
    DEVICE = 'cuda:0'
else:
    print('WARNING: You may want to change the runtime to GPU for faster training!')
    DEVICE = 'cpu'

## Load training data

In [None]:
import pandas as pd
from dont_patronize_me import DontPatronizeMe
# Build the dataset helper (both constructor arguments point at ./data) and load the
# Task 1 data; later cells read the result from `dpm.train_task1_df`.
dpm = DontPatronizeMe('./data', './data')
dpm.load_task1()

In [None]:
# Preview the loaded Task 1 dataframe (columns include par_id, text, label, orig_label).
dpm.train_task1_df

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


## Split into train and dev sets

In [None]:
# Read the practice train/dev split files; each one lists a `par_id` and its label vector.
practice_splits_dir = './data/practice_splits/'
train_ids, dev_ids = (
    pd.read_csv(practice_splits_dir + split_file)
    for split_file in ('train_semeval_parids-labels.csv', 'dev_semeval_parids-labels.csv')
)

# Cast the ids to strings -- presumably so they compare equal to the `par_id` values of
# the main dataframe; confirm against how DontPatronizeMe parses that column.
train_ids['par_id'] = train_ids['par_id'].astype(str)
dev_ids['par_id'] = dev_ids['par_id'].astype(str)
train_ids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [None]:
def extract_split_data(ids_df, original_df):
    """Collect the text and binary label for every paragraph id of a split.

    Args:
        ids_df: dataframe with a 'par_id' column (its 'label' column, if any, is
            ignored). Rows are read in positional order, so any index is accepted.
        original_df: full dataframe with at least the columns 'par_id', 'text' and
            'label'. If a par_id occurs more than once, its first row is used.

    Returns:
        A new dataframe with columns 'par_id', 'text', 'label' and a fresh 0..n-1
        index, one row per row of `ids_df`, in the same order.

    Raises:
        IndexError: if some par_id of `ids_df` does not occur in `original_df`.
    """
    columns = ['par_id', 'text', 'label']
    par_ids = ids_df['par_id'].tolist()
    if not par_ids:
        # Keep the schema even when there is nothing to extract.
        return pd.DataFrame(columns=columns)

    # Build the par_id -> row lookup once instead of scanning the whole frame twice per
    # id; keep='first' mirrors taking the first matching row.
    lookup = original_df.drop_duplicates(subset='par_id', keep='first').set_index('par_id')
    positions = lookup.index.get_indexer(par_ids)  # -1 marks an id that is absent

    missing = [par_id for par_id, pos in zip(par_ids, positions) if pos < 0]
    if missing:
        raise IndexError(
            f"{len(missing)} par_id(s) not found in original_df, e.g. {missing[:5]}"
        )

    matched = lookup.iloc[positions]
    return pd.DataFrame({
        'par_id': par_ids,
        'text': matched['text'].to_numpy(),
        'label': matched['label'].to_numpy(),
    })

In [None]:
# Materialise the training split: one (par_id, text, label) row per id in train_ids.
train_df1 = extract_split_data(train_ids, dpm.train_task1_df)
train_df1

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...",1
1,4136,Durban 's homeless communities reconciliation ...,1
2,10352,The next immediate problem that cropped up was...,1
3,8279,Far more important than the implications for t...,1
4,1164,To strengthen child-sensitive social protectio...,1
...,...,...,...
8370,8380,Rescue teams search for survivors on the rubbl...,0
8371,8381,The launch of ' Happy Birthday ' took place la...,0
8372,8382,"The unrest has left at least 20,000 people dea...",0
8373,8383,You have to see it from my perspective . I may...,0


In [None]:
# Full dev split, plus a 100-row slice (positions 150-249), presumably for quick checks.
# reset_index gives the slice a fresh 0..n-1 index before it is passed on.
dev_set = extract_split_data(dev_ids, dpm.train_task1_df)
dev_set_short = extract_split_data(dev_ids[150:250].reset_index(drop=True), dpm.train_task1_df)
dev_set

Unnamed: 0,par_id,text,label
0,4046,We also know that they can benefit by receivin...,1
1,1279,Pope Francis washed and kissed the feet of Mus...,1
2,8330,Many refugees do n't want to be resettled anyw...,1
3,4063,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,"""In a 90-degree view of his constituency , one...",1
...,...,...,...
2089,10462,"The sad spectacle , which occurred on Saturday...",0
2090,10463,""""""" The Pakistani police came to our house and...",0
2091,10464,"""When Marie O'Donoghue went looking for a spec...",0
2092,10465,"""Sri Lankan norms and culture inhibit women fr...",0


## Downsample negative instances

In [None]:
# Rebalance the training split: keep every positive example and at most
# 2 * num_positive negatives.
# NOTE(review): the negatives kept are simply the first ones in dataframe order,
# not a random sample.

pos_samples = train_df1[train_df1.label == 1]
neg_samples = train_df1[train_df1.label == 0]

print("Number of positive samples:", len(pos_samples))
print("Number of negative samples:", len(neg_samples))

max_negatives = 2 * len(pos_samples)
training_set = pd.concat([pos_samples, neg_samples.iloc[:max_negatives]])

kept_negatives = training_set[training_set.label == 0]
print("Number of negative samples after downsampling:", len(kept_negatives))

training_set

Number of positive samples: 794
Number of negative samples: 7581
Number of negative samples after downsampling: 1588


Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...",1
1,4136,Durban 's homeless communities reconciliation ...,1
2,10352,The next immediate problem that cropped up was...,1
3,8279,Far more important than the implications for t...,1
4,1164,To strengthen child-sensitive social protectio...,1
...,...,...,...
2377,1775,Last but not the least element of culpability ...,0
2378,1776,"Then , taking the art of counter-intuitive non...",0
2379,1777,Kagunga village was reported to lack necessary...,0
2380,1778,"""After her parents high-profile divorce after ...",0


## BERT Tokeniser

In [None]:
from transformers import BertTokenizer
from transformers import BertPreTrainedModel, BertModel

In [None]:
# Load the tokenizer of the 'bert-base-cased' checkpoint (the same checkpoint name is
# used when the model is fine-tuned) and display its configuration.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## Dataset

In [None]:
from torch.utils.data import Dataset

class PatroniseDataset(Dataset):
    """Text/label dataset for the patronising-language classifier.

    Items are plain dicts ``{'text': ..., 'label': ...}``. Tokenisation is deferred to
    `collate_fn`, which encodes a whole batch at once.

    Args:
        tokenizer: callable used by `collate_fn` to encode a list of texts.
        input_set: DataFrame (or any mapping) exposing equally long 'text' and
            'label' columns.
    """

    def __init__(self, tokenizer, input_set):
        self.tokenizer = tokenizer
        # Copy the columns into plain lists so that items are addressed by POSITION.
        # Integer indexing on a pandas Series is label-based: it raises KeyError or
        # returns a different row whenever the frame's index is not exactly 0..n-1
        # (e.g. after filtering or concatenating frames).
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])
        print(len(self.texts))

    def collate_fn(self, batch):
        """Tokenise a list of items into one batch of tensors.

        Args:
            batch: list of dicts as returned by `__getitem__`.

        Returns:
            The tokenizer's encodings with an extra 'label' entry holding the labels
            as an int64 tensor.
        """
        texts = [item['text'] for item in batch]
        labels = [item['label'] for item in batch]

        # BERT accepts up to 512 tokens, but sequences are truncated to 128 tokens here.
        # padding=True pads shorter sequences to the longest sequence in THIS batch
        # (not to a fixed length of 128).
        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        encodings['label'] = torch.tensor(labels, dtype=torch.int64)

        return encodings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return item `idx` as ``{'text', 'label'}``.

        Raises:
            IndexError: if `idx` is out of range. Plain ``for`` iteration over the
                dataset relies on this to know where to stop.
        """
        size = len(self)
        if not -size <= idx < size:
            raise IndexError(f"index {idx} is out of range for a dataset of size {size}")
        return {'text': self.texts[idx],
                'label': self.labels[idx]}
    
# Wrap the downsampled training frame; the constructor prints the number of examples.
train_dataset = PatroniseDataset(tokenizer, training_set)

2382


In [None]:
# Sanity check: pull every item out of the dataset, collate the first ten and print
# each resulting tensor as a nested Python list.
batch = list(train_dataset)

encodings = train_dataset.collate_fn(batch[:10])

for field_name, field_tensor in encodings.items():
    print(f"{field_name}: {field_tensor.numpy().tolist()}")

tensor([  101,  1109,  5471,  1486,  1126,  3555,  4214,   117,  1288,  1482,
         1121,  2869,  2073,  1217,  1850,  1106,  2192,  1104,  1103,  1418,
         2813,  1206,  3598,  1105,  2424,   117,  1118,  2689,  3791,  1105,
        18844,  1150,  1163,  1152,  1156,  1730,  1618,  2491,   119,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0])


## BERT Model

In [None]:
class BERT_patronise(BertPreTrainedModel):
    """BERT encoder with a two-way classification head.

    The head is dropout (p=0.2) followed by a linear layer mapping
    ``config.hidden_size`` features to 2 logits.
    """

    def __init__(self, config):
        super().__init__(config)

        # Encoder built from the given config.
        self.bert = BertModel(config)
        
        # Classification head: dropout, then hidden_size -> 2 logits.
        self.projection = torch.nn.Sequential(torch.nn.Dropout(0.2),
                                              torch.nn.Linear(config.hidden_size, 2))
        
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """Run the encoder and return the head's logits.

        All arguments except `labels` are forwarded unchanged to the BERT encoder.
        `labels` is accepted but not used here; no loss is computed in this method.

        Returns:
            The output of `self.projection`, i.e. 2 logits per input sequence.
        """
 
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the encoder output feeds the head -- presumably BERT's pooled
        # [CLS] representation; confirm against the transformers BertModel docs.
        logits = self.projection(outputs[1])
        
        return logits

## Finetuning

In [None]:
from transformers import Trainer, TrainingArguments
import torch.nn as nn

class Trainer_patronise(Trainer):
    """Trainer whose loss is the cross-entropy between the model's logits and the
    'label' tensor that the data collator puts into each batch."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the classification loss for one batch.

        Args:
            model: model returning a logits tensor when called with the batch.
            inputs: batch mapping; its 'label' entry is removed (popped) and used as
                the target, everything else is passed to the model.
            return_outputs: when True, also return the model outputs. The base
                Trainer passes this flag, so it has to be accepted here.
            **kwargs: further keyword arguments that newer transformers versions pass
                (e.g. ``num_items_in_batch``); accepted and ignored.

        Returns:
            The scalar loss, or ``(loss, {'logits': logits})`` if `return_outputs`.
        """
        labels = inputs.pop('label')
        logits = model(**inputs)

        loss_fn = nn.CrossEntropyLoss()
        # Flatten to (N, num_classes); the class count is read from the logits instead
        # of being hard-coded.
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        if return_outputs:
            return loss, {'logits': logits}
        return loss

In [None]:
def main_patronise():
    """Fine-tune the custom BERT classifier on `train_dataset` and save it to disk."""
    # Initialise our custom architecture from the named pretrained checkpoint.
    classifier = BERT_patronise.from_pretrained("bert-base-cased")

    hyperparameters = TrainingArguments(
        output_dir='./experiment/patronise',
        learning_rate=0.0001,
        logging_steps=100,
        per_device_train_batch_size=32,
        num_train_epochs=3,
    )

    # The dataset's own collate_fn turns raw items into tokenised batches.
    fine_tuner = Trainer_patronise(
        model=classifier,
        args=hyperparameters,
        train_dataset=train_dataset,
        data_collator=train_dataset.collate_fn,
    )
    fine_tuner.train()

    fine_tuner.save_model('./models/bert_patronise_finetuned/')

In [None]:
# main_patronise() # Train/fine-tune the model

## Evaluation

In [None]:
def predict_patronise(inputs, tokenizer, model):
    """Classify one text or a list of texts with the fine-tuned model.

    Args:
        inputs: a string or list of strings to classify.
        tokenizer: callable that encodes `inputs` into a mapping of tensors.
        model: torch module returning a (batch, num_classes) logits tensor.

    Returns:
        dict with
          'prediction': tensor holding the index of the highest-scoring class per input;
          'confidence': tensor holding the softmax probability of that class (in [0, 1]).
    """
    model.eval()
    encodings = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Feed the inputs on whatever device the model lives on, so the function works
    # unchanged for CPU and GPU models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encodings = {name: tensor.to(first_param.device) for name, tensor in encodings.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encodings)
        # The class is chosen from the raw logits; softmax does not change which class
        # scores highest, it only turns the score into a probability.
        _, prediction = torch.max(logits, 1)
        probabilities = torch.softmax(logits, dim=1)
        confidence = probabilities.gather(1, prediction.unsqueeze(1)).squeeze(1)

    return {'prediction': prediction, 'confidence': confidence}

In [None]:
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

def evaluate(model, tokenizer, data_loader):
    """Run the model over a data loader and build a per-class classification report.

    Args:
        model: classifier passed on to `predict_patronise`.
        tokenizer: tokenizer passed on to `predict_patronise`.
        data_loader: iterable of batches exposing 'text' (raw texts) and 'label'
            (an object with `.tolist()`, e.g. a tensor of 0/1 labels).

    Returns:
        The dict produced by sklearn's `classification_report(output_dict=True)`,
        with the classes named "Not patronising" (0) and "Patronising" (1).
    """
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in tqdm(data_loader):
            pred = predict_patronise(data['text'], tokenizer, model)
            all_preds.extend(pred['prediction'].tolist())
            all_labels.extend(data['label'].tolist())

    # With the saved predictions and labels we can compute precision, recall and f1.
    # `labels` is given explicitly so the two target names always line up with classes
    # 0 and 1, even when one class is absent from the evaluated data (without it the
    # report raises on a class-count mismatch). zero_division=0 keeps the score of an
    # absent class at 0 without emitting a warning.
    report = classification_report(
        all_labels,
        all_preds,
        labels=[0, 1],
        target_names=["Not patronising", "Patronising"],
        output_dict=True,
        zero_division=0,
    )

    return report

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Reload the fine-tuned weights from the directory main_patronise() saves to.
# NOTE(review): the model is not moved to DEVICE here, so it presumably stays on the
# CPU for the evaluation below -- confirm whether that is intended.
model_name = './models/bert_patronise_finetuned/'
model = BERT_patronise.from_pretrained(model_name)

In [None]:
# Evaluate on the full dev split in batches of 8. No collate_fn is passed to the
# DataLoader, so batches carry the raw texts; evaluate() hands them on for encoding.
dev_dataset = PatroniseDataset(tokenizer, dev_set)
dev_loader = DataLoader(dev_dataset, batch_size=8)
report = evaluate(model, tokenizer, dev_loader)

2094


100%|██████████| 262/262 [08:09<00:00,  1.87s/it]


In [None]:
# Print the per-class metrics of the report, one "metric: value" line each.
# The second header starts with a newline to separate the two sections.
for header, class_name in (("Not patronising:", 'Not patronising'),
                           ("\nPatronising:", 'Patronising')):
    print(header)
    for metric, score in report[class_name].items():
        print(f"{metric:<10}: {score}")

Not patronising:
precision : 0.9607163489312536
recall    : 0.8775725593667546
f1-score  : 0.9172642029784887
support   : 1895

Patronising:
precision : 0.3608815426997245
recall    : 0.6582914572864321
f1-score  : 0.4661921708185052
support   : 199
