In [1]:
# Check which GPU (if any) is visible to this session before training
!nvidia-smi

Fri Mar  4 04:34:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P4000        Off  | 00000000:00:05.0 Off |                  N/A |
| 46%   39C    P0    28W / 105W |      0MiB /  8119MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers
!pip install pandas

!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
import torch

# Select the torch device string: first CUDA GPU when available, otherwise CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

if DEVICE == 'cuda:0':
    print("CUDA is available")
else:
    print('WARNING: You may want to change the runtime to GPU for faster training!')

CUDA is available


## Load training data

In [4]:
import pandas as pd
from dont_patronize_me import DontPatronizeMe
# Build the dataset helper; both arguments point at ./data
# NOTE(review): presumably the train and test data directories - confirm against dont_patronize_me.py
dpm = DontPatronizeMe('./data', './data')
# Load the task-1 data; the cells below read dpm.train_task1_df
# (presumably populated by this call - confirm)
dpm.load_task1()

In [5]:
dpm.train_task1_df

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


## Split into train and dev sets

In [6]:
# Read the paragraph ids that define the practice train / dev splits
practice_splits_dir = './data/practice_splits/'
train_ids = pd.read_csv(f"{practice_splits_dir}train_semeval_parids-labels.csv")
dev_ids = pd.read_csv(f"{practice_splits_dir}dev_semeval_parids-labels.csv")
# Cast par_id to str in both frames so the ids can be matched against the
# par_id column of the full dataset (presumably stored as strings there)
train_ids['par_id'] = train_ids['par_id'].astype(str)
dev_ids['par_id'] = dev_ids['par_id'].astype(str)
train_ids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [7]:
def extract_split_data(ids_df, original_df):
    """Collect the text and binary label for every paragraph id in a split.

    Args:
        ids_df: DataFrame with a 'par_id' column listing the paragraphs of the
            split (other columns, e.g. 'label', are ignored). Its index may be
            arbitrary, so slices do not have to be re-indexed first.
        original_df: full dataset with at least the columns 'par_id', 'text'
            and 'label'.

    Returns:
        DataFrame with columns 'par_id', 'text', 'label': one row per row of
        ids_df, in the same order, indexed 0..n-1.

    Raises:
        IndexError: if a par_id of ids_df does not occur in original_df.
    """
    columns = ['par_id', 'text', 'label']

    # Iterate over the id *values*; indexing ids_df.par_id[i] would be a
    # label lookup and fail for any frame whose index is not 0..n-1.
    par_ids = ids_df['par_id'].tolist()
    if not par_ids:
        return pd.DataFrame(columns=columns)

    # Build the par_id -> row lookup once instead of scanning original_df
    # twice per id. Keep the first occurrence of a duplicated par_id, which is
    # what selecting `.values[0]` of the matches did.
    lookup = original_df.drop_duplicates(subset='par_id', keep='first').set_index('par_id')

    missing = [par_id for par_id in par_ids if par_id not in lookup.index]
    if missing:
        raise IndexError(f"par_id(s) not found in original_df: {missing[:10]}")

    selected = lookup.loc[par_ids, ['text', 'label']]
    return pd.DataFrame({
        'par_id': par_ids,
        'text': selected['text'].to_numpy(),
        'label': selected['label'].to_numpy(),
    })

In [8]:
# Build the training split: par_id, text and binary label for every id in train_ids
train_set = extract_split_data(train_ids, dpm.train_task1_df)
train_set

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...",1
1,4136,Durban 's homeless communities reconciliation ...,1
2,10352,The next immediate problem that cropped up was...,1
3,8279,Far more important than the implications for t...,1
4,1164,To strengthen child-sensitive social protectio...,1
...,...,...,...
8370,8380,Rescue teams search for survivors on the rubbl...,0
8371,8381,The launch of ' Happy Birthday ' took place la...,0
8372,8382,"The unrest has left at least 20,000 people dea...",0
8373,8383,You have to see it from my perspective . I may...,0


In [9]:
# Build the full dev split
dev_set = extract_split_data(dev_ids, dpm.train_task1_df)
# 100-row subset (rows 150-249 of dev_ids); the slice's index is reset so it is labelled 0..99
dev_set_short = extract_split_data(dev_ids[150:250].reset_index(drop=True), dpm.train_task1_df)
dev_set

Unnamed: 0,par_id,text,label
0,4046,We also know that they can benefit by receivin...,1
1,1279,Pope Francis washed and kissed the feet of Mus...,1
2,8330,Many refugees do n't want to be resettled anyw...,1
3,4063,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,"""In a 90-degree view of his constituency , one...",1
...,...,...,...
2089,10462,"The sad spectacle , which occurred on Saturday...",0
2090,10463,""""""" The Pakistani police came to our house and...",0
2091,10464,"""When Marie O'Donoghue went looking for a spec...",0
2092,10465,"""Sri Lankan norms and culture inhibit women fr...",0


## Define downsampling/upsampling functions

In [10]:
def downsample(train_set, ratio=2):
    """Downsample the (majority) negative class to about ratio * num_positive rows.

    Args:
        train_set: DataFrame with a 'label' column (1 = positive, 0 = negative).
        ratio: number of negative samples to keep per positive sample. May be
            fractional; the resulting count is truncated to an integer.

    Returns:
        DataFrame with all positive rows followed by the first
        int(num_positive * ratio) negative rows, re-indexed 0..n-1 so that it
        can be addressed positionally.

    Raises:
        ValueError: if ratio is negative.
    """
    if ratio < 0:
        raise ValueError(f"ratio must be non-negative, got {ratio}")

    pos_samples = train_set[train_set.label == 1]
    neg_samples = train_set[train_set.label == 0]

    print("Number of positive samples:", len(pos_samples))
    print("Number of negative samples:", len(neg_samples))

    # int(): a fractional ratio would otherwise yield a float slice bound.
    # The selection is deterministic: negatives are kept in their original order.
    num_neg_kept = int(len(pos_samples) * ratio)
    # ignore_index: drop the source index labels so downstream positional /
    # label lookups on 0..n-1 both work.
    res = pd.concat([pos_samples, neg_samples.iloc[:num_neg_kept]], ignore_index=True)

    print("Number of negative samples after downsampling:", len(res[res.label==0]))

    return res

def upsample(train_set, ratio=2, random_state=None):
    """Resample the (minority) positive class to num_negative // ratio rows.

    Positive rows are drawn with replacement, so rows repeat whenever the
    target count exceeds the number of distinct positives.

    Args:
        train_set: DataFrame with a 'label' column (1 = positive, 0 = negative).
        ratio: number of negative samples per positive sample in the result.
        random_state: optional seed (or RandomState/Generator) forwarded to
            DataFrame.sample to make the draw reproducible. Defaults to None,
            i.e. a different draw on every call.

    Returns:
        DataFrame with the resampled positive rows followed by all negative
        rows, re-indexed 0..n-1 (sampling with replacement would otherwise
        leave duplicated index labels).

    Raises:
        ValueError: if positives are requested but train_set contains none.
    """
    pos_samples = train_set[train_set.label == 1]
    neg_samples = train_set[train_set.label == 0]

    print("Number of positive samples:", len(pos_samples))
    print("Number of negative samples:", len(neg_samples))

    # int(): floor division by a fractional ratio returns a float, which
    # DataFrame.sample does not accept as a row count.
    num_pos_target = int(len(neg_samples) // ratio)
    if num_pos_target > 0 and pos_samples.empty:
        raise ValueError("Cannot upsample: train_set contains no positive samples")

    pos_resampled = pos_samples.sample(n=num_pos_target, replace=True, random_state=random_state)
    res = pd.concat([pos_resampled, neg_samples], ignore_index=True)

    print("Number of positive samples after upsampling:", len(res[res.label==1]))

    return res

## Define Custom Dataset Class

In [11]:
from torch.utils.data import Dataset

class PatroniseDataset(Dataset):
    """Map-style dataset of raw texts and their binary labels.

    Items are returned untokenised as {'text': ..., 'label': ...}; tokenisation
    happens per batch in collate_fn.
    """

    def __init__(self, tokenizer, input_set):
        """
        Args:
            tokenizer: callable used by collate_fn to encode a list of texts.
            input_set: DataFrame (or mapping) with 'text' and 'label' entries.
        """
        self.tokenizer = tokenizer
        # Materialise as plain lists so that __getitem__ is positional.
        # Indexing a pandas Series with an int is a *label* lookup, which
        # breaks on frames whose index is not 0..n-1 (filtered frames) or has
        # duplicates (rows sampled with replacement).
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])

    def collate_fn(self, batch):
        """Tokenise a list of items and attach their labels as an int64 tensor."""
        texts = [item['text'] for item in batch]
        labels = [item['label'] for item in batch]

        # The maximum sequence size for BERT is 512, but here the tokenizer
        # truncates inputs longer than 128 tokens. padding=True pads shorter
        # inputs to the longest sequence in the batch (not to a fixed 128).
        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        encodings['labels'] = torch.tensor(labels, dtype=torch.int64)

        return encodings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the idx-th item; raises IndexError past the end."""
        if idx >= len(self):
            raise IndexError
        return {'text': self.texts[idx],
                'label': self.labels[idx]}

----------------------------------------------------------------------

Models:
- BERT
- RoBERTa
- XLNet
- DeBERTa

For each model, tune:
- learning_rate
- batch_size
- num_epochs
- downsampling/upsampling
- loss weighting of pos/neg samples

## AUTO

### Imports

In [20]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

### Define Evaluation functions

In [21]:
def predict_patronise(inputs, tokenizer, model):
    """Predict the class index for each input text.

    Args:
        inputs: list of strings to classify.
        tokenizer: callable that encodes `inputs` into a mapping of tensors
            supporting `.to(device)`.
        model: torch module whose output exposes `.logits`.

    Returns:
        1-D tensor with the argmax class index per input.

    Side effects:
        Switches `model` to eval mode.
    """
    model.eval()
    encodings = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Put the inputs on whatever device the model actually lives on, rather
    # than assuming a global device that may not match.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encodings = encodings.to(first_param.device)

    # Inference only: never build an autograd graph, even if the caller
    # forgot to wrap the call in torch.no_grad().
    with torch.no_grad():
        output = model(**encodings)

    return torch.argmax(output.logits, dim=-1)

In [22]:
def evaluate(model, tokenizer, data_loader):
    """Run the model over a data loader and build a per-class metrics report.

    Args:
        model: classifier passed through to predict_patronise.
        tokenizer: tokenizer passed through to predict_patronise.
        data_loader: iterable of batches shaped like {'text': [...], 'label': tensor}.

    Returns:
        dict as produced by sklearn's classification_report(output_dict=True),
        with per-class entries under 'Not patronising' and 'Patronising'.
    """
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in tqdm(data_loader):
            pred = predict_patronise(data['text'], tokenizer, model)
            all_preds += pred.tolist()
            all_labels += data['label'].tolist()

    # With the saved predictions and labels we can compute precision, recall and f1-score.
    # labels=[0, 1] pins both classes so the report keeps both named entries even
    # when one class never occurs in the labels or predictions (otherwise the
    # two target_names no longer match the classes found and sklearn raises).
    # zero_division=0 reports 0.0 for undefined metrics, the same value as the
    # default, without emitting a warning per call.
    report = classification_report(
        all_labels,
        all_preds,
        labels=[0, 1],
        target_names=["Not patronising", "Patronising"],
        output_dict=True,
        zero_division=0,
    )

    return report

In [23]:
def display_report(report):
    """Print the per-class metrics of a classification report dict.

    Expects the keys 'Not patronising' and 'Patronising', each mapping metric
    names to values, and finishes with the positive-class f1-score.
    """
    def _print_metrics(class_name):
        # one "name      : value" line per metric, names left-aligned to 10 chars
        for metric, value in report[class_name].items():
            print(f"{metric:<10}: {value}")

    print("Not patronising:")
    _print_metrics('Not patronising')

    print("\nPatronising:")
    _print_metrics('Patronising')

    print("\n\nThe f1-score we care about:", report['Patronising']['f1-score'])

### Define overall pipeline

In [24]:
def generate_model_filename(model_name, learning_rate, batch_size, num_epochs, sampling, sampling_ratio):
    """Build the directory path under ./models/ for a given training configuration.

    The sampling suffix is 'downsampling_<ratio>' when sampling == 1,
    'upsampling_<ratio>' for any other truthy value, and empty when sampling
    is falsy.
    """
    if not sampling:
        sampling_suffix = ""
    elif sampling == 1:
        sampling_suffix = f"downsampling_{sampling_ratio}"
    else:
        sampling_suffix = f"upsampling_{sampling_ratio}"

    name_parts = (model_name, learning_rate, batch_size, num_epochs, sampling_suffix)
    joined = "_".join(f"{part}" for part in name_parts)
    return "./models/patronise_" + joined + "/"

In [55]:
from os.path import exists
results_filename = "./results"

def load_results():
    """Load previously saved sweep results from `results_filename`.

    Returns:
        DataFrame of results with the hyperparameter columns cast to their
        numeric types, or an empty DataFrame when the file does not exist or
        contains no data yet.
    """
    if not exists(results_filename):
        return pd.DataFrame()
    try:
        # round_trip: parse floats back to exactly the value that was written,
        # so equality checks on learning_rate against in-memory values are reliable.
        results = pd.read_csv(results_filename, float_precision='round_trip')
    except pd.errors.EmptyDataError:
        # File exists but has no header/rows: treat it like "no results yet".
        return pd.DataFrame()
    return results.astype({
        'learning_rate': float,
        'batch_size': int,
        'num_epochs': int,
        'sampling': int,
        'sampling_ratio': int,
    })


def save_results(results):
    """Write the results DataFrame to `results_filename` as CSV, without the index."""
    results.to_csv(path_or_buf=results_filename, index=False)

In [26]:
def main_patronise(pretrained_model_name='bert-base-cased', learning_rate=0.0001, batch_size=32, num_epochs=3, sampling=1, sampling_ratio=2, save_model=False):
    """Fine-tune a pretrained model on the training split and evaluate it on the dev split.

    Reads the module-level `train_set` and `dev_set` DataFrames.

    Args:
        pretrained_model_name: checkpoint name passed to the Auto* `from_pretrained` loaders
        learning_rate: learning rate passed to TrainingArguments
        batch_size: per-device training batch size
        num_epochs: number of training epochs
        sampling: (0 = use original data) (1 = downsample majority class) (2 = upsample minority class)
        sampling_ratio: the ratio of negative (majority) samples compared to positive (minority) samples
        save_model: boolean value indicating whether or not to save model parameters/weights

    Returns:
        The classification report dict produced by `evaluate` on the dev set.
    """

    # create tokenizer for specified pretrained model
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

    # upsample / downsample
    # sampling_ratio must be forwarded explicitly; otherwise the helpers fall
    # back to their default ratio and the recorded hyperparameter has no effect.
    if sampling == 1:
        train_set_sampled = downsample(train_set, ratio=sampling_ratio)
    elif sampling == 2:
        train_set_sampled = upsample(train_set, ratio=sampling_ratio)
    else:
        train_set_sampled = train_set

    # create dataset
    train_dataset = PatroniseDataset(tokenizer, train_set_sampled)

    # create classification model with specified pretrained model
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=2)

    # train model
    training_args = TrainingArguments(
        output_dir = './experiment/patronise',
        learning_rate = learning_rate,
        logging_steps = 100,
        per_device_train_batch_size = batch_size,
        num_train_epochs = num_epochs,
    )
    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        data_collator = train_dataset.collate_fn
    )
    trainer.train()

    # save model parameters if applicable
    if save_model:
        model_filename = generate_model_filename(pretrained_model_name, learning_rate, batch_size, num_epochs, sampling, sampling_ratio)
        trainer.save_model(model_filename)

    # evaluate model on dev dataset
    dev_dataset = PatroniseDataset(tokenizer, dev_set)
    dev_loader = DataLoader(dev_dataset, batch_size=8)
    report = evaluate(model, tokenizer, dev_loader)

    return report

### Base-model-choosing and Hyperparameter tuning

In [32]:
from itertools import product

# Search space: every combination of the lists below is one training run
model_names = ['bert-base-cased']
learning_rates = [0.0001, 0.0002, 0.0005, 0.001]
batch_sizes = [32]
num_epochses = [3]
samplings = [1]
sampling_ratios = [2]

# Results of earlier runs, used to skip configurations that were already evaluated
reports = load_results()

# product() iterates in the same order as the equivalent nested loops
# (rightmost list varies fastest).
hyperparam_grid = product(model_names, learning_rates, batch_sizes, num_epochses, samplings, sampling_ratios)

for pretrained_model_name, learning_rate, batch_size, num_epochs, sampling, sampling_ratio in hyperparam_grid:
    # configurations without sampling (sampling == 0) are not part of this sweep
    if not sampling:
        continue

    already_exists = not reports.empty and not reports.loc[(reports['pretrained_model_name']==pretrained_model_name) &
                                                   (reports['learning_rate']==learning_rate) &
                                                   (reports['batch_size']==batch_size) &
                                                   (reports['num_epochs']==num_epochs) &
                                                   (reports['sampling']==sampling) &
                                                   (reports['sampling_ratio']==sampling_ratio)].empty
    if already_exists:
        continue

    report = main_patronise(pretrained_model_name=pretrained_model_name,
                            learning_rate=learning_rate,
                            batch_size=batch_size,
                            num_epochs=num_epochs,
                            sampling=sampling,
                            sampling_ratio=sampling_ratio)
    row = {'pretrained_model_name': pretrained_model_name,
           'learning_rate': learning_rate,
           'batch_size': batch_size,
           'num_epochs': num_epochs,
           'sampling': sampling,
           'sampling_ratio': sampling_ratio}
    print(row)
    display_report(report)
    for k, v in report['Not patronising'].items():
        row[f"neg_{k}"] = v
    for k, v in report['Patronising'].items():
        row[f"pos_{k}"] = v

    # DataFrame.append is deprecated and removed in pandas 2.0; use concat.
    # Start from the new row alone when there are no earlier results, which
    # avoids concatenating with an empty, column-less frame.
    row_df = pd.DataFrame([row])
    reports = row_df if reports.empty else pd.concat([reports, row_df], ignore_index=True)
    # Persist after every run so an interrupted sweep can resume where it stopped
    save_results(reports)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/main/voc

Number of positive samples: 794
Number of negative samples: 7581
Number of negative samples after downsampling: 1588


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Step,Training Loss
100,0.647
200,0.6413




Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 262/262 [00:14<00:00, 18.11it/s]
  reports = reports.append(row, ignore_index=True)


{'pretrained_model_name': 'bert-base-cased', 'learning_rate': 0.0002, 'batch_size': 32, 'num_epochs': 3, 'sampling': 1, 'sampling_ratio': 2}
Not patronising:
precision : 0.9067182213629773
recall    : 0.9899736147757255
f1-score  : 0.9465186680121089
support   : 1895

Patronising:
precision : 0.24
recall    : 0.03015075376884422
f1-score  : 0.05357142857142856
support   : 199


The f1-score we care about: 0.05357142857142856


In [33]:
reports

Unnamed: 0,pretrained_model_name,learning_rate,batch_size,num_epochs,sampling,sampling_ratio,neg_precision,neg_recall,neg_f1-score,neg_support,pos_precision,pos_recall,pos_f1-score,pos_support
0,bert-base-cased,0.0001,32,3,1,2,0.959777,0.906596,0.932429,1895,0.417763,0.638191,0.50497,199
1,bert-base-cased,0.001,32,3,1,2,0.904967,1.0,0.950113,1895,0.0,0.0,0.0,199
2,bert-base-cased,0.0005,32,3,1,2,0.904967,1.0,0.950113,1895,0.0,0.0,0.0,199
3,bert-base-cased,0.0002,32,3,1,2,0.906718,0.989974,0.946519,1895,0.24,0.030151,0.053571,199


### Train with best model and optimal hyperparameters

In [73]:
reports = load_results()
# idxmax() returns an index *label*, so select the row with .loc rather than
# positional .iloc; the double brackets keep the result a one-row DataFrame.
optimal_hyperparams = reports.loc[[reports['pos_f1-score'].idxmax()]]

print("Best base model:", optimal_hyperparams['pretrained_model_name'].item())
print("\nOptimal hyperparameters:")
print("learning_rate:", optimal_hyperparams['learning_rate'].item())
print("batch_size:", optimal_hyperparams['batch_size'].item())
print("num_epochs:", optimal_hyperparams['num_epochs'].item())
print("sampling:", optimal_hyperparams['sampling'].item())
print("sampling_ratio:", optimal_hyperparams['sampling_ratio'].item())

Best base model: bert-base-cased

Optimal hyperparameters:
learning_rate: 0.0001
batch_size: 32
num_epochs: 3
sampling: 1
sampling_ratio: 2


In [69]:
# Pull each hyperparameter out of the single best row as a plain Python scalar
# and pass them to main_patronise as keyword arguments.
best_kwargs = {
    name: optimal_hyperparams[name].item()
    for name in ('pretrained_model_name', 'learning_rate', 'batch_size',
                 'num_epochs', 'sampling', 'sampling_ratio')
}
report = main_patronise(**best_kwargs)
display_report(report)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/main/voc

Number of positive samples: 794
Number of negative samples: 7581
Number of negative samples after downsampling: 1588


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Step,Training Loss
100,0.4638
200,0.1882




Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 262/262 [00:14<00:00, 18.07it/s]

Not patronising:
precision : 0.9599096555618295
recall    : 0.8970976253298153
f1-score  : 0.9274413529732678
support   : 1895

Patronising:
precision : 0.39628482972136225
recall    : 0.6432160804020101
f1-score  : 0.4904214559386973
support   : 199


The f1-score we care about: 0.4904214559386973



