# Analysis

## Imports and data
### Imports

In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
import tensorflow as tf


import torch
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer
from transformers import BertPreTrainedModel, BertModel

from transformers import RobertaTokenizer
from transformers import RobertaPreTrainedModel, RobertaModel

from transformers import DebertaTokenizer
from transformers import DebertaPreTrainedModel, DebertaModel

import nlpaug.augmenter.word as naw
from sklearn.utils import shuffle
from ray import tune

import pandas as pd
import numpy as np
import os

import random

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Pick the compute device once: the first CUDA GPU when one is visible,
# otherwise the CPU (with a hint that training will be slow).
if torch.cuda.is_available():
  DEVICE = 'cuda:0'
else:
  print('WARNING: You may want to change the runtime to GPU for faster training!')
  DEVICE = 'cpu'

ModuleNotFoundError: ignored

In [2]:
# Logging setup: INFO for the root logger, but only WARNING and above from
# the (very chatty) "transformers" logger.
transformers_logger = logging.getLogger("transformers")
logging.basicConfig(level=logging.INFO)
transformers_logger.setLevel(logging.WARNING)


NameError: ignored

### Data Loading and Processing

In [None]:
import os
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer


# Class from 'dont_patronize_me.py' to merge labels into binary classifications

class DontPatronizeMe:
  """Loader for the task 1 (binary PCL) data of a tab-separated dataset file.

  Attributes:
    train_path: path of the TSV file read by `load_task1` (used exactly as given).
    test_path: stored but not used by the code in this class.
    train_task1_df: DataFrame filled by `load_task1` (None until then).
    train_task2_df: placeholder, never filled by this class.
    test_set_df: placeholder, never filled by this class.
  """

  # Number of leading lines of the TSV that are skipped before the first data
  # row (presumably the dataset's disclaimer header - confirm against the file).
  _HEADER_LINES = 4

  # Column order of the DataFrame produced by `load_task1`.
  _TASK1_COLUMNS = ['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']

  def __init__(self, train_path, test_path):

    self.train_path = train_path
    self.test_path = test_path
    self.train_task1_df = None
    self.train_task2_df = None
    self.test_set_df = None

  def load_task1(self):
    """
    Load the task 1 training set and convert the tags into binary labels.

    Paragraphs with original labels of 0 or 1 are considered to be negative examples of PCL and get the label 0 = negative.
    Paragraphs with any other original label (2, 3 or 4 in the dataset) are considered positive and get the label 1 = positive.

    Each data row is split on tabs: the first five fields are par_id, art_id,
    keyword, country and text; the original label is read from the LAST field.
    All values except the binary `label` are kept as strings. Blank lines are
    skipped.

    The resulting DataFrame is stored in `self.train_task1_df` and also returned.

    Raises:
      IndexError: if a non-blank data row has fewer than five tab-separated fields.
    """
    rows = []
    # The path is used exactly as given (no directory joining).
    with open(self.train_path, encoding='utf-8') as f:
      for line_no, line in enumerate(f):
        if line_no < self._HEADER_LINES:
          continue
        stripped = line.strip()
        if not stripped:
          # A blank line carries no record; indexing its fields would fail.
          continue
        fields = stripped.split('\t')
        orig_label = fields[-1]
        rows.append({
          'par_id': fields[0],
          'art_id': fields[1],
          'keyword': fields[2],
          'country': fields[3],
          'text': fields[4],
          'label': 0 if orig_label in ('0', '1') else 1,
          'orig_label': orig_label,
        })
    df = pd.DataFrame(rows, columns=self._TASK1_COLUMNS)
    self.train_task1_df = df
    return df

 

Loading the data in

In [None]:
# The dataset path is hard-coded here; it is used exactly as written.
dpm = DontPatronizeMe('dontpatronizeme_pcl.tsv', None)
dpm.load_task1()

# Fail fast if either official split file is missing. The files are only
# opened to check they are readable, so close the handles right away instead
# of leaving them open.
for split_file in ('train_semeval_parids-labels.csv', 'dev_semeval_parids-labels.csv'):
  with open(split_file):
    pass

In [None]:
# Paragraph ids of the official train / dev splits. The ids are cast to str
# because load_task1 keeps par_id as a string.
trids = pd.read_csv('train_semeval_parids-labels.csv')
trids['par_id'] = trids['par_id'].astype(str)

teids = pd.read_csv('dev_semeval_parids-labels.csv')
teids['par_id'] = teids['par_id'].astype(str)

Make the data

In [None]:
# Rebuild the official train and dev ("test") sets without keyword and
# country codes: only the paragraph text and its binary label are kept.
def _rebuild_split(split_ids):
  """Return a DataFrame with `texts` and `labels` for the ids of one split.

  Args:
    split_ids: DataFrame with a `par_id` column; rows are emitted in its order.

  Returns:
    DataFrame with columns `texts` and `labels`, one row per id.

  Raises:
    KeyError: if an id is not present in `dpm.train_task1_df`.
  """
  source = dpm.train_task1_df
  # Build the par_id lookup once instead of scanning the whole dataset twice
  # per id. `setdefault` keeps the first row of a repeated par_id, which is
  # what the former `.values[0]` lookup selected.
  first_by_id = {}
  for par_id, text, label in zip(source['par_id'], source['text'], source['label']):
    first_by_id.setdefault(par_id, (text, label))
  selected = [first_by_id[par_id] for par_id in split_ids['par_id']]
  return pd.DataFrame(selected, columns=['texts', 'labels'])


trdf1 = _rebuild_split(trids)

# Rebuild Official Test Set
tedf1 = _rebuild_split(teids)

We also rebuild the dataset for particular label types and lengths: this is to answer question 3.

In [None]:
# Rebuild Official Test for particular scores
def rebuild_test(score):
  """Return the dev-set paragraphs whose original annotation score equals `score`.

  Args:
    score: original label to keep. Original labels are stored as strings, so
      the value is compared as `str(score)` (both '3' and 3 work).

  Returns:
    DataFrame with columns `texts` and `labels` (binary label), in the order
    of `teids`. The columns are present even when no paragraph matches, so the
    result can always be wrapped in a DpmDataset.

  Raises:
    KeyError: if a dev par_id is not present in `dpm.train_task1_df`.
  """
  source = dpm.train_task1_df
  # One pass to index the dataset by par_id (first row wins for a repeated
  # id), instead of three full scans for every dev id.
  first_by_id = {}
  for par_id, text, label, orig_label in zip(
      source['par_id'], source['text'], source['label'], source['orig_label']):
    first_by_id.setdefault(par_id, (text, label, orig_label))

  wanted = str(score)
  kept = []
  for par_id in teids['par_id']:
    text, label, orig_label = first_by_id[par_id]
    if orig_label == wanted:
      kept.append((text, label))
  return pd.DataFrame(kept, columns=['texts', 'labels'])

# One dev subset per original annotation score, built in the order 0..4.
(testset_score_0,
 testset_score_1,
 testset_score_2,
 testset_score_3,
 testset_score_4) = (rebuild_test(score) for score in ('0', '1', '2', '3', '4'))

In [None]:
# Rebuild Official Test for particular text lengths
def rebuild_test_size(size):
  """Return the dev-set paragraphs whose text length falls in a given range.

  Args:
    size: pair `(lower, upper)`; a paragraph is kept when
      `lower < len(text) <= upper` (length in characters).

  Returns:
    DataFrame with columns `texts` and `labels` (binary label), in the order
    of `teids`. The columns are present even when the range is empty, so the
    result can always be wrapped in a DpmDataset.

  Raises:
    KeyError: if a dev par_id is not present in `dpm.train_task1_df`.
  """
  lower, upper = size[0], size[1]
  source = dpm.train_task1_df
  # One pass to index the dataset by par_id (first row wins for a repeated
  # id), instead of several full scans for every dev id.
  first_by_id = {}
  for par_id, text, label in zip(source['par_id'], source['text'], source['label']):
    first_by_id.setdefault(par_id, (text, label))

  kept = []
  for par_id in teids['par_id']:
    text, label = first_by_id[par_id]
    if lower < len(text) <= upper:
      kept.append((text, label))
  return pd.DataFrame(kept, columns=['texts', 'labels'])

In [None]:
# Ten consecutive length bins of width 50: (0, 50], (50, 100], ..., (450, 500].
testset_scores_length = [
  rebuild_test_size([start, start + 50]) for start in range(0, 500, 50)
]

The split is deterministic because random_state = 10.

In [None]:
# Hold out 20% of the rebuilt training set for validation (fixed seed), then
# renumber both parts from zero so rows can be addressed by position.
trainset, valset = train_test_split(trdf1, test_size=0.2, random_state=10)

trainset = trainset.reset_index(drop=True)
valset = valset.reset_index(drop=True)


### Class to handle data

In [None]:
# Define DpmDataset class which allows us to control how we handle the iteration and batches
# From lab 5
# At each iteration over the dataset object, the function ''__getitem__'' is called and returns a dictionary with one paragraph and its label
# Then, the ''collate_fn'' function will process a list of such samples into their encodings and return a batch when called by the iterator during training

class DpmDataset(torch.utils.data.Dataset):
    """Map-style dataset of (paragraph text, label) samples.

    Args:
        tokenizer: callable used by `collate_fn` to encode a list of texts.
        input_set: mapping-like object (e.g. a DataFrame) with a `texts` and a
            `labels` entry of equal length.
        max_length: maximum number of tokens kept by `collate_fn`; longer
            inputs are truncated. Defaults to 450.
    """

    def __init__(self, tokenizer, input_set, max_length=450):

        self.tokenizer = tokenizer
        # Copy into plain lists so samples are addressed by position; a pandas
        # Series would be addressed by index label and break on any frame
        # whose index is not 0..n-1.
        self.texts = list(input_set['texts'])
        self.labels = list(input_set['labels'])
        self.max_length = max_length

    def collate_fn(self, batch):
        """Encode a list of samples (as returned by `__getitem__`) into one batch.

        The texts are tokenized together, truncated to `max_length` tokens and
        padded to the longest sequence of the batch; the labels are added to
        the encodings under the key 'label' as a tensor.
        """
        texts = [sample['text'] for sample in batch]
        labels = [sample['label'] for sample in batch]

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True,
                                   truncation=True, max_length=self.max_length)
        encodings['label'] = torch.tensor(labels)
        return encodings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return sample `idx` as `{'text': ..., 'label': ...}`.

        Raises:
            IndexError: if `idx` is out of range (raised by the list lookup),
                so that plain iteration over the dataset terminates.
        """
        return {'text': self.texts[idx],
                'label': self.labels[idx]}

In [None]:
# Load the pre-trained tokenizer that belongs to the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Wrap the validation split in a DpmDataset.
# (The training dataset built from the down-sampled / augmented set is disabled here.)
#train_dataset = DpmDataset(tokenizer, trainset_downsample_aug) # with downsampling and augmentation
val_dataset = DpmDataset(tokenizer=tokenizer, input_set=valset)

## Model

In [None]:
# Define our model on top of BERT for classification

class BERT_dpm(BertPreTrainedModel):
    """BERT encoder followed by a dropout + linear head with two outputs.

    To try another backbone, derive from RobertaPreTrainedModel or
    DebertaPreTrainedModel instead and build `self.bert` from RobertaModel or
    DebertaModel respectively.
    """

    def __init__(self, config):
        super().__init__(config)

        # Encoder (attribute name kept as `bert`).
        self.bert = BertModel(config)

        # Classification head: dropout (p=0.2), then a linear layer mapping
        # the hidden size to the two classes.
        head_dropout = torch.nn.Dropout(0.2)
        head_linear = torch.nn.Linear(config.hidden_size, 2)
        self.projection = torch.nn.Sequential(head_dropout, head_linear)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """Run the encoder and return the two-class logits.

        `labels` is accepted but not used: no loss is computed here. All other
        arguments are forwarded unchanged to the encoder.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the encoder output feeds the head (for BertModel this
        # is the pooled sequence representation).
        pooled = encoder_outputs[1]
        return self.projection(pooled)

## Evaluation Functions

In [None]:
# Function to extract predicted label

def predict_dpm(input, tokenizer, model):
  """Predict the class of one or more texts.

  Args:
    input: a text or list of texts, passed straight to `tokenizer`.
    tokenizer: callable returning a mapping of tensors that is unpacked as
      keyword arguments into `model` (inputs are truncated to 450 tokens).
    model: torch module returning one row of class scores per input.

  Returns:
    dict with
      'prediction': tensor holding the index of the highest score per input,
      'confidence': tensor holding that highest score itself. This is the raw
        model output, not a probability.
  """
  model.eval()
  encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=450)

  # Run the inputs on the same device as the model; otherwise a model that
  # was moved to the GPU fails on CPU tensors.
  first_param = next(model.parameters(), None)
  if first_param is not None:
    encodings = {
      name: value.to(first_param.device) if hasattr(value, 'to') else value
      for name, value in encodings.items()
    }

  # Inference only: do not record the autograd graph.
  with torch.no_grad():
    output = model(**encodings)

  # torch.max over dim 1 yields (values, indices).
  confidence, prediction = torch.max(output, 1)
  return {'prediction': prediction, 'confidence': confidence}

In [None]:
# Function to evaluate model on validation set

def evaluate(model, tokenizer, data_loader):
  """Evaluate `model` on every batch of `data_loader`.

  Args:
    model: classifier passed to `predict_dpm`.
    tokenizer: tokenizer passed to `predict_dpm`.
    data_loader: iterable of batches; each batch must provide the raw texts
      under 'text' and a tensor of gold labels under 'label'.

  Returns:
    The `classification_report` dictionary (output_dict=True) with the classes
    named "negative" (0) and "positive" (1).
  """
  preds = []
  tot_labels = []

  with torch.no_grad():
    for data in tqdm(data_loader):
      pred = predict_dpm(data['text'], tokenizer, model)

      # `extend` keeps both lists flat, one entry per sample, so the report
      # is also correct for batch sizes larger than one.
      preds.extend(pred['prediction'].tolist())
      tot_labels.extend(data['label'].tolist())

  # With the saved predictions and labels we can compute accuracy, precision,
  # recall and f1-score. `labels=[0, 1]` pins both classes: without it the
  # report raises when a subset contains (and the model predicts) only one
  # class, because `target_names` would not match the classes found.
  report = classification_report(tot_labels, preds, labels=[0, 1],
                                 target_names=["negative","positive"], output_dict= True)

  return report

## Analysis

In [None]:
# Load the fine-tuned classifier from disk, together with the tokenizer of
# the 'bert-base-cased' checkpoint.
model_name = './models/dpm_bert_finetuned/'
model = BERT_dpm.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Evaluate performance for different patronizing scores.

### 3.1

In [None]:
# Evaluate the model separately on the dev paragraphs with original score 2, 3 and 4.
test_score_sets = [testset_score_2, testset_score_3, testset_score_4]
reports = []
for test_score in test_score_sets:
  test_loader = DataLoader(DpmDataset(tokenizer, test_score))
  reports.append(evaluate(model, tokenizer, test_loader))

In [None]:
# Show the positive-class metrics of every report.
for positive_metrics in (entry['positive'] for entry in reports):
  print(positive_metrics)

### 3.2

In [None]:
# Evaluate the model on each text-length bin and print the positive-class F1 as we go.
reports = []
for length_subset in testset_scores_length:
  length_loader = DataLoader(DpmDataset(tokenizer, length_subset))
  reports.append(evaluate(model, tokenizer, length_loader))
  print(reports[-1]['positive']['f1-score'])

In [None]:
# Collect the positive-class metrics of every report, one list per metric.
positive_rows = [entry['positive'] for entry in reports]
recalls = [row['recall'] for row in positive_rows]
f1s = [row['f1-score'] for row in positive_rows]
precisons = [row['precision'] for row in positive_rows]

In [None]:
# Length bins of width 50 covering (0, 500].
# NOTE(review): this repeats the bin construction done in an earlier cell.
testset_scores_length = list(
  map(rebuild_test_size, ([low, low + 50] for low in range(0, 500, 50)))
)

In [None]:
# Print the lower bound of every length bin (0, 50, ..., 450).
for bin_start in range(0, 500, 50):
  print(bin_start)

In [None]:
import matplotlib.pyplot as plt
#plt.figure(figsize=(10,5))

# Bar chart of the positive-class F1 score per text-length bin; bars are
# centred on the middle of each 50-wide bin.
# Matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and 3.8 removed
# the old name, so use whichever one this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.bar(range(25,525, 50), f1s, width = 49)
plt.xlabel("Length of text")
plt.ylabel("F1 score")

#plt.xticks(x_pos, x)
plt.show()

In [None]:
import matplotlib.pyplot as plt
#plt.figure(figsize=(10,5))

# Bar chart of the positive-class recall per text-length bin; bars are
# centred on the middle of each 50-wide bin.
# Matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and 3.8 removed
# the old name, so use whichever one this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.bar(range(25,525, 50), recalls, width = 49)
plt.xlabel("Length of text")
plt.ylabel("Recall score")

plt.show()

In [None]:
## import matplotlib.pyplot as plt
#plt.figure(figsize=(10,5))

# Overlay of F1, recall and precision per text-length bin; the bars share the
# same positions and are told apart by colour and transparency.
# Matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and 3.8 removed
# the old name, so use whichever one this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.bar(range(25,525, 50), f1s, width = 49, alpha = 1, color = 'b', label = 'F1')
plt.bar(range(25,525, 50), recalls, width = 49, alpha = 0.5, color='g', label = 'Recall')
# NOTE(review): the legend label below is misspelled ('Precison'); left unchanged here.
plt.bar(range(25,525, 50), precisons, width = 49, alpha = 0.7, color='r', label = 'Precison')
plt.xlabel("Length of text")
plt.legend(fontsize = 10)

plt.show()