

# Detecting PCL (Patronizing and Condescending Language) | Training Model

### Importing the libraries

In [None]:
# Install the Hugging Face `transformers` and `datasets` packages (no versions are pinned here).
! pip install transformers datasets

In [None]:
# Install TensorFlow pinned to version 2.8.0.
!pip install tensorflow==2.8.0

In [None]:
import os.path
import numpy as np
import tensorflow as tf


In [None]:
import pandas as pd

In [None]:
from urllib import request

# Download the `dont_patronize_me.py` helper module into the working directory
# so that it can be imported by the next cell.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]  # last path component: 'dont_patronize_me.py'
print(f'Fetching {module_url}')
# Copy the response bytes to disk unchanged. Binary mode avoids decoding the
# payload and re-encoding it with the platform default codec (which is not
# UTF-8 everywhere) and avoids newline translation.
with request.urlopen(module_url) as response, open(module_name, 'wb') as outf:
  outf.write(response.read())

In [None]:
# Import the loader class from the `dont_patronize_me.py` file downloaded in the previous cell.
from dont_patronize_me import DontPatronizeMe
# Both arguments are positional; presumably the data directory ('.') and the
# name of the paragraph TSV file - confirm against the module's constructor.
dpm = DontPatronizeMe('.', 'dontpatronizeme_pcl.tsv')

## Part 1: Data Preprocessing

### Loading the PCL dataset

In [None]:
# Load the task-1 data. Later cells read `dpm.train_task1_df` (columns used:
# par_id, text, keyword), which this call presumably populates - confirm in the module.
dpm.load_task1()


In [None]:
# Read the train / dev split files; each provides a `par_id` and a `label` column.
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

# Paragraph ids are compared with `dpm.train_task1_df.par_id` in later cells,
# so they are cast to strings here (presumably to match that column's dtype).
trids['par_id'] = trids['par_id'].astype(str)
teids['par_id'] = teids['par_id'].astype(str)

# Re-assign the label columns from their NumPy array form.
# NOTE(review): this does not parse the values; later cells still index each
# label character by character, so it appears to remain a string - confirm.
trids['label'] = np.asarray(trids['label'])
teids['label'] = np.asarray(teids['label'])

In [None]:
# Build one record per training paragraph: id, text, keyword and one integer
# 0/1 flag for each of the seven PCL categories.
#
# Character offsets of the seven digits inside a label value. This assumes the
# CSV stores each label as the text of a list such as "[1, 0, 0, 1, 0, 0, 0]",
# i.e. one digit every three characters after the opening bracket.
label_offsets = {'unb': 1, 'com': 4, 'pre': 7, 'aut': 10, 'sha': 13, 'met': 16, 'merr': 19}

rows = [] # will contain par_id, text, keyword and the seven category flags
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  label = trids.label[idx]
  # look the paragraph up once in the original dataset to retrieve `text` and `keyword`
  source_row = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].iloc[0]
  row = {
      'par_id': parid,
      'text': source_row['text'],
      'keyword': source_row['keyword'],
  }
  # int() turns the '0' / '1' characters into numbers, so the columns can be
  # summed and used as training targets. Without it the columns hold strings,
  # and `+` on them concatenates instead of adding. A non-digit character
  # raises ValueError, which exposes an unexpected label format immediately.
  for category, offset in label_offsets.items():
    row[category] = int(label[offset])
  rows.append(row)


  

In [None]:
# Training frame: one row per record collected in `rows` by the previous cell.
trdf1 = pd.DataFrame(rows)

In [None]:
# Build one record per dev paragraph: id, text, keyword and one integer 0/1
# flag for each of the seven PCL categories.
#
# Character offsets of the seven digits inside a label value. This assumes the
# CSV stores each label as the text of a list such as "[1, 0, 0, 1, 0, 0, 0]",
# i.e. one digit every three characters after the opening bracket.
label_offsets = {'unb': 1, 'com': 4, 'pre': 7, 'aut': 10, 'sha': 13, 'met': 16, 'merr': 19}

rows = [] # will contain par_id, text, keyword and the seven category flags
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  label = teids.label[idx]
  # dev paragraphs are also looked up in `dpm.train_task1_df`; fetch the row
  # once to access both `text` and `keyword`
  source_row = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].iloc[0]
  row = {
      'par_id': parid,
      'text': source_row['text'],
      'keyword': source_row['keyword'],
  }
  # int() turns the '0' / '1' characters into numbers, so the columns can be
  # used as numeric targets. A non-digit character raises ValueError, which
  # exposes an unexpected label format immediately.
  for category, offset in label_offsets.items():
    row[category] = int(label[offset])
  rows.append(row)
  

In [None]:
# Dev frame: one row per record collected in `rows` by the previous cell.
tedf1=pd.DataFrame(rows)

In [None]:
# Weights passed to `model.fit(class_weight=...)`, keyed by the integers 0-6
# (presumably the positions of the seven label columns unb..merr - TODO confirm).
class_weight = {0: 16.300911229034686, 1: 20.319536321616038, 2: 20.218773701570143, 3: 19.0102260019453, 4: 16.68439631892069, 5: 21.182697470184646, 6: 64.27296843297307}
# TODO: try other weighting schemes, with separate weights for the 0 and 1 values of each label:
#   INS  weight = 1 / n
#   ISNS weight = 1 / sqrt(n)
#   ENS  weight = (1 - beta) / (1 - beta^n)

In [None]:

# Down-sample the negatives: keep every paragraph that has at least one PCL
# category, plus the first len(all_pos) / 2 paragraphs that have none.
category_cols = ['unb', 'com', 'pre', 'aut', 'sha', 'met', 'merr']
# Cast to int before summing. If the flag columns hold the characters '0' / '1',
# `+` concatenates them ('0000000'), and a comparison with 0 is then never
# true, so no row would be classed as negative. The cast is a no-op when the
# columns are already numeric.
n_categories = trdf1[category_cols].astype(int).sum(axis=1)

all_negs = trdf1[n_categories == 0]
all_pos = trdf1[n_categories != 0]

trdf1 = pd.concat([all_pos, all_negs[:round(len(all_pos) * 0.5)]])

## Preparing the Dataset

In [None]:
from datasets import Dataset

# Convert the pandas frames to Hugging Face datasets.
# preserve_index=False: `trdf1` has been filtered and concatenated, so its
# index is no longer a plain range and would otherwise be stored as an extra
# `__index_level_0__` column.
train_raw = Dataset.from_pandas(trdf1, preserve_index=False)
dev_raw = Dataset.from_pandas(tedf1, preserve_index=False)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')


def tokenize_function(examples):
    """Tokenize a batch of examples as (keyword, text) sentence pairs.

    `examples` is indexed by column name; the "keyword" column is passed as
    the first segment and the "text" column as the second. Every pair is
    padded or truncated to exactly 256 tokens, special tokens included.
    Returns the tokenizer's output unchanged.
    """
    return tokenizer(
        text=examples["keyword"],
        text_pair=examples["text"],
        add_special_tokens=True,
        max_length=256,
        padding="max_length",
        truncation=True,
    )


# batched=True: `tokenize_function` is called with batches of rows, not single rows.
train = train_raw.map(tokenize_function, batched=True)
dev = dev_raw.map(tokenize_function, batched=True)

In [None]:
from transformers import DefaultDataCollator

# Collator configured to return TensorFlow tensors; it is passed as `collate_fn`
# when the tf.data datasets are built.
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:


# Column names shared by both splits: tokenizer outputs fed to the model and
# the seven category flags used as labels.
model_input_cols = ["attention_mask", "input_ids", "token_type_ids"]
category_label_cols = ['unb', 'com', 'pre', 'aut', 'sha', 'met', 'merr']

# Validation batches keep the dataset order.
tf_validation_dataset = dev.to_tf_dataset(
    columns=list(model_input_cols),
    label_cols=list(category_label_cols),
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)

# Training batches are shuffled.
tf_train_dataset = train.to_tf_dataset(
    columns=list(model_input_cols),
    label_cols=list(category_label_cols),
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)

## Finetuning with Keras

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load the 'albert-base-v2' checkpoint as a TensorFlow sequence classifier with
# 7 outputs, one per label column (unb, com, pre, aut, sha, met, merr).
model = TFAutoModelForSequenceClassification.from_pretrained('albert-base-v2', num_labels=7)

In [None]:
# The seven targets are independent 0/1 flags: the down-sampling cell shows
# that rows with no category exist, and a row may carry more than one.
# Binary cross-entropy scores each logit with its own sigmoid. Softmax-based
# categorical cross-entropy assumes exactly one active class per example and
# yields zero loss for rows whose flags are all 0.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The model outputs logits, so a probability cut-off of 0.5 corresponds to a logit of 0.
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
)

# NOTE(review): Keras `class_weight` is designed for single-label targets;
# confirm that it has the intended per-category effect with these multi-label
# targets, or replace it with explicit sample / per-label weights.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, class_weight=class_weight)

In [None]:
# Print the layer / parameter summary of the model.
model.summary()