

# Detecting PCL | Training Model

### Importing the libraries

In [None]:
! pip install  transformers datasets

In [None]:
import os.path
import numpy as np
import tensorflow as tf


In [None]:
import pandas as pd

In [None]:
from urllib import request

# Download the helper module and save it in the working directory under its own
# file name, so that a later cell can import it.
# NOTE(review): the URL tracks the `master` branch and the downloaded code is
# imported (executed) afterwards — consider pinning it to a specific commit.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# The payload is decoded as UTF-8, so write it back with an explicit UTF-8
# encoding instead of relying on the platform default.
with request.urlopen(module_url) as response, open(module_name, 'w', encoding='utf-8') as outf:
  outf.write(response.read().decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
from dont_patronize_me import DontPatronizeMe

# Build the dataset helper; both arguments are passed positionally.
dpm_args = ('.', 'dontpatronizeme_pcl.tsv')
dpm = DontPatronizeMe(*dpm_args)

## Part 1: Data Preprocessing

### Loading the PCL dataset

In [None]:
# Load the task-1 data into `dpm`. The cells below read it back from
# `dpm.train_task1_df` (presumably populated by this call — confirm in
# dont_patronize_me.py).
dpm.load_task1()


In [None]:
# Paragraph ids for the train and dev splits. `par_id` is cast to str in both
# frames — presumably to match the dtype of `dpm.train_task1_df.par_id`, which
# it is compared against further down (TODO confirm).
trids = pd.read_csv('train_semeval_parids-labels.csv')
trids['par_id'] = trids['par_id'].astype(str)

teids = pd.read_csv('dev_semeval_parids-labels.csv')
teids['par_id'] = teids['par_id'].astype(str)

In [None]:
# Collect one record (par_id, text, keyword, label) per paragraph id of the
# train split, looked up in the full task-1 frame.
task1_df = dpm.train_task1_df
rows = []
for parid in trids.par_id:
  # Filter the source frame once per id and read every field from that match,
  # instead of re-scanning the whole frame for each field.
  matches = task1_df.loc[task1_df.par_id == parid]
  # `.values[0]` takes the first matching row and raises IndexError when the
  # id is missing from the source frame.
  rows.append({
      'par_id': parid,
      'text': matches.text.values[0],
      'keyword': matches.keyword.values[0],
      'label': matches.label.values[0],
  })
  

In [None]:
# Training split as a DataFrame: one row per record collected in `rows`.
trdf1 = pd.DataFrame(data=rows)

In [None]:
# Collect one record (par_id, text, keyword, label) per paragraph id of the
# dev split, looked up in the full task-1 frame.
task1_df = dpm.train_task1_df
rows = []
for parid in teids.par_id:
  # Filter the source frame once per id and read every field from that match,
  # instead of re-scanning the whole frame for each field.
  matches = task1_df.loc[task1_df.par_id == parid]
  # `.values[0]` takes the first matching row and raises IndexError when the
  # id is missing from the source frame.
  rows.append({
      'par_id': parid,
      'text': matches.text.values[0],
      'keyword': matches.keyword.values[0],
      'label': matches.label.values[0],
  })
  

In [None]:
# Dev split as a DataFrame: one row per record collected in `rows`.
tedf1 = pd.DataFrame(data=rows)

In [None]:
# Per-class loss weights, keyed by label (0 and 1).
# Other weighting schemes to try for the two classes (n = samples in the class):
#   INS  weight = 1/n
#   ISNS weight = 1/sqrt(n)
#   ENS  weight = (1 - Beta)/(1 - Beta^n)
class_weight = {
    0: 2.80048445005996,
    1: 11.8523601712888,
}

## Preparing the Dataset

In [None]:
from datasets import Dataset

# Wrap the train and dev DataFrames as `datasets.Dataset` objects.
train_raw, dev_raw = (Dataset.from_pandas(frame) for frame in (trdf1, tedf1))

In [None]:
# Sanity check: print the keyword of the first training example.
first_train_example = train_raw[0]
print(first_train_example['keyword'])

poor-families


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
  """Tokenize a batch as (keyword, text) pairs, padded/truncated to 500 tokens."""
  encode_options = dict(
      add_special_tokens=True,
      max_length=500,
      padding="max_length",
      truncation=True,
  )
  return tokenizer(text=examples['keyword'], text_pair=examples['text'], **encode_options)


# Tokenize both splits in batches.
train = train_raw.map(tokenize_function, batched=True)
dev = dev_raw.map(tokenize_function, batched=True)

In [None]:
from transformers import DefaultDataCollator

# Collator configured to return TensorFlow tensors; it is passed as
# `collate_fn` when the tf.data datasets are built.
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
# Feed the model exactly the inputs this tokenizer produces. Hard-coding
# "token_type_ids" would request a column that the roberta-base tokenizer does
# not emit, so the feature columns are taken from the tokenizer itself.
feature_columns = list(tokenizer.model_input_names)

# NOTE(review): the dataset column is named `label`; `label_cols=["labels"]`
# presumably relies on the collator renaming it — confirm.
tf_train_dataset = train.to_tf_dataset(
    columns=list(feature_columns),  # fresh list per call
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)

tf_validation_dataset = dev.to_tf_dataset(
    columns=list(feature_columns),  # fresh list per call
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)

## Finetuning with Keras

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained checkpoint with a two-label sequence-classification head.
model_checkpoint = "roberta-base"
model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The model was created with num_labels=2, so it emits two logits per example,
# while the targets are single 0/1 class ids (the keys of `class_weight`).
# The matching loss/metric pair is therefore the sparse categorical one;
# BinaryCrossentropy would score both logits against the same 0/1 target.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Train for one epoch, applying the per-class loss weights defined earlier.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=1,
    class_weight=class_weight,
)