In [None]:
# Install transformers and ml_things directly from their GitHub repositories
# (-q keeps pip's output quiet).
# NOTE(review): neither install is pinned to a tag or commit, so a rerun may
# pull different code than the run that produced the saved results.
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/gmihaila/ml_things.git

In [None]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import set_seed, TrainingArguments, Trainer, GPT2Config, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup, GPT2ForSequenceClassification

# Training hyper-parameters: passes over the training set, examples per batch.
epochs, batch_size = 7, 32

# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Raw dataset label -> class id handed to the model (identity for both classes).
labels_ids = {label: label for label in (0, 1)}
n_labels = len(labels_ids)


In [None]:
import pandas as pd
class SarcasmDataset(Dataset):
  """In-memory dataset pairing each headline with its sarcasm label.

  Indexing returns a dict with the headline (after `fix_text`) under 'text'
  and the value of the 'is_sarcastic' column under 'label'.
  """

  def __init__(self, df):
    """Read the 'headline' and 'is_sarcastic' columns of `df`.

    Args:
      df: pandas DataFrame with one row per example. It must provide a
        'headline' column and an 'is_sarcastic' column.
    """
    # Walk the two columns directly instead of row-by-row with iterrows(),
    # which builds a throwaway Series for every row.
    self.texts = [fix_text(headline) for headline in df['headline'].tolist()]
    self.labels = df['is_sarcastic'].tolist()
    self.n_examples = len(self.labels)

  def __len__(self):
    """Return the number of examples."""
    return self.n_examples

  def __getitem__(self, item):
    """Return the example at position `item` as a {'text', 'label'} dict."""
    return {'text': self.texts[item],
            'label': self.labels[item]}



class Gpt2ClassificationCollator(object):
    """Callable that turns a list of {'text', 'label'} examples into one tokenized batch."""

    def __init__(self, tokenizer, labels_encoder):
        """Keep the tokenizer and the label mapping used for every batch.

        Args:
            tokenizer: callable tokenizer exposing `model_max_length`.
            labels_encoder: mapping from a raw label to its class id.
        """
        self.labels_encoder = labels_encoder
        self.tokenizer = tokenizer
        # Sequences are truncated to the tokenizer's own maximum length.
        self.max_sequence_len = tokenizer.model_max_length

    def __call__(self, sequences):
        """Tokenize the batch texts and attach the encoded labels.

        Args:
            sequences: list of dicts, each with a 'text' and a 'label' key.

        Returns:
            The tokenizer's output, updated with a 'labels' tensor.
        """
        batch_texts = [example['text'] for example in sequences]
        class_ids = [self.labels_encoder[example['label']] for example in sequences]

        encoded_batch = self.tokenizer(
            text=batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_sequence_len,
        )
        encoded_batch.update({'labels': torch.tensor(class_ids)})
        return encoded_batch


def train(model, dataloader, optimizer_, scheduler_, device_):
  """Run one training epoch over `dataloader`.

  Args:
    model: called as `model(**batch)`; the first two items of its output are
      taken as (loss, logits).
    dataloader: sized iterable of batches. Each batch is a mapping of tensors
      that includes a 'labels' entry.
    optimizer_: optimizer stepped once per batch.
    scheduler_: learning-rate scheduler stepped once per batch, right after
      the optimizer.
    device_: device the batch tensors are moved to before the forward pass.

  Returns:
    Tuple (true_labels, predictions_labels, avg_epoch_loss): the flattened
    labels seen, the argmax prediction for each of them, and the mean of the
    per-batch losses.
  """
  predictions_labels = []
  true_labels = []
  total_loss = 0
  model.train()

  for batch in tqdm(dataloader, total=len(dataloader)):
    # Record the labels before the batch is moved to the target device.
    true_labels += batch['labels'].numpy().flatten().tolist()
    batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

    model.zero_grad()
    outputs = model(**batch)
    loss, logits = outputs[:2]
    total_loss += loss.item()

    loss.backward()
    # Cap the gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Step the objects passed in by the caller, not same-named globals.
    optimizer_.step()
    scheduler_.step()

    logits = logits.detach().cpu().numpy()
    predictions_labels += logits.argmax(axis=-1).flatten().tolist()

  avg_epoch_loss = total_loss / len(dataloader)
  return true_labels, predictions_labels, avg_epoch_loss



def validate(model, dataloader, device_):
  """Evaluate `model` on every batch of `dataloader` without updating it.

  Args:
    model: called as `model(**batch)`; the first two items of its output are
      taken as (loss, logits).
    dataloader: sized iterable of batches. Each batch is a mapping of tensors
      that includes a 'labels' entry.
    device_: device the batch tensors are moved to before the forward pass.

  Returns:
    Tuple (true_labels, predictions_labels, avg_epoch_loss): the flattened
    labels seen, the argmax prediction for each of them, and the mean of the
    per-batch losses.
  """
  gold, guessed = [], []
  running_loss = 0
  model.eval()

  for batch in tqdm(dataloader, total=len(dataloader)):
    # Labels are read before the batch is moved to the target device.
    gold.extend(batch['labels'].numpy().flatten().tolist())
    on_device = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

    # Forward pass only; gradient tracking is switched off.
    with torch.no_grad():
      loss, logits = model(**on_device)[:2]
      scores = logits.detach().cpu().numpy()
      running_loss += loss.item()
      guessed.extend(scores.argmax(axis=-1).flatten().tolist())

  return gold, guessed, running_loss / len(dataloader)


In [None]:
# Load the pretrained 'gpt2' configuration with n_labels output classes.
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path='gpt2', num_labels=n_labels)

# Load the matching tokenizer, pad on the left, and reuse the end-of-sequence
# token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
# Build the sequence-classification model from the pretrained 'gpt2' weights
# using the configuration above.
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path='gpt2', config=model_config)

# Size the token embeddings to the tokenizer's vocabulary length.
model.resize_token_embeddings(len(tokenizer))

# Use the end-of-sequence id as the model's padding id, mirroring the tokenizer.
model.config.pad_token_id = model.config.eos_token_id

# Move the model to the selected device.
model.to(device)


In [None]:
from sklearn.model_selection import train_test_split

# Collator that tokenizes each batch and attaches the encoded labels.
gpt2_classificaiton_collator = Gpt2ClassificationCollator(tokenizer=tokenizer, labels_encoder=labels_ids)

# The dataset file holds one JSON record per line.
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines = True)

# Hold out 20% of the rows for validation. The split is seeded so the same
# rows are held out on every run and validation metrics stay comparable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

# Training batches are reshuffled on each pass over the loader.
train_dataset = SarcasmDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)

# Validation batches keep their order.
valid_dataset =  SarcasmDataset(test_df)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)


In [None]:
# Optimizer plus a linear schedule with zero warmup steps that spans every
# optimization step of the run (batches per epoch times epochs).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(train_dataloader),
)

# Per-epoch history; one value is appended to each list per epoch.
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])

for epoch in tqdm(range(epochs)):
  # One pass over the training set, then one over the held-out set.
  train_labels, train_predictions, train_loss = train(model, train_dataloader, optimizer, scheduler, device)
  train_acc = accuracy_score(train_labels, train_predictions)

  valid_labels, valid_predictions, val_loss = validate(model, valid_dataloader, device)
  val_acc = accuracy_score(valid_labels, valid_predictions)

  all_acc['train_acc'].append(train_acc)
  all_acc['val_acc'].append(val_acc)
  all_loss['train_loss'].append(train_loss)
  all_loss['val_loss'].append(val_loss)

# Loss curves first, then accuracy curves.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value')

plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value')


In [None]:
# Final pass over the held-out set with the trained model. The evaluation
# function is `validate(model, dataloader, device_)`.
true_labels, predictions_labels, avg_epoch_loss = validate(model, valid_dataloader, device)

# The report is built as text; print it so the per-class table is shown.
evaluation_report = classification_report(true_labels, predictions_labels)
print(evaluation_report)

# Confusion matrix over the known class labels.
plot_confusion_matrix(y_true=true_labels, y_pred=predictions_labels, classes=list(labels_ids.keys()), normalize=True, magnify=0.1)