In [6]:
!pip install -q -U watermark
!pip install -qq transformers



In [7]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.10
IPython version      : 7.26.0

numpy       : 1.19.5
pandas      : 1.2.5
torch       : 1.7.1+cu110
transformers: 4.5.1



In [8]:
#Import relevant libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [9]:
#Clone stance detection datasets
!git clone https://huggingface.co/datasets/strombergnlp/zulu_stance
!git clone https://huggingface.co/datasets/SetFit/tweet_eval_stance

fatal: destination path 'zulu_stance' already exists and is not an empty directory.
fatal: destination path 'tweet_eval_stance' already exists and is not an empty directory.


In [10]:
#Read original Zulu Dataset
df = pd.read_json('/kaggle/working/zulu_stance/ZUstance.json')
df.head()

Unnamed: 0,Stance,Tweet,Target
0,AGAINST,ubukhulu be-islam buba sobala lapho i-smartpho...,Atheism
1,NONE,@piddy_x @nero @cyberstalin kungcono umzamo ng...,Feminist Movement
2,NONE,@frankcraig: @skzdalimit uma ungcono kakhulu u...,Hillary Clinton
3,AGAINST,"wacaphuna umama ngokuba owesifazane: ""? thina ...",Feminist Movement
4,FAVOR,@arforhillary @hillaryclinton @ hillaryin2016 ...,Hillary Clinton


In [11]:
df.shape

(1343, 3)

In [17]:
def make_new_df(n_samples):
    '''
    Create a dataframe for English stance detection .
    
    This function creates a dataframe of n random samples for every class 
    of the English tweet stance dataset. The function also reformats the 
    columns to match the Zulu Stance dataset.

    Parameters
    ----------
    n_samples : int
        Number of random samples per class to be returned.

    Returns
    -------
    pandas.core.frame.DataFrame
        A Pandas dataframe from the English Stance dataset.
    '''
    df_1 = pd.read_json('/kaggle/working/tweet_eval_stance/stance_feminist/train.jsonl', lines=True).sample(n_samples)
    df_2 = pd.read_json('/kaggle/working/tweet_eval_stance/stance_abortion/train.jsonl', lines=True).sample(n_samples)
    df_3 = pd.read_json('/kaggle/working/tweet_eval_stance/stance_hillary/train.jsonl', lines=True).sample(n_samples)
    df_4 = pd.read_json('/kaggle/working/tweet_eval_stance/stance_atheism/train.jsonl', lines=True).sample(n_samples)
    df_5 = pd.read_json('/kaggle/working/tweet_eval_stance/stance_climate/train.jsonl', lines=True).sample(n_samples)

    df_list = [df_1, df_2, df_3, df_4, df_5]
    targets = list(df.Target.value_counts().index)

    for i in range(0, 5):
        df_list[i]['Target'] = targets[i]

    df_new = pd.concat(df_list, axis=0)
    df_new.drop(columns='label', inplace=True)

    df_new.set_axis(['Tweet', 'Stance', 'Target'], axis=1, inplace=True)
    df_new = df_new[['Stance', 'Tweet', 'Target']]

    df_new['Stance'].replace({'favor':0, 'against':1, 'none':2}, inplace=True)
    df_new['Target'].replace({'Feminist Movement':0, 'Legalization of Abortion':1, 'Hillary Clinton':2, 'Atheism':3, 'Climate Change is a Real Concern':4}, inplace=True)
    return df_new

In [18]:
#Creating an English stance dataset of 1000 samples
english_df = make_new_df(200)
english_df.Stance.value_counts()

1    460
0    280
2    260
Name: Stance, dtype: int64

In [19]:
english_df.head()

Unnamed: 0,Stance,Tweet,Target
109,1,feminist response people wearing meninist clot...,0
480,0,RT @user Let's demand less cat-calling and mor...,0
135,1,"A ""Strong Feminist"" on Twitter is measured by ...",0
77,1,Want revenge for being a slut? Cry rape. The...,0
396,0,Wow @user you are beautiful even make up free...,0


In [None]:
df.Target.value_counts()

In [123]:
df['Stance'].replace({'FAVOR':0, 'AGAINST':1, 'NONE':2}, inplace=True)
df['Target'].replace({'Feminist Movement':0, 'Legalization of Abortion':1, 'Hillary Clinton':2, 'Atheism':3, 'Climate Change is a Real Concern':4}, inplace=True)

In [14]:
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-uncased'

In [15]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [16]:
class GPReviewDataset(Dataset):

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.reviews)
  
  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [70]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_val, test_size=0.5, random_state=RANDOM_SEED)

In [71]:
# Concatenate the English dataframe to the Zulu dataset
df_train = pd.concat([df_train, x], axis=0).sample(frac=1)

In [72]:
df_train.shape, df_val.shape, df_test.shape

((1074, 3), (134, 3), (135, 3))

In [73]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GPReviewDataset(
    reviews=df[['Tweet', 'Target']].to_numpy(),
    targets=df['Stance'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

In [74]:
MAX_LEN = 128

In [75]:
BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [76]:
data = next(iter(train_data_loader))
data.keys()

dict_keys(['review_text', 'input_ids', 'attention_mask', 'targets'])

In [77]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16])


In [78]:
bert_model = BertModel.from_pretrained('Davlan/xlm-roberta-base-finetuned-zulu')

You are using a model of type xlm-roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at Davlan/xlm-roberta-base-finetuned-zulu were not used when initializing BertModel: ['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attentio

In [83]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [111]:
model = SentimentClassifier(3)
model = model.to(device)

In [112]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([16, 128])
torch.Size([16, 128])


In [114]:
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=True)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [115]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

In [116]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [117]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/5
----------
Train loss 1.0785696480204077 accuracy 0.4208566108007449
Val   loss 1.041065752506256 accuracy 0.44029850746268656

Epoch 2/5
----------
Train loss 1.017500951886177 accuracy 0.5130353817504656
Val   loss 0.9477486146820916 accuracy 0.5373134328358209

Epoch 3/5
----------
Train loss 0.9512170842465233 accuracy 0.5502793296089385
Val   loss 0.9445955554644266 accuracy 0.5373134328358209

Epoch 4/5
----------
Train loss 0.911954749156447 accuracy 0.5642458100558659
Val   loss 0.9461628198623657 accuracy 0.5298507462686567

Epoch 5/5
----------
Train loss 0.8860322727876551 accuracy 0.5782122905027933
Val   loss 0.9492429428630405 accuracy 0.5597014925373134

CPU times: user 1min 20s, sys: 8.89 s, total: 1min 29s
Wall time: 1min 36s


In [118]:
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_val)
)

test_acc.item()

0.5223880597014925