In [1]:
pip install transformers --quiet

In [2]:
import torch
import torch.nn as nn 
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

In [3]:
def seed_everything(seed):
  np.random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
    
seed_everything(21)

In [4]:
train = pd.read_csv("https://raw.githubusercontent.com/Josepholaidepetro/Gender-Based-Violence-Tweet-Classification-Challenge/main/data/Train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/Josepholaidepetro/Gender-Based-Violence-Tweet-Classification-Challenge/main/data/Test.csv")
sub = pd.read_csv('https://raw.githubusercontent.com/Josepholaidepetro/UN-Tweety-Goal-5/main/data/SampleSubmission.csv')

In [5]:
X = train['tweet']
y = train["type"]

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()        
y = le.fit_transform(y)

In [7]:
train_df = pd.DataFrame(X)
train_df['type'] = y

In [8]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [9]:
"""train_df = pd.DataFrame(X_train)
train_df['type'] = y_train
val_df = pd.DataFrame(X_val)
val_df['type'] = y_val"""

"train_df = pd.DataFrame(X_train)\ntrain_df['type'] = y_train\nval_df = pd.DataFrame(X_val)\nval_df['type'] = y_val"

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [11]:
class Dataset(Dataset):
  def __init__(self, df, tokenizer, max_len):
    self.labels = df['type'].values
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.texts = [text for text in df['tweet']]

  def __len__(self):
      return len(self.labels)

  def __getitem__(self, idx):

      texts = self.texts[idx]
      label = self.labels[idx]
      text = self.tokenizer(texts, 
                            padding='max_length', max_length = self.max_len, truncation=True,
                            return_tensors="pt")

      return text, label

In [12]:
class TextDataset(Dataset):
  def __init__(self, df, tokenizer, max_len):
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.texts = [text for text in df['tweet']]

  def __len__(self):
      return len(self.texts)

  def __getitem__(self, idx):

      texts = self.texts[idx]
      text = self.tokenizer(texts, 
                            padding='max_length', max_length = self.max_len, truncation=True,
                            return_tensors="pt")

      return text

In [13]:
def GBVDataloader(df, batch_size, tokenizer):
  dataset = Dataset(df, tokenizer, 120)
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
  return dataloader

def GBVTestDataloader(df, batch_size , tokenizer):
    dataset = TextDataset(df, tokenizer, 120)
    dataloader = DataLoader(dataset , batch_size , shuffle=False)
    return dataloader

In [14]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

  def __init__(self, n_labels, dropout=0.85):

      super(BertClassifier, self).__init__()

      self.bert = BertModel.from_pretrained('bert-base-cased')
      self.dropout = nn.Dropout(dropout)
      self.linear = nn.Linear(768, n_labels)
      self.relu = nn.ReLU()

  def forward(self, input_id, mask):

      _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
      dropout_output = self.dropout(pooled_output)
      linear_output = self.linear(dropout_output)
      final_layer = self.relu(linear_output)

      return final_layer

In [15]:
model = BertClassifier(5)
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [16]:
def train_loop(model, train_data, dataloader, optimizer, scheduler, criterion, device):
  
  model.train()

  total_acc_train = 0
  total_loss_train = 0

  for train_input, train_label in tqdm(dataloader):

    train_label = train_label.to(device)
    mask = train_input['attention_mask'].to(device)
    input_id = train_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)

    batch_loss = criterion(output, train_label)
    total_loss_train += batch_loss.item()

    acc = (output.argmax(dim=1) == train_label).sum().item()
    total_acc_train += acc

    batch_loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return total_loss_train / len(train_data), total_acc_train / len(train_data)

In [17]:
def eval_loop(model, val_data, dataloader, criterion, device):
  total_acc_val = 0
  total_loss_val = 0
  
  model.eval()

  with torch.no_grad():

    for val_input, val_label in dataloader:

      val_label = val_label.to(device)
      mask = val_input['attention_mask'].to(device)
      input_id = val_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      batch_loss = criterion(output, val_label)
      total_loss_val += batch_loss.item()
      
      acc = (output.argmax(dim=1) == val_label).sum().item()
      total_acc_val += acc
  return total_loss_val / len(val_data), total_acc_val / len(val_data)

In [18]:
def train(model, train_data, learning_rate, batch_size, epochs, epsilon):
  
  # loss fn
  criterion = nn.CrossEntropyLoss()

  for epoch_num in range(epochs):      
  
    fold = StratifiedKFold(n_splits=5, shuffle=False)
  
    for step, (train, valid) in enumerate(fold.split(train_data['tweet'], train_data["type"])):
  
      train_dataloader = GBVDataloader(train_data.iloc[train], batch_size, tokenizer)
      val_data_loader = GBVDataloader(train_data.iloc[valid], batch_size, tokenizer)
      # Create the optimizer
      optimizer = AdamW(model.parameters(),
            lr=learning_rate,    # Default learning rate
            eps=epsilon)   # Default epsilon value
      
      # Set up the learning rate scheduler
      total_steps = (len(train_dataloader) // batch_size) * epochs
      scheduler = get_linear_schedule_with_warmup(optimizer,
                                      num_warmup_steps=0, # Default value
                                      num_training_steps=total_steps)
        
      train_loss, train_acc = train_loop(model, train_data.iloc[train], train_dataloader, optimizer, scheduler, criterion, device)
      print(f"Train accuracy {train_acc} ,Train Loss {train_loss}")
      val_loss, val_acc = eval_loop(model, train_data.iloc[valid], val_data_loader, criterion, device)
      print(f"Validation accuracy {val_acc} ,Validation Loss {val_loss}")
      

In [21]:
EPOCHS = 2
LR = 3e-5
batch_size = 30
epsilon = 1e-8

In [22]:
train(model, train_data=train_df, learning_rate=LR, batch_size=batch_size, epochs=EPOCHS, epsilon=epsilon)

100%|██████████| 1058/1058 [19:29<00:00,  1.11s/it]


Train accuracy 0.9877679697351829 ,Train Loss 0.0012601745032165973
Validation accuracy 0.992938209331652 ,Validation Loss 0.0011199553125025


100%|██████████| 1058/1058 [19:28<00:00,  1.10s/it]


Train accuracy 0.9888398486759142 ,Train Loss 0.001249152787727813
Validation accuracy 0.992938209331652 ,Validation Loss 0.0010196314047495594


100%|██████████| 1058/1058 [19:28<00:00,  1.10s/it]


Train accuracy 0.9895334174022699 ,Train Loss 0.0012510890628451205
Validation accuracy 0.9935687263556116 ,Validation Loss 0.000889423011591125


100%|██████████| 1058/1058 [19:26<00:00,  1.10s/it]


Train accuracy 0.9924022698612862 ,Train Loss 0.0010393295653020537
Validation accuracy 0.9939470365699874 ,Validation Loss 0.0007461187458756552


100%|██████████| 1058/1058 [19:30<00:00,  1.11s/it]


Train accuracy 0.991046658259773 ,Train Loss 0.0010289893558897455
Validation accuracy 0.9918032786885246 ,Validation Loss 0.0008569205322347727


100%|██████████| 1058/1058 [19:31<00:00,  1.11s/it]


Train accuracy 0.9958701134930643 ,Train Loss 0.0005614105790462241
Validation accuracy 0.9967213114754099 ,Validation Loss 0.0006657846922891741


100%|██████████| 1058/1058 [19:31<00:00,  1.11s/it]


Train accuracy 0.9956179066834805 ,Train Loss 0.0005808038504869126
Validation accuracy 0.9972257250945775 ,Validation Loss 0.0005158706531808608


100%|██████████| 1058/1058 [19:35<00:00,  1.11s/it]


Train accuracy 0.9959331651954603 ,Train Loss 0.0004431900164556172
Validation accuracy 0.998108448928121 ,Validation Loss 0.00034611098147998686


100%|██████████| 1058/1058 [19:33<00:00,  1.11s/it]


Train accuracy 0.9978877679697352 ,Train Loss 0.00033483479395957
Validation accuracy 0.9987389659520807 ,Validation Loss 0.00021858692906254052


100%|██████████| 1058/1058 [19:35<00:00,  1.11s/it]


Train accuracy 0.9979823455233291 ,Train Loss 0.00032513432481198567
Validation accuracy 0.9994955863808322 ,Validation Loss 0.00012892679413271322


In [25]:
testdataloader = GBVTestDataloader(test, 1, tokenizer)

In [36]:
def pred_loop(model, dataloader, device):
  
  pred_proba = []
  
  model.eval()

  with torch.no_grad():

    for val_input in dataloader:

      mask = val_input['attention_mask'].to(device)
      input_id = val_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      pred_proba.append(output)

  return pred_proba

In [38]:
pred_outputs = pred_loop(model, testdataloader, device=device)

In [50]:
dat = []
for i in pred_outputs:
    dat.append(i.flatten().tolist())

In [58]:
pred = [np.argmax(i) for i in dat]

In [59]:
final = le.inverse_transform(pred)

In [60]:
sub['type'] = final

In [61]:
sub['type'].value_counts()

sexual_violence                 11340
Harmful_Traditional_practice     3028
emotional_violence                690
economic_violence                 396
Physical_violence                 127
Name: type, dtype: int64