In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1: Exploratory Data Analysis and Preprocessing

In [None]:
import torch
from tqdm.notebook import tqdm

In [None]:
# Read the CSV with explicit column names. Because `names` is supplied, the
# file's first line is parsed as a regular row (presumably the original header),
# so it is dropped immediately afterwards.
df = pd.read_csv('model_data.csv', names=['text', 'category'])
df = df.iloc[1:]

# Attach a 1-based running id and promote it to the index.
df.insert(0, 'id', range(1, len(df) + 1))
df = df.set_index('id')

# Discard every row labelled 'neutral'.
df = df[df['category'] != 'neutral']

In [None]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,AITA for not going to my Gf's sister's gender ...,1
2,AITA for going out with someone else I started...,1
3,AITA for not wanting to acknowledge my mother'...,1
4,AITA for not going outside to spend time with ...,1
5,WIBTA if I want my roommate to buy me (new) wi...,1


In [None]:
df.category.value_counts()

-1    2263
1     2263
Name: category, dtype: int64

In [None]:
df.category.value_counts()

-1    2263
1     2263
Name: category, dtype: int64

In [None]:
# Distinct label values, in order of first appearance in the frame.
possible_labels = df['category'].unique()

In [None]:
# Map each raw label to a consecutive integer class id (0, 1, ...),
# numbered in the order the labels appear in `possible_labels`.
label_dict = {raw_label: class_id for class_id, raw_label in enumerate(possible_labels)}

In [None]:
label_dict

{'-1': 1, '1': 0}

In [None]:
# Replace the raw label values with their integer class ids.
df['category'] = df['category'].map(label_dict)

In [None]:
df.head(10)

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,AITA for not going to my Gf's sister's gender ...,0
2,AITA for going out with someone else I started...,0
3,AITA for not wanting to acknowledge my mother'...,0
4,AITA for not going outside to spend time with ...,0
5,WIBTA if I want my roommate to buy me (new) wi...,0
6,AITA for feeling a gnawing sense of unease whe...,0
7,"AITA - Just got out of a LDR, am I the asshole...",0
8,AITA for Calling a Guy Obsessed with Genitals?...,0
9,AITA for refusing to let my step sister use my...,0
10,AITA for getting in a relationship with a male...,0


Classes are balanced, as the value counts above show (2263 examples in each class).

## 2: Training/Validation Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the row ids (not the texts themselves) 85/15, stratified on the class
# id so both partitions keep the same class ratio.
sample_ids = df.index.values
sample_labels = df.category.values

X_train, X_val, y_train, y_val = train_test_split(
    sample_ids,
    sample_labels,
    test_size=0.15,
    random_state=42,
    stratify=sample_labels,
)

In [None]:
# Placeholder partition tag for every row; filled in once the split is applied.
df['data_type'] = 'not_set'

In [None]:
df.head()

Unnamed: 0_level_0,text,category,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,AITA for not going to my Gf's sister's gender ...,0,not_set
2,AITA for going out with someone else I started...,0,not_set
3,AITA for not wanting to acknowledge my mother'...,0,not_set
4,AITA for not going outside to spend time with ...,0,not_set
5,WIBTA if I want my roommate to buy me (new) wi...,0,not_set


In [None]:
# Tag each row with the partition its id landed in.
for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name

In [None]:
df.groupby(['category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,data_type,Unnamed: 2_level_1
0,train,1923
0,val,340
1,train,1924
1,val,339


# 3. Loading Tokenizer and Encoding our Data

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 456 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# NOTE(review): dropna() returns a new DataFrame and the result is not assigned
# here, so this cell only displays a NaN-free view; `df` itself is unchanged.
# If rows with missing values are meant to be removed, assign the result.
df.dropna()

Unnamed: 0_level_0,text,category,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,AITA for not going to my Gf's sister's gender ...,0,train
2,AITA for going out with someone else I started...,0,val
3,AITA for not wanting to acknowledge my mother'...,0,train
4,AITA for not going outside to spend time with ...,0,train
5,WIBTA if I want my roommate to buy me (new) wi...,0,train
...,...,...,...
4522,AITA for screaming at a child who was being a ...,1,train
4523,WIBTA for asking my wife to take down a shrine...,1,train
4524,AITA for being upset that an old friend hasnt ...,1,val
4525,WIBTA if I slept with a Taiwanese girl if I ha...,1,train


In [None]:
# Coerce every text entry to a Python string, then peek at the training texts.
df['text'] = df['text'].astype(str)
df.loc[df['data_type'] == 'train', 'text'].values

array(["AITA for not going to my Gf's sister's gender reveal party after a series of other parties at the same house? My girlfriends family is having a super busy 2 weeks, they had family come from their home country for my gf's graduation and graduation party and it's been amazing seeing her so happy and spending time with them. At the same time, it's not often that I have gone over their house in the past, when my gf and I were having trouble communicating, I would cancel plans to go out with her family to a winery, or to the beach, etc.. I don't do that at all now that I feel like we actually understand each other and can solve things even on the go. Over these 2 weeks, I've gone over 4 times, for the initial dinner for everyone there, another dinner for the family, the graduation party all day, and I went out to breakfast with the family after her ceremony. The ceremony was 3 days ago, and now they are having a gender reveal for my girlfriend's sister today. I feel a bit overwhelme

In [None]:
# Plain Python list of the training texts, in frame order.
all_text_train = df.loc[df['data_type'] == 'train', 'text'].tolist()

In [None]:
def _encode_texts(texts):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Every sequence is padded to, and truncated at, 512 tokens. Padding and
    truncation are requested explicitly: `pad_to_max_length` is deprecated,
    and passing `max_length` alone only truncates via a warned fallback.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )


# Same encoding settings for both partitions.
encoded_data_train = _encode_texts(all_text_train)
encoded_data_val = _encode_texts(df[df.data_type=='val'].text.tolist())

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].category.values)



In [None]:
len(input_ids_train), len(attention_masks_train), len(labels_train)

(3847, 3847, 3847)

In [None]:
# Bundle (input ids, attention mask, label) per partition so that each dataset
# item is one such triple.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
val_tensors = (input_ids_val, attention_masks_val, labels_val)

dataset_train = TensorDataset(*train_tensors)
dataset_val = TensorDataset(*val_tensors)

In [None]:
len(dataset_train)

3847

In [None]:
dataset_val.tensors

(tensor([[  101,  9932,  2696,  ...,     0,     0,     0],
         [  101, 15536, 19279,  ...,     0,     0,     0],
         [  101,  9932,  2696,  ...,     0,     0,     0],
         ...,
         [  101,  9932,  2696,  ...,     0,     0,     0],
         [  101,  9932,  2696,  ...,     0,     0,     0],
         [  101,  9932,  2696,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# 4. Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# classes in `label_dict`; attention maps and hidden states are not returned.
bert_overrides = dict(
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', **bert_overrides)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# 5. Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Training batches are kept small; evaluation needs no gradients, so it can
# use a larger batch.
batch_size = 4
val_batch_size = 32

# Training data is reshuffled every epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation data is read in a fixed order: shuffling brings nothing at
# evaluation time, and a fixed order makes the per-batch validation loss
# reproducible from one epoch to the next.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=val_batch_size
)

# 6. Setting Up Optimizer and Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# AdamW over all model parameters with a small fine-tuning learning rate.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
epochs = 5

# One scheduler step per optimizer step: the learning rate decays linearly
# over the whole run, with no warm-up phase.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# 7. Defining our Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the argmax predictions.

    The predicted class of each row of ``preds`` is the index of its largest
    value along axis 1; it is compared against the flattened ``labels``.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, the number of correct predictions.

    Predictions are the argmax of ``preds`` along axis 1. Class ids are
    translated back to their original label names through the module-level
    ``label_dict``. Nothing is returned.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).ravel()
    actual = labels.ravel()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_correct = int(np.sum(predicted[in_class] == class_id))
        n_total = int(np.sum(in_class))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

# 8. Creating our Training Loop

In [None]:
import random

# Seed every random number generator in play (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU, and move
# the model there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_val):
    """Run the model over ``dataloader_val`` without gradient tracking.

    Each batch is expected to unpack positionally as
    (input_ids, attention_mask, labels).

    Returns a 3-tuple:
        - the mean of the per-batch losses,
        - the logits of all batches concatenated along axis 0 (NumPy array),
        - the matching labels concatenated the same way (NumPy array).
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in batch)
        inputs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        with torch.no_grad():
            outputs = model(**inputs)

        # With labels supplied, the first output is the loss and the second the logits.
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        logits_per_batch.append(logits.detach().cpu().numpy())
        labels_per_batch.append(inputs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logits_per_batch, axis=0)
    all_labels = np.concatenate(labels_per_batch, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# Fine-tune for `epochs` passes over the training data, reporting the training
# loss, validation loss and weighted F1 after every epoch.
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. `len(batch)` is the
        # number of tensors in the tuple (3), not the number of samples, so
        # dividing by it would only distort the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')

    # f-string so the actual epoch number is printed.
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/962 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.6781662253047226


  0%|          | 0/22 [00:00<?, ?it/s]

Validation loss: 0.6642658060247247
F1 Score (weighted): 0.5906201353552532


Epoch 2:   0%|          | 0/962 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.6187646688269974


  0%|          | 0/22 [00:00<?, ?it/s]

Validation loss: 0.658922171050852
F1 Score (weighted): 0.59505482564974


Epoch 3:   0%|          | 0/962 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.5248211265702319


  0%|          | 0/22 [00:00<?, ?it/s]

Validation loss: 0.9053814302791249
F1 Score (weighted): 0.6499460232573377


Epoch 4:   0%|          | 0/962 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.46968523888769337


  0%|          | 0/22 [00:00<?, ?it/s]

Validation loss: 1.3504186977039685
F1 Score (weighted): 0.628211131870052


Epoch 5:   0%|          | 0/962 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.3639664863869877


  0%|          | 0/22 [00:00<?, ?it/s]

Validation loss: 1.6155867793343284
F1 Score (weighted): 0.628382328916709


# 9. Evaluating our Model

In [None]:
accuracy_per_class(predictions, true_vals)

Class: 1
Accuracy:226/340

Class: -1
Accuracy:201/339

