In [2]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [17]:
# Load the labelled comments. Column names are supplied explicitly via
# `names`, then the frame is re-keyed by the comment id so rows can later be
# selected by id with `df.loc[...]`.
df = pd.read_csv('Data/train_and_val.csv', names=['id', 'text', 'label'])
df = df.set_index('id')

In [18]:
# Preview the first rows to check that `text` and `label` were parsed as expected
df.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


In [19]:
# Inspect the full, untruncated text of the first comment

df.text.iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [21]:
# Class balance: number of rows for each value of `label`
df.label.value_counts()

0    208935
1     22614
Name: label, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Split the comment ids (not the text) 85/15 into train and validation.
# Stratifying on the label keeps the class ratio the same in both parts, and
# the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.to_numpy(),
    df['label'].to_numpy(),
    test_size=0.15,
    random_state=17,
    stratify=df['label'].to_numpy(),
)

In [24]:
# Placeholder split marker for every row; overwritten with 'train' / 'val' below
df['data_type'] = 'not_set'

In [25]:
# Confirm the new `data_type` column is present and still 'not_set'
df.head()

Unnamed: 0_level_0,text,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,not_set
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,not_set
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,not_set
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,not_set
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,not_set


In [26]:
# Tag each row with the split its id was assigned to by train_test_split
for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name

In [28]:
# Row counts per (label, split) pair, to check that both classes appear in
# the train and validation sets in similar proportions
df.groupby(['label','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,data_type,Unnamed: 2_level_1
0,train,177594
0,val,31341
1,train,19222
1,val,3392


In [30]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [31]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model;
# the uncased vocabulary expects lower-cased input, hence do_lower_case=True.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [32]:
def encode_split(split_name, max_length=256):
    """Tokenize the rows of ``df`` whose ``data_type`` equals ``split_name``.

    Both splits must be encoded with exactly the same settings, so the
    tokenizer call lives in one place instead of being copy-pasted.

    Parameters
    ----------
    split_name : str
        Value of the ``data_type`` column to select ('train' or 'val').
    max_length : int, default 256
        Passed to the tokenizer as ``max_length``.

    Returns
    -------
    tuple
        ``(encoded, labels)``: the tokenizer output (read below through its
        'input_ids' and 'attention_mask' entries) and a tensor built from the
        ``label`` column of the same rows, in the same row order.
    """
    split_df = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        split_df.text.values,
        add_special_tokens=True,     # so that BERT knows where a sequence starts/ends
        return_attention_mask=True,  # marks real tokens vs. padding positions
        # NOTE(review): newer transformers releases reportedly deprecate
        # `pad_to_max_length` in favour of `padding='max_length'` together
        # with `truncation=True`; kept unchanged to match the version this
        # notebook was run with -- confirm before upgrading.
        pad_to_max_length=True,
        max_length=max_length,
        return_tensors='pt',         # return PyTorch tensors
    )
    labels = torch.tensor(split_df.label.values)
    return encoded, labels


encoded_data_train, labels_train = encode_split('train')
encoded_data_val, labels_val = encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

In [33]:
# Bundle ids, attention masks and labels so that indexing a dataset yields
# one (input_ids, attention_mask, label) triple per comment.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [34]:
# Number of training examples
len(dataset_train)

196816

In [35]:
# Number of validation examples
len(dataset_val)

34733

In [36]:
from transformers import BertForSequenceClassification

In [37]:
# Pre-trained BERT encoder with a classification layer added on top.
# The original cell used `len(label_dict)`, but `label_dict` is never defined
# in this notebook, so it fails on a fresh kernel. The number of output
# classes is taken from the distinct values in the label column instead.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df.label.nunique(),
    output_attentions=False,     # attention weights are not needed
    output_hidden_states=False,  # nor the intermediate hidden states
)

HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [38]:
#Offers a nice way to iterate over our dataset in batches
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [39]:
batch_size = 32

# Training batches are drawn in a random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation needs no shuffling: a sequential pass visits every example once
# in a fixed order, so repeated evaluations see identical batches. It also
# reuses `batch_size` instead of repeating the literal 32.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size
)

In [40]:
from transformers import AdamW, get_linear_schedule_with_warmup
#Adam with weight decay, stochastic optimization

In [41]:
# Optimizer over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, # NOTE: lower than the 2e-5 to 5e-5 range recommended by the original BERT paper
    eps = 1e-8
)

In [42]:
# Number of full passes over the training data
epochs = 10

# Linear learning-rate schedule with no warm-up. The training loop calls
# scheduler.step() once per batch, so the total number of steps is
# (batches per epoch) * (number of epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

In [43]:
import numpy as np
from sklearn.metrics import f1_score

In [44]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of arg-max predictions.

    F1 is used rather than plain accuracy because the classes are heavily
    imbalanced.

    Parameters
    ----------
    preds : array-like
        Per-class scores; the predicted class is the arg-max along axis 1.
    labels : array-like
        True class ids, flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [46]:
def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, correct / total counts.

    Parameters
    ----------
    preds : array-like
        Per-class scores; the predicted class is the arg-max along axis 1.
    labels : array-like
        True class ids (0 = Non-Toxic, 1 = Toxic).
    """
    class_names = {0: 'Non-Toxic', 1: 'Toxic'}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict to the examples whose true label is this class
        in_class = actual == class_id
        n_total = int(in_class.sum())
        n_correct = int((predicted[in_class] == class_id).sum())
        print(f'Class: {class_names[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [47]:
import random

# Seed every random number generator used in the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
# NOTE(review): this cell runs after the train/val split and after the model
# was instantiated, so it only affects randomness from this point on.
seed_val = 17
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [48]:
# Device that both the model and every batch are placed on.
device = torch.device('cpu')
# The training and evaluation loops move each batch to `device`, so the model
# has to be moved there as well; otherwise changing `device` above would leave
# the model and its inputs on different devices.
model.to(device)

print(device)


cpu


In [49]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradients.

    Parameters
    ----------
    dataloader_val : iterable of batches
        Each batch is indexed as ``(input_ids, attention_mask, labels)``.

    Returns
    -------
    tuple
        ``(loss_val_avg, predictions, true_vals)``: the summed batch loss
        divided by the number of batches, the logits of all batches
        concatenated along axis 0, and the labels concatenated the same way.
    """
    # Put the model in evaluation mode for the duration of this pass
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(t.to(device) for t in batch)

        # No gradient tracking is needed for evaluation
        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )

        # outputs[0] is read as the loss and outputs[1] as the logits
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (
        mean_loss,
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )


In [None]:
# Fine-tuning loop: one pass over the training data per epoch, followed by a
# checkpoint save and an evaluation on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    # evaluate() leaves the model in eval mode, so switch back every epoch
    model.train()

    # Sum of the per-batch losses for this epoch
    loss_train_total = 0

    # Inner progress bar over the batches; leave=False lets the next epoch
    # overwrite it
    progress_bar = tqdm(dataloader_train,
                        desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,
                        disable = False)

    for batch in progress_bar:

        # Clear gradients left over from the previous step
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids'      : batch[0],
            'attention_mask' : batch[1],
            'labels'         : batch[2]
        }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer update
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. The original divided it by len(batch),
        # but `batch` is the 3-tuple (input_ids, attention_mask, labels), so
        # that divided by 3 rather than by the number of examples.
        progress_bar.set_postfix({'training_loss': '{:,.3f}'.format(loss.item())})

    # Checkpoint after every epoch
    torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')

    # f-string so the epoch number is printed instead of the literal "{epoch}"
    tqdm.write(f'\nEpoch {epoch}')

    # Average over the training batches. The original referenced an undefined
    # name `dataloader`, which raised NameError only after a full epoch.
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted): {val_f1}')
    

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=6151, style=ProgressStyle(description_width='in…

In [None]:
# Fresh model with the same architecture, ready to receive saved weights.
# `label_dict` is not defined anywhere in this notebook, so the number of
# output classes is taken from the distinct values in the label column.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=df.label.nunique(),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [None]:
# Load saved weights into the model; map_location remaps the stored tensors
# onto the CPU.
# NOTE(review): this file name does not follow the
# 'Models/BERT_ft_epoch{epoch}.model' pattern written by the training loop --
# presumably a checkpoint trained separately on a GPU; confirm it exists.
model.load_state_dict(torch.load('Models/finetuned_bert_epoch_1_gpu_trained.model',
                                map_location = torch.device('cpu')
                                ))

In [None]:
# Evaluate the loaded weights on the validation set; the average loss is discarded
_, predictions, true_vals = evaluate(dataloader_val)

In [None]:
# Per-class breakdown of correct predictions on the validation set
accuracy_per_class(predictions,true_vals)