In [None]:
import os
# Exported to the process environment before any CUDA work happens in this notebook.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented as a debugging switch that makes
# CUDA kernel launches synchronous — confirm it is still wanted for regular training runs.
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [None]:
import time
import numpy as np
import pandas as pd

import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertPreTrainedModel, BertModel
from transformers import get_linear_schedule_with_warmup

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score
#ConfusionMatrixDisplay
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:
from keras_preprocessing.sequence import pad_sequences

In [None]:
import matplotlib.pyplot as plt

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertPreTrainedModel, BertModel
from transformers import get_linear_schedule_with_warmup

In [None]:
# Ask TensorFlow for its GPU device name and announce it when it is the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')
# Bare expression so the value is shown as the cell output.
device_name

In [None]:
# Display whether PyTorch reports CUDA as available (the next cell picks `device` from this).
torch.cuda.is_available()

In [None]:
# Choose the torch device used by the rest of the notebook:
# CUDA when PyTorch reports it as available, the CPU otherwise.
if not torch.cuda.is_available():
    print('using the CPU')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('GPU in use:', torch.cuda.get_device_name(0))

In [None]:
# Every encoded sentence is truncated/padded to this many token ids in preprocessing().
MAX_LEN = 128 # max sequences length
# Number of examples per batch for both the training and the validation DataLoader.
batch_size = 32

In [None]:
# extra preprocessing steps
# prepend CLS and append SEP, truncate, pad

# labels_encoding = {'NO':0,'Political Attack':1,'Politicians':1,'Taunt':2}
# 
labels_encoding = {'N':0,'P':1,'O':1,'T':2}

# Tokenizers already loaded in this kernel, keyed by checkpoint name, so that the
# repeated preprocessing() calls (two per fold) do not reload the vocabulary each time.
_tokenizer_cache = {}


def _get_tokenizer(name='bert-base-multilingual-cased'):
    """Return the BertTokenizer for ``name``, loading it only on first use."""
    if name not in _tokenizer_cache:
        # NOTE(review): do_lower_case=True is kept from the original code, but the
        # checkpoint is a *cased* one — confirm that lower-casing the input is intended.
        _tokenizer_cache[name] = BertTokenizer.from_pretrained(name, do_lower_case=True)
    return _tokenizer_cache[name]


def preprocessing(df):
    """Turn a labelled data frame into fixed-length token-id rows and class ids.

    Args:
        df: DataFrame with a ``Text`` column (the sentences) and a ``Label``
            column whose values are keys of ``labels_encoding``.

    Returns:
        tuple: ``(encoded_sentences, labels)`` where ``encoded_sentences`` is the
        array returned by ``pad_sequences`` (``MAX_LEN`` ids per row, padded at
        the end with 0) and ``labels`` is a NumPy array of the mapped class ids.

    Raises:
        KeyError: if a value in ``df.Label`` has no entry in ``labels_encoding``.
    """
    sentences = df.Text.values

    try:
        labels = np.array([labels_encoding[label] for label in df.Label.values])
    except KeyError as err:
        # Same exception type as the plain dict lookup, but say what went wrong.
        raise KeyError(
            f"Label value {err.args[0]!r} has no entry in labels_encoding"
        ) from err

    tokenizer = _get_tokenizer()

    # encode() adds the special tokens and cuts every sentence to at most MAX_LEN ids.
    encoded_sentences = [
        tokenizer.encode(
            sent,
            add_special_tokens = True,
            truncation=True,
            max_length = MAX_LEN
        )
        for sent in sentences
    ]

    # Right-pad with 0 so every row has exactly MAX_LEN entries; attention_masks()
    # relies on padding being the only ids that are not > 0.
    encoded_sentences = pad_sequences(encoded_sentences, maxlen=MAX_LEN, dtype="long",
                            value=0, truncating="post", padding="post")
    return encoded_sentences, labels
    
def attention_masks(encoded_sentences):
    """Build one attention mask per encoded sentence.

    Each mask has the same length as its sentence and holds 1 where the token
    id is greater than 0 (a real token) and 0 elsewhere (padding).
    """
    return [
        [1 if token_id > 0 else 0 for token_id in row]
        for row in encoded_sentences
    ]

In [None]:
# Load the dataset CSV (path is relative to the notebook's working directory)
# and preview its first rows.
df = pd.read_csv("Sarah_Label1_Dup_Updated.csv")
df.head()

In [None]:
# Expose the 'New_Label' column under the name 'Label', which preprocessing() reads.
df = df.rename(columns={'New_Label': 'Label'})

In [None]:
# Binary view of the labels (N -> 0, every other class -> 1), stored in its own column.
# It is derived from 'Label' because the rename cell above has already turned
# 'New_Label' into 'Label', and 'Label' itself must keep its string codes:
# preprocessing() looks them up in labels_encoding ('N', 'P', 'O', 'T').
df['Binary_Label'] = df['Label'].map({'N':0,'P':1,'O':1,'T':1})

In [None]:
# Class balance check: number of rows for each distinct Label value.
df.Label.value_counts()

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
def compute_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    ``preds`` is indexed as (row, class score); ``labels`` is flattened before
    the comparison, so it may carry extra singleton dimensions.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / len(expected)

def run_train(epochs,train_dataloader,validation_dataloader):
    """Fine-tune the notebook-level ``model`` and report validation metrics per epoch.

    The function works on the globals ``model``, ``optimizer``, ``scheduler``
    and ``device``; all four must be defined before it is called.

    Args:
        epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches
            used for the gradient updates.
        validation_dataloader: Yields batches with the same layout; after every
            epoch a ``classification_report`` over all of them is printed.

    Returns:
        list: the mean training loss of each epoch, in epoch order.
    """
    losses = []
    for epoch in range(epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        start_train_time = time.time()
        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            if step%10 == 0:
                elapsed = time.time()-start_train_time
                print(f'{step}/{len(train_dataloader)} --> Time elapsed {elapsed}')

            # The model lives on `device`, so the batch tensors have to be moved there
            # as well; leaving them on the CPU fails as soon as a GPU is used.
            input_data, input_masks, input_labels = (t.to(device) for t in batch)

            model.zero_grad()

            # forward propagation; with labels supplied the first output element is the loss
            out = model(input_data,
                        token_type_ids = None,
                        attention_mask = input_masks,
                        labels = input_labels)

            loss = out[0]
            total_loss = total_loss + loss.item()

            # backward propagation
            loss.backward()

            # In-place variant; the un-suffixed clip_grad_norm is deprecated.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

            optimizer.step()
            # Advance the learning-rate schedule once per optimizer step; without
            # this call the schedule created next to the optimizer has no effect.
            scheduler.step()

        epoch_loss = total_loss/len(train_dataloader)
        losses.append(epoch_loss)
        print(f"Training took {time.time()-start_train_time}")

        # Validation
        start_validation_time = time.time()
        model.eval()
        preds = []
        evals = []
        for batch in validation_dataloader:
            eval_data, eval_masks, eval_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                out = model(eval_data,
                            token_type_ids = None,
                            attention_mask=eval_masks)

            # Bring results back to host memory as NumPy arrays so that the
            # argmax and the report work no matter which device was used.
            logits = out[0].detach().cpu().numpy()
            label_ids = eval_labels.detach().cpu().numpy()

            preds.extend(np.argmax(logits, axis=1).flatten().tolist())
            evals.extend(label_ids.flatten().tolist())

        print(f"Time elapsed: {time.time()-start_validation_time}")
        print(classification_report(evals,preds))

    return losses

In [None]:
import sys
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import random

seed_val = 42
epochs = 3

# A fixed random_state makes the fold assignment reproducible; seeding NumPy inside
# the loop is too late for that because the split is drawn before the first fold runs.
kf = KFold(n_splits=5, shuffle=True, random_state=seed_val)

# One entry per fold: the list of per-epoch mean training losses from run_train().
loss = []

# Name prefixes of the parameters in encoder layers 0-5, which stay frozen.
_frozen_prefixes = tuple(f"bert.encoder.layer.{i}." for i in range(6))


def _seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def _build_model():
    """Load a fresh pretrained 3-class mBERT classifier with encoder layers 0-5 frozen."""
    fresh_model = BertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels = 3,
        output_attentions = False,
        output_hidden_states = False, )
    for name, param in fresh_model.named_parameters():
        if name.startswith(_frozen_prefixes):
            param.requires_grad = False
    return fresh_model


for train_index, test_index in kf.split(df):

    train_encoded_sentences, train_labels = preprocessing(df.iloc[train_index])
    train_attention_masks = attention_masks(train_encoded_sentences)

    test_encoded_sentences, test_labels = preprocessing(df.iloc[test_index])
    test_attention_masks = attention_masks(test_encoded_sentences)

    train_inputs = torch.tensor(train_encoded_sentences)
    train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_attention_masks)

    validation_inputs = torch.tensor(test_encoded_sentences)
    validation_labels = torch.tensor(test_labels)
    validation_masks = torch.tensor(test_attention_masks)

    # data loader for training
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = SequentialSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # data loader for validation
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    # Release the previous fold's model and optimizer state before loading a new one.
    model = optimizer = scheduler = None

    # Seed before the model is created so the classifier head starts identically in every fold.
    _seed_everything(seed_val)

    # Every fold starts from the pretrained weights. Reusing one model across folds
    # would validate on rows it was trained on in an earlier fold and inflate the
    # reported metrics. After the loop, `model` is the model of the last fold.
    model = _build_model()
    model.to(device)

    optimizer = AdamW(model.parameters(),
                      lr = 3e-5,
                      eps = 1e-8,
                      weight_decay = 0.01
                    )

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # 10% * datasetSize/batchSize
                                                num_training_steps = total_steps)

    losses = run_train(epochs,train_dataloader,validation_dataloader)
    loss.append(losses)

In [None]:
# Write the weights (state_dict only, not the whole module) of the current global
# `model` to a file named 'mBERT' in the working directory.
torch.save(model.state_dict(),'mBERT')