In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaModel

  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Ask PyTorch to release the GPU memory its caching allocator is holding but not using.
torch.cuda.empty_cache()
import gc
# Placeholder: `del` any large, no-longer-needed variables here before collecting.
#del variables
# Run a full garbage-collection pass; the value shown below the cell is gc.collect()'s return value.
gc.collect()

0

In [38]:
def get_dataloader(sentences, labels, max_len, batch_size, tokenizer, device, shuffle=False):
    """Tokenize ``sentences`` and bundle them with ``labels`` into a ``DataLoader``.

    Args:
        sentences: Iterable of raw text strings.
        labels: Array-like exposing ``.tolist()`` (for example a pandas Series
            or a numpy array) holding one integer class id per sentence.
        max_len: Every sequence is truncated or padded to exactly this many
            token ids.
        batch_size: Number of samples per batch.
        tokenizer: Object exposing ``encode(text, max_length=..., padding=...,
            truncation=...)`` and, ideally, a ``pad_token_id`` attribute.
        device: Torch device the tensors are moved to before being wrapped.
        shuffle: Forwarded to ``DataLoader``; defaults to ``False`` so existing
            callers keep their sequential ordering.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    labels = labels.tolist()

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    input_ids = [
        tokenizer.encode(sent, max_length=max_len, padding='max_length', truncation=True)
        for sent in sentences
    ]

    # Build the mask from the tokenizer's real padding id. The previous
    # `token_id > 0` test is only right when the pad id is 0 (BERT); RoBERTa
    # pads with id 1 and uses id 0 for `<s>`, so it masked the wrong positions.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    attention_masks = [
        [int(token_id != pad_id) for token_id in ids]
        for ids in input_ids
    ]

    assert len(attention_masks) == len(input_ids)

    inputs = torch.tensor(input_ids).to(device)
    masks = torch.tensor(attention_masks).to(device)
    labels = torch.tensor(labels).to(device)

    data = TensorDataset(inputs, masks, labels)
    dataloader = DataLoader(data, batch_size=batch_size, shuffle=shuffle)

    return dataloader

In [3]:
class BERT_Arch(nn.Module):
    """Two-layer classification head on top of a BERT-style encoder.

    The encoder's pooled output is passed through
    ``Linear -> ReLU -> Dropout -> Linear -> LogSoftmax`` and the module
    returns log-probabilities over two classes.

    Args:
        bert: Encoder module. When called as ``bert(ids, attention_mask=mask)``
            its result must expose the pooled sentence representation at
            integer index 1 (true for both the tuple output and the
            ``ModelOutput`` object returned by HuggingFace BERT/RoBERTa models).
    """

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert

        # Width of the pooled output; falls back to 768 (BERT-base) when the
        # encoder does not expose `config.hidden_size`.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(0.1)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)
        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(512, 2)
        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities for the token ids ``sent_id`` and attention ``mask``."""
        # Index instead of tuple-unpacking: unpacking a dict-like ModelOutput
        # iterates over its *keys* (strings), whereas integer indexing returns
        # the pooled tensor for both tuple and ModelOutput results.
        outputs = self.bert(sent_id, attention_mask=mask)
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply log-softmax activation
        x = self.softmax(x)

        return x


In [7]:
# Tokenizer and encoder both come from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')

# Disabled alternative (RoBERTa):
#   tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#   bert = RobertaModel.from_pretrained('roberta-base')

# Freeze every encoder weight so that only layers added on top receive gradients.
for param in bert.parameters():
    param.requires_grad = False



In [72]:
# Load the training frame from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only read files from a trusted source.
traindata = pd.read_pickle("train.pkl")
# Map the string labels "true"/"false" to 1/0; replace() leaves any other value unchanged.
traindata['Class']=traindata['Class'].replace({"true": 1, "false": 0})

In [73]:
# Number of rows in the training frame.
print(len(traindata['Class']))

9755


In [75]:
# Split messages and labels into training and validation sets: 10% is held out
# (test_size=0.1), the split is stratified on the class column, and
# random_state=42 makes it repeatable.
# NOTE(review): train_test_split is already imported in the first cell; this import is redundant.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split( traindata['Messages'], traindata['Class'], test_size=0.1, random_state=42, stratify=traindata['Class'])


In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Using device {}'.format(device))

Using device cuda


In [78]:
# Use real mini-batches. The loss in this notebook is nn.NLLLoss(weight=...) with
# the default reduction='mean', which divides each batch's weighted loss by the
# sum of the class weights of the samples in that batch. With batch_size = 1 the
# single weight cancels out (w * l / w == l), so the class weights had no effect
# at all. Batches with more than one sample make the weighting effective and
# also cut the number of forward passes per epoch.
batch_size = 32
# maximum number of token ids per message (longer inputs are truncated)
max_len = 300
train_dataloader = get_dataloader(X_train, y_train, max_len, batch_size, tokenizer, device)
val_dataloader = get_dataloader(X_val, y_val, max_len, batch_size, tokenizer, device)



In [8]:
# Wrap the pre-trained encoder in the classification architecture and move the
# whole module to the selected device (Module.to returns the module itself).
model = BERT_Arch(bert).to(device)

In [80]:
# Use PyTorch's own AdamW: the copy that used to be exported by `transformers`
# is deprecated in favour of torch.optim.AdamW.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers
# implementation (torch's defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [81]:
from sklearn.utils.class_weight import compute_class_weight

# Compute "balanced" class weights (inversely proportional to class frequency).
# `classes` and `y` are passed by keyword: current scikit-learn releases declare
# them keyword-only, and the keyword form also works on older releases.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(traindata['Class']),
    y=traindata['Class'],
)

print("Class Weights:",class_weights)

Class Weights: [0.55533417 5.01800412]




In [82]:

# Class weights as a float32 tensor, created directly on the target device.
weights = torch.tensor(class_weights, dtype=torch.float, device=device)

# Weighted negative log-likelihood loss; it expects log-probabilities as input.
# NOTE(review): with the default reduction='mean' the weighted batch loss is
# divided by the sum of the weights of the samples in the batch, so for
# single-sample batches the class weight cancels out.
cross_entropy = nn.NLLLoss(weight=weights)

# (the number of training epochs is set next to the training loop)


In [83]:
# function to train the model
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``cross_entropy``, ``optimizer`` and ``device``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``total_preds`` is a numpy array with the model
        outputs of all batches concatenated along axis 0.
    """
    model.train()

    total_loss = 0

    # model outputs of every batch, collected as numpy arrays
    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 500 batches
        if step % 500 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # make sure the batch tensors live on the training device
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # forward pass and loss for the current batch
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # detach from the graph and move to CPU before storing
        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)

    # mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_dataloader)

    # list of per-batch arrays -> one array with all batches stacked along axis 0
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [84]:
# function for evaluating the model
def evaluate():
    """Compute loss and model outputs over ``val_dataloader`` without gradient tracking.

    Uses the notebook-level globals ``model``, ``val_dataloader``,
    ``cross_entropy`` and ``device``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``total_preds`` is a numpy array with the model
        outputs of all batches concatenated along axis 0.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # model outputs of every batch, collected as numpy arrays
    total_preds = []

    for batch in val_dataloader:

        # make sure the batch tensors live on the evaluation device
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():
            preds = model(sent_id, mask)

            # validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    # mean of the per-batch losses for this pass
    avg_loss = total_loss / len(val_dataloader)

    # list of per-batch arrays -> one array with all batches stacked along axis 0
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [85]:
# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

# Per-epoch loss history for training and validation.
train_losses, valid_losses = [], []
epochs = 15

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One training pass followed by one validation pass; only the losses are kept.
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights_freeze_bert.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')




In [9]:
# load weights of best model
#path = 'saved_weights_with_freeze.pt'
#path = 'saved_weights.pt'
path = 'saved_weights_freeze_bert.pt'
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [11]:
def test_data(df,max_len, tokenizer, device):
    sentences = df['Messages'].tolist()
    labels = df['Class'].tolist()
    
    
    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, max_length = max_len, pad_to_max_length=True,truncation=True)
        input_ids.append(encoded_sent)
        
    attention_masks = []
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
        
    assert len(attention_masks) == len(input_ids)
    
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)
    return inputs,masks,labels

In [14]:
# Load the test frame from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only read files from a trusted source.
testdata = pd.read_pickle("test.pkl")
# Map the string labels "true"/"false" to 1/0; replace() leaves any other value unchanged.
testdata['Class']=testdata['Class'].replace({"true": 1, "false": 0})

In [17]:
# get predictions for test data
# Switch to evaluation mode first: a freshly constructed nn.Module is in training
# mode, so without this the head's dropout stays active and predictions are
# random from run to run whenever this cell is executed without a preceding
# evaluate() call (e.g. after only loading saved weights).
model.eval()

predictions = []
with torch.no_grad():
    for i in range(len(testdata)):
        # progress marker every 1000 rows
        if (i % 1000) == 0:
            print(i)
        # tokenize a single row and run it through the model
        test_seq, test_mask, test_y = test_data(testdata.iloc[i:i+1], 300, tokenizer, device)
        preds = model(test_seq.to(device), test_mask.to(device))
        preds = preds.detach().cpu().numpy()
        # index of the highest log-probability is the predicted class
        preds = np.argmax(preds, axis=1)
        predictions.append(preds[0])

0







In [19]:
# Classification report comparing the test labels with the predictions, printed with 4 digits.
print(classification_report(testdata['Class'].tolist(), predictions,digits=4))

  

In [20]:
from sklearn.metrics import fbeta_score

In [21]:
# F-beta with beta=0.5 under each averaging mode, printed in the order micro, macro, weighted.
y_true = testdata['Class'].tolist()
for averaging in ('micro', 'macro', 'weighted'):
    print(fbeta_score(y_true, predictions, average=averaging, beta=0.5))



In [22]:
# F-beta with beta=2 under each averaging mode, printed in the order micro, macro, weighted.
y_true = testdata['Class'].tolist()
for averaging in ('micro', 'macro', 'weighted'):
    print(fbeta_score(y_true, predictions, average=averaging, beta=2))

