In [1]:
import pandas as pd
import csv
import numpy as np
import re
!pip install transformers
from transformers import AutoTokenizer
from transformers import BertConfig, BertModel
from torch import nn
import torch
from torch.utils.data import TensorDataset, DataLoader
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 11.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


### Preprocessing

In [2]:
# Load the CompLex single-word *test* split (tab-separated; quote characters are kept literally)
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_single_test.tsv",
        delimiter="\t",
        quoting=csv.QUOTE_NONE,
        encoding='utf-8',
    )
    # Missing cells become the literal string 'null'
    # (presumably to recover the word "null" that pandas parsed as NaN -- confirm)
    .replace(np.nan, 'null')
    # Keep only the model inputs and the regression target; every other column is dropped
    .loc[:, ['sentence', 'token', 'complexity']]
    # Lowercase both text columns
    .assign(
        token=lambda frame: frame['token'].str.lower(),
        sentence=lambda frame: frame['sentence'].str.lower(),
    )
)

# Pull each column out as a plain Python list
sentences, tokens, complexities = (
    df[column].tolist() for column in ('sentence', 'token', 'complexity')
)

In [3]:
# Show the lower-cased frame that the following cells turn into model inputs
df

Unnamed: 0,sentence,token,complexity
0,"but he, beckoning to them with his hand to be ...",hand,0.000000
1,"if i forget you, jerusalem, let my right hand ...",hand,0.197368
2,"the ten sons of haman the son of hammedatha, t...",hand,0.200000
3,let your hand be lifted up above your adversar...,hand,0.267857
4,"abimelech chased him, and he fled before him, ...",entrance,0.000000
...,...,...,...
912,"the report by mr philippe busquin, on behalf o...",dissemination,0.569444
913,section v - court of auditors (sec(2002) 405 -...,dec,0.535714
914,"- mr president, on 1 june, on the basis of inf...",radiological,0.546875
915,i would like to extend a warm welcome to this ...,sidi,0.571429


In [4]:
# Used for preprocessing
class preprocessing():
    """Clean a list of sentences and keep the aligned tokens and labels.

    Args:
        sentences: list of sentence strings. The list itself is not modified.
        tokens: list of target words, stored as given.
        complexities: sequence of numeric labels, converted to a tensor.

    Attributes:
        sentences: cleaned copy of ``sentences`` (URL-like spans removed,
            then punctuation removed).
        tokens: the ``tokens`` argument, untouched.
        complexity: ``torch.Tensor`` built from ``complexities``.
    """

    # Any character that is neither a word character nor whitespace
    _PUNCTUATION = re.compile(r'[^\w\s]')
    # Optional http(s):// prefix followed by dot-separated lowercase labels.
    # NOTE: this also matches non-URLs such as "e.g" or two words joined by a
    # full stop; the pattern is left as-is because changing it would change the
    # text that is fed to the model.
    _WEBSITE = re.compile(r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*')

    def __init__(self, sentences, tokens, complexities):
        self.tokens = tokens
        # Work on a private copy so the caller's list is not rewritten in place
        self.sentences = list(sentences)
        # Convert complexity list to tensor
        self.complexity = torch.Tensor(complexities)

        # URLs go first: stripping punctuation beforehand would delete the dots
        # that the website pattern needs in order to match.
        self.remove_websites()
        self.remove_punctuation()

    # Remove any punctuation
    def remove_punctuation(self):
        """Delete every punctuation character from each stored sentence."""
        self.sentences[:] = [self._PUNCTUATION.sub("", text) for text in self.sentences]

    # Remove website links
    def remove_websites(self):
        """Delete every span matched by the website pattern from each stored sentence."""
        self.sentences[:] = [self._WEBSITE.sub("", text) for text in self.sentences]

In [5]:
# Clean the sentences (URL-like spans first, then punctuation) and wrap the labels in a tensor
pp = preprocessing(sentences, tokens, complexities)

### Tokenizer and Apply Padding

In [6]:
# Load the "bert-base-uncased" tokenizer; it is reused for both the sentences and the target words
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

#### Sentences

In [7]:
# Encode the cleaned sentences: every sequence is padded or truncated to exactly 85 ids
encoded_sentences = tokenizer(
    pp.sentences,
    max_length=85,
    padding="max_length",
    truncation=True,
)
input_ids_sentences, attention_mask_sentences = (
    encoded_sentences[field] for field in ('input_ids', 'attention_mask')
)

#### Token

In [8]:
# Encode each target word as its own sequence, padded or truncated to exactly 85 ids
encoded_tokens = tokenizer(
    pp.tokens,
    max_length=85,
    padding="max_length",
    truncation=True,
)
input_ids_tokens, attention_mask_tokens = (
    encoded_tokens[field] for field in ('input_ids', 'attention_mask')
)

### Convert Test Set to DataLoaders

In [9]:
# Used to create dataloaders and tensor datasets
def dataloaders(sentences_id, sentence_masks, labels, token_id, token_mask, batch_size):
    """Bundle the encoded inputs and labels into a single DataLoader.

    Args:
        sentences_id: nested list of sentence input ids.
        sentence_masks: nested list of sentence attention masks.
        labels: list of target values.
        token_id: nested list of target-word input ids.
        token_mask: nested list of target-word attention masks.
        batch_size: number of examples per batch.

    Returns:
        A DataLoader (default ordering, no shuffling requested) whose batches
        are 5-tuples in the order
        (sentence ids, sentence mask, labels, token ids, token mask).
    """
    # The tuple order here fixes the field order of every batch
    fields = (sentences_id, sentence_masks, labels, token_id, token_mask)
    test_data = TensorDataset(*(torch.tensor(field) for field in fields))
    return DataLoader(dataset=test_data, batch_size=batch_size)

In [10]:
# Number of examples per evaluation batch
batch_size = 32

# Wrap the encoded sentences, target words and labels in one DataLoader
test_dataloader = dataloaders(
    sentences_id=input_ids_sentences,
    sentence_masks=attention_mask_sentences,
    labels=complexities,
    token_id=input_ids_tokens,
    token_mask=attention_mask_tokens,
    batch_size=batch_size,
)

In [11]:
# Class to create BERT model
class Bert_Model(torch.nn.Module):
    """Two-encoder BERT regressor for (sentence, target word) pairs.

    One BERT encoder reads the sentence and a second one reads the target
    word. Their pooled outputs are scaled, concatenated and passed through a
    single linear layer followed by a sigmoid, giving one value in (0, 1) per
    example.

    Args:
        input_dim: unused; kept so existing ``Bert_Model(768, config)`` calls
            keep working.
        config: ``BertConfig`` used to build both encoders. The encoders are
            constructed from the config only (no pretrained weights are
            loaded here).
        sent_weight: multiplier applied to the sentence encoder output.
        token_weight: multiplier applied to the target-word encoder output.
    """

    def __init__(self, input_dim, config, sent_weight=0.20, token_weight=0.80):
        super().__init__()

        # BERT models for sentence and token
        self.sent_emb = BertModel(config)
        self.token_emb = BertModel(config)

        # Plain floats (not parameters/buffers), so the state_dict layout is unchanged
        self.sent_weight = sent_weight
        self.token_weight = token_weight

        # Linear layer over the two concatenated pooled outputs; sized from the
        # config so a non-default hidden_size does not cause a shape mismatch
        self.cls = nn.Linear(2 * config.hidden_size, 1)
        # Sigmoid activation for linear layers
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent_id, sent_mask, token_id, token_mask):
        """Return predictions of shape (batch, 1).

        Args:
            sent_id: sentence input ids.
            sent_mask: sentence attention mask.
            token_id: target-word input ids.
            token_mask: target-word attention mask.
        """
        # Element [1] of the BertModel output is the pooled output
        sent_embed = self.sent_emb(sent_id, attention_mask=sent_mask)[1]
        sent_embed = sent_embed.view(sent_embed.shape[0], -1) * self.sent_weight

        token_embed = self.token_emb(token_id, attention_mask=token_mask)[1]
        token_embed = token_embed.view(token_embed.shape[0], -1) * self.token_weight

        # Concat the results from BERT models along the feature axis
        features = torch.cat([sent_embed, token_embed], dim=1)
        # Linear layer followed by sigmoid
        return self.sigmoid(self.cls(features))

In [12]:
# Used to calculate test loss
def evaluate(model, test_dataloader):
    """Run the model over every batch and accumulate the MAE loss.

    Relies on the notebook-level ``device`` and ``mae_loss`` being defined
    before it is called.

    Args:
        model: module called as ``model(sent_id, sent_mask, token_id, token_mask)``.
        test_dataloader: iterable of 5-tuples
            (sentence ids, sentence mask, labels, token ids, token mask).

    Returns:
        (test_loss, average_loss, total_loss) where ``test_loss`` is the loss
        tensor of the last batch, ``total_loss`` is the sum of the per-batch
        losses and ``average_loss`` is ``total_loss`` divided by the number of
        batches (a mean of batch means, not a per-sample mean).

    Raises:
        ZeroDivisionError: if the dataloader yields no batches.
    """
    print("---------------------------------------------------------------------------------------------------")
    print("Test Loss")
    print("---------------------------------------------------------------------------------------------------")

    model.eval()
    total_loss = 0
    count = 0

    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        # Loop through each test_dataloader batch
        for step, batch in enumerate(test_dataloader):
            # Retrieve data
            sent_id, sent_mask, labels, token_id, token_mask = tuple(b.to(device) for b in batch)
            # Retrieve prediction
            outputs = model(sent_id, sent_mask, token_id, token_mask)
            # Flatten to 1-D explicitly; a bare squeeze() would turn a batch of
            # one example into a 0-dim tensor, which has no shape[0]
            test_loss = mae_loss(outputs.reshape(-1), labels.reshape(-1))
            # Add loss to total loss
            total_loss += test_loss.item()
            # Print loss for every ten steps
            if step % 10 == 0:
                print("Test Step: " + str(step))
                print("Test Loss: " + str(test_loss.item()))
            count += 1

    # Calculate Average Loss
    average_loss = total_loss/count
    return test_loss, average_loss, total_loss

In [13]:
# Used to find MAE loss
def mae_loss(outputs, labels):
    """Sum of absolute errors divided by the size of the first dimension.

    For 1-D inputs this is the mean absolute error.

    Args:
        outputs: tensor of predictions. A 0-dim tensor is treated as a batch
            of one.
        labels: tensor of targets, broadcastable against ``outputs``.

    Returns:
        A 0-dim tensor holding ``sum(|outputs - labels|) / outputs.shape[0]``.
    """
    # Promote a scalar prediction to shape (1,) so shape[0] is always defined
    outputs = torch.atleast_1d(outputs)
    abs_error = torch.abs(torch.subtract(outputs, labels))
    return torch.div(torch.sum(abs_error), outputs.shape[0])

### Load Model

In [14]:
# Mount Google Drive at /content/drive (Colab-only) so the saved weights file can be read
from google.colab import drive
drive.mount('/content/drive')

# Location of the saved model parameters inside the mounted drive
path ='/content/drive/MyDrive/NLPProject/model.pth'

Mounted at /content/drive


In [15]:
# Model config; max_position_embeddings matches the max_length=85 used when tokenizing
config = BertConfig(
    max_position_embeddings = 85,
    hidden_act = 'relu',
    hidden_dropout_prob = 0.15,
    attention_probs_dropout_prob = 0.15,
    classifier_dropout = 0.25,
)
model = Bert_Model(768, config)

# Prefer the GPU, but fall back to CPU so the notebook also runs on CPU-only runtimes
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model parameters; map_location remaps tensors saved on another device onto `device`.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load(path, map_location=device))

model.to(device)

Bert_Model(
  (sent_emb): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(85, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.15, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.15, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [16]:
# Evaluate on the test loader; returns the last batch's loss, the mean of the
# per-batch losses and their sum
test_loss, average_loss, total_loss = evaluate(model, test_dataloader)
print()
print(f"Average Test Loss: {average_loss}")
print(f"Total Test Loss: {total_loss}")
print("---------------------------------------------------------------------------------------------------")

---------------------------------------------------------------------------------------------------
Test Loss
---------------------------------------------------------------------------------------------------
Test Step: 0
Test Loss: 0.07839058339595795
Test Step: 10
Test Loss: 0.061962638050317764
Test Step: 20
Test Loss: 0.03821107745170593

Average Test Loss: 0.09075745297917004
Total Test Loss: 2.6319661363959312
---------------------------------------------------------------------------------------------------
