In [1]:
# Downloading/Installing Gensim
# pip install --upgrade gensim

In [2]:
# Import necessary libraries
import gensim.downloader
word2vec_google = gensim.downloader.load('word2vec-google-news-300')

# Part 2

In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

[nltk_data] Downloading package punkt to C:\Users\Jun
[nltk_data]     Wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train = train.drop(['label-fine'], axis=1)
test = test.drop(['label-fine'], axis=1)
display(train)
display(test)

Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russ...
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real na...
3,1,What fowl grabs the spotlight after the Chines...
4,2,What is the full form of .com ?
...,...,...
5447,1,What 's the shape of a camel 's spine ?
5448,1,What type of currency is used in China ?
5449,4,What is the temperature today ?
5450,4,What is the temperature for cooking ?


Unnamed: 0,label-coarse,text
0,4,How far is it from Denver to Aspen ?
1,5,"What county is Modesto , California in ?"
2,3,Who was Galileo ?
3,0,What is an atom ?
4,4,When did Hawaii become a state ?
...,...,...
495,3,Who was the 22nd President of the US ?
496,1,What is the money they use in Zambia ?
497,4,How many feet in a mile ?
498,1,What is the birthstone of October ?


In [5]:
# Print unique labels in train and test dataset
train_labels = train['label-coarse'].sort_values().unique()
test_labels = test['label-coarse'].sort_values().unique()
print(f"Train labels: {train_labels}")
print(f"Test labels: {test_labels}")


Train labels: [0 1 2 3 4 5]
Test labels: [0 1 2 3 4 5]


## Re-classifying classes of original dataset

In [6]:
# Combine labels 2 and 5 to form single class 'OTHERS', denoted by label 2, for train and test dataset
train_mask = ((train['label-coarse']==2) | (train['label-coarse']==5))
test_mask = ((test['label-coarse']==2) | (test['label-coarse']==5))
train.loc[train_mask, 'label-coarse'] = 2
test.loc[test_mask, 'label-coarse'] = 2

display(train)
display(test)

Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russ...
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real na...
3,1,What fowl grabs the spotlight after the Chines...
4,2,What is the full form of .com ?
...,...,...
5447,1,What 's the shape of a camel 's spine ?
5448,1,What type of currency is used in China ?
5449,4,What is the temperature today ?
5450,4,What is the temperature for cooking ?


Unnamed: 0,label-coarse,text
0,4,How far is it from Denver to Aspen ?
1,2,"What county is Modesto , California in ?"
2,3,Who was Galileo ?
3,0,What is an atom ?
4,4,When did Hawaii become a state ?
...,...,...
495,3,Who was the 22nd President of the US ?
496,1,What is the money they use in Zambia ?
497,4,How many feet in a mile ?
498,1,What is the birthstone of October ?


In [7]:
train_labels_cleaned = train['label-coarse'].sort_values().unique()
test_labels_cleaned = test['label-coarse'].sort_values().unique()
print(f"Train labels: {train_labels_cleaned}")
print(f"Test labels: {test_labels_cleaned}")

Train labels: [0 1 2 3 4]
Test labels: [0 1 2 3 4]


In [8]:
from sklearn.model_selection import train_test_split

print("Original train size:", train.shape)

train_set, dev_set = train_test_split(train, test_size = 500)

print("New train size:", train_set.shape)
print("New development set size:", dev_set.shape)
display(train_set)
display(dev_set)

Original train size: (5452, 2)
New train size: (4952, 2)
New development set size: (500, 2)


Unnamed: 0,label-coarse,text
4875,4,When was Dubai 's first concrete house built ?
5328,4,What should the temperature be set at while ba...
1099,3,Who was the famous door-to-door brush salesman ?
4940,1,What type of exercise burns the most calories ?
1856,3,Who started the Dominos Pizza chain ?
...,...,...
4686,0,How can you tell when figs are ripe ?
3681,0,How do you do a topic outline ?
3324,4,How many states have a `` lemon law '' for new...
4520,4,How much did Mercury spend on advertising in 1...


Unnamed: 0,label-coarse,text
1553,3,What was the Christian name of the title chara...
1135,0,"What do the names Neil , Mary , and Anthony me..."
4554,0,Why is Rush 's 2112 called 2112 ?
4427,4,How many people own pets ?
5080,1,What are some of Australia 's native flora ?
...,...,...
4420,4,How many zip codes are there in the U.S. ?
1487,0,What are you hearing when you put a seashell t...
252,0,What is the origin of the name Katie ?
2175,3,What cheery fellow got the ZIP code 9971 from ...


## Neural Network Implementation

### Input Pre-processing
Architecture: Split sentences into words --> Apply word2vec embeddings --> Map embeddings to indices (word indexing) --> Padding --> Convert to PyTorch tensor --> Create DataLoader to load data for model training

In [9]:
# Function to split sentences into words and convert the column into a list of list of words

def split_text_into_words(df, column_name):
   # Split the text into words
   temp_df = df.copy()
   temp_df[column_name] = temp_df[column_name].str.split()
   
   # Convert the column into a list
   word_list = temp_df[column_name].tolist()
   
   # Return the list
   return word_list

In [10]:
train_wordList = split_text_into_words(train_set, 'text')
dev_wordList = split_text_into_words(dev_set, 'text')
test_wordList = split_text_into_words(test, 'text')


In [11]:
train_wordList

[['When', 'was', 'Dubai', "'s", 'first', 'concrete', 'house', 'built', '?'],
 ['What',
  'should',
  'the',
  'temperature',
  'be',
  'set',
  'at',
  'while',
  'baking',
  'Peachy',
  'Oat',
  'Muffins',
  '?'],
 ['Who', 'was', 'the', 'famous', 'door-to-door', 'brush', 'salesman', '?'],
 ['What', 'type', 'of', 'exercise', 'burns', 'the', 'most', 'calories', '?'],
 ['Who', 'started', 'the', 'Dominos', 'Pizza', 'chain', '?'],
 ['How',
  'many',
  'muscles',
  'does',
  'the',
  'average',
  'adult',
  'use',
  'when',
  'going',
  'for',
  'a',
  'walk',
  '?'],
 ['In', 'what', 'year', 'did', 'Thatcher', 'become', 'prime', 'minister', '?'],
 ['Which', 'two', 'states', 'enclose', 'Chesapeake', 'Bay', '?'],
 ['What', 'state', 'is', 'Niagara', 'Falls', 'located', 'in', '?'],
 ['Who', 'invented', 'television', '?'],
 ['Who', 'killed', 'JFK', '?'],
 ['What', 'is', 'the', 'chemical', 'reactivity', 'of', 'neon', '?'],
 ['How',
  'much',
  'is',
  'a',
  'Canadian',
  '1967',
  'twenty',
  'd

In [12]:
# Function to convert list of sentences (list of list of words) into a matrix format
def createMatrix(sentences, word2Idx):

    dataset = []

    wordCount = 0
    unknownWordCount = 0

    for sentence in sentences:
        wordIndices = []

        for word in sentence:
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = word2Idx['UNKNOWN_TOKEN']
                unknownWordCount += 1
            
            # Get the label and map to int
            wordIndices.append(wordIdx)

        dataset.append(wordIndices)

    return dataset

In [13]:
# Finding the longest sentence in the whole dataset (train, dev and test)
train_maxlen = train_set['text'].str.split().str.len().max()
dev_maxlen = dev_set['text'].str.split().str.len().max()
test_maxlen = test['text'].str.split().str.len().max()

print(train_maxlen, dev_maxlen, test_maxlen)

37 29 17


In [14]:
# Function to pad each sentence to a fixed length (max sentence length)
def padding(wordList):
    max_sentence_length = 37
    for sentence in wordList:
        num_zeros = max_sentence_length-len(sentence)
        sentence.extend([0]*num_zeros)
    return wordList


In [15]:
# Create label set and word set
labelSet = set()
wordSet = set()

# unique words and labels in data  
for dataset in [train_set, test, dev_set]:
    for index, row in dataset.iterrows():
        sentence = row['text']
        label = row['label-coarse']
        labelSet.add(label)
        for word in sentence.split():
            wordSet.add(word.lower())


embed_size = len(word2vec_google.vectors[0])

# Get all words in the word2vec model
w2v_dictionary = list(word2vec_google.key_to_index.keys())

wordEmbeddings = []
# adding vector for padding token
wordEmbeddings.append(np.zeros(embed_size))
# adding vector for unknown vector intialised with uniform distribution
wordEmbeddings.append(np.random.uniform(-0.25, 0.25, embed_size))

# Initialise dictionary to map each word to a unique index
word2Idx = {}

# add padding+unknown
word2Idx["PADDING_TOKEN"] = len(word2Idx)
word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)

# adding vector for words present in the CoNLL2003 dataset
for word in w2v_dictionary:
    try:
        if word.lower() in wordSet:
            embedding_vector = word2vec_google.get_vector(word)
            if embedding_vector is not None:
                wordEmbeddings.append(embedding_vector)
                word2Idx[word] = len(word2Idx)
    except Exception as e:
        pass

wordEmbeddings = np.array(wordEmbeddings)

# format: [[padded wordindices], [caseindices], [padded char indices], [label indices]]

In [16]:
# Converting list of word indices and labels into PyTorch tensors
trainSentences = torch.tensor(padding(createMatrix(train_wordList, word2Idx)))
devSentences = torch.tensor(padding(createMatrix(dev_wordList, word2Idx)))
testSentences = torch.tensor(padding(createMatrix(test_wordList, word2Idx)))

trainLabels = torch.tensor(train_set['label-coarse'].tolist())
devLabels = torch.tensor(dev_set['label-coarse'].tolist())
testLabels = torch.tensor(test['label-coarse'].tolist())

In [17]:
# Create DataLoader for batched model training

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
   def __init__(self, data, labels):
       self.data = data
       self.labels = labels

   def __len__(self):
       return len(self.data)

   def __getitem__(self, idx):
       return self.data[idx], self.labels[idx]

# Create DataLoader
train_dataloader = DataLoader(CustomDataset(trainSentences, trainLabels), batch_size=64, shuffle=True)
dev_dataloader = DataLoader(CustomDataset(devSentences, devLabels), batch_size=64, shuffle=True)


In [18]:
trainSentences.dtype

torch.int64

In [19]:
# Early stop as regularization to prevent overfitting (stops model training when dev set stops improving)

class EarlyStopper:
    def __init__(self, patience=10, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.max_accuracy = -np.inf

    def early_stop(self, accuracy):
        if accuracy > self.max_accuracy:
            self.max_accuracy = accuracy
            self.counter = 0
        elif accuracy < (self.max_accuracy - self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [20]:
wordEmbeddings.shape

(22955, 300)

### BiLSTM Implementation

In [21]:
# Define your Bi-LSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, hidden_size, num_classes,wordEmbeddings,aggregation_method, dropout_rate):
        super(BiLSTMModel, self).__init__()  # Call the superclass's __init__ method

        self.wordEmbeddings = wordEmbeddings
        self.hidden_size = hidden_size
        self.aggregation_method = aggregation_method
        self.dropout_rate = dropout_rate

        self.embedding = nn.Embedding.from_pretrained(embeddings=self.wordEmbeddings,freeze=True,padding_idx=0)
        self.dropout = nn.Dropout(p=self.dropout_rate)

        self.bilstm = nn.LSTM(input_size=self.wordEmbeddings.shape[1], hidden_size=self.hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional
        self.softmax = nn.Softmax(dim=1) 


    def forward(self, x):
        x = x.to(torch.long)  # Convert the data type of the inputs        
        embed_out = self.embedding(x)
        embed_out = embed_out.to(torch.float32)
        lstm_out, _ = self.bilstm(embed_out)

        # Create a mask that is the same size as the input tensor
        mask = (x != 0).float()
        # Apply the mask to the output of the LSTM layer
        lstm_out = lstm_out * mask.unsqueeze(-1)

        if self.aggregation_method == 'avg':
           agg_output = torch.sum(lstm_out, dim=1) / torch.sum(mask, dim=1).unsqueeze(-1)
        elif self.aggregation_method == 'max':
            agg_output = torch.max(lstm_out, dim=1)[0]

        fc_out = self.fc(agg_output)
        softmax_out = self.softmax(fc_out)
        
        return softmax_out

### Machine Learning Model training

In [22]:
# Creating function for training of machine learning model
def train(model, train_dataloader, dev_dataloader,criterion, optimizer, epochs, early_stopping):

   train_accs = []
   dev_accs = []

   for epoch in range(epochs):

      model.train()
      correct = 0
      total = 0      
      for inputs,labels in train_dataloader:

         # Zero the parameter gradients
         optimizer.zero_grad()

         # Forward pass
         outputs = model(inputs)
         loss = criterion(outputs, labels)

         preds = torch.argmax(outputs,dim=1)
         
         # Backward pass and optimize
         loss.backward()
         optimizer.step()

         # Calculate accuracy
         correct += torch.sum(preds == labels)
         total += labels.size(0)
      train_accuracy = correct.double()/total
      train_accs.append(train_accuracy)

      model.eval()
      correct = 0
      total = 0
      with torch.no_grad():
         for inputs,labels in dev_dataloader:

            # Forward pass
            outputs = model(inputs)

            preds = torch.argmax(outputs,dim=1)

            # Calculate accuracy
            correct += torch.sum(preds == labels)
            total += labels.size(0)

      dev_accuracy = correct.double()/total
      dev_accs.append(dev_accuracy)
      
      print(f"Epoch: {epoch+1}, Train Acc: {train_accuracy}, Dev Acc: {dev_accuracy}")

      if early_stopping.early_stop(dev_accuracy):
         print(f"Early Stopping at Epoch:{epoch+1}")
         break

   return model,train_accuracy,dev_accuracy

### Initialise model

#### Average pooling aggregation method

In [31]:
avg_pool_blstm = BiLSTMModel(hidden_size=256, num_classes=5, wordEmbeddings=torch.tensor(wordEmbeddings), aggregation_method='avg', dropout_rate=0.5)

early_stopper = EarlyStopper()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(avg_pool_blstm.parameters(), lr=0.001)

In [32]:
avg_pool_blstm, train_acc,val_acc = train(avg_pool_blstm, train_dataloader, dev_dataloader, criterion=loss_fn, early_stopping=early_stopper, epochs=100, optimizer=optimizer)

Epoch: 1, Train Acc: 0.4555735056542811, Dev Acc: 0.576
Epoch: 2, Train Acc: 0.6924474959612278, Dev Acc: 0.714
Epoch: 3, Train Acc: 0.7691841680129241, Dev Acc: 0.73
Epoch: 4, Train Acc: 0.7728190630048465, Dev Acc: 0.74
Epoch: 5, Train Acc: 0.8309773828756059, Dev Acc: 0.788
Epoch: 6, Train Acc: 0.8412762520193862, Dev Acc: 0.812
Epoch: 7, Train Acc: 0.8422859450726979, Dev Acc: 0.808
Epoch: 8, Train Acc: 0.8729806138933764, Dev Acc: 0.824
Epoch: 9, Train Acc: 0.8923667205169629, Dev Acc: 0.85
Epoch: 10, Train Acc: 0.9040791599353797, Dev Acc: 0.838
Epoch: 11, Train Acc: 0.869345718901454, Dev Acc: 0.724
Epoch: 12, Train Acc: 0.8836833602584814, Dev Acc: 0.838
Epoch: 13, Train Acc: 0.9083198707592892, Dev Acc: 0.86
Epoch: 14, Train Acc: 0.9222536348949919, Dev Acc: 0.844
Epoch: 15, Train Acc: 0.9260904684975767, Dev Acc: 0.852
Epoch: 16, Train Acc: 0.9339660743134087, Dev Acc: 0.856
Epoch: 17, Train Acc: 0.9384087237479806, Dev Acc: 0.856
Epoch: 18, Train Acc: 0.9125605815831987, Dev

#### Max pooling aggregation method

In [33]:
max_pool_blstm = BiLSTMModel(hidden_size=256, num_classes=5, wordEmbeddings=torch.tensor(wordEmbeddings), aggregation_method='max', dropout_rate=0.5)

early_stopper = EarlyStopper()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(max_pool_blstm.parameters(), lr=0.001)

In [34]:
max_pool_blstm, train_acc,val_acc = train(max_pool_blstm, train_dataloader, dev_dataloader, criterion=loss_fn, early_stopping=early_stopper, epochs=100, optimizer=optimizer)

Epoch: 1, Train Acc: 0.470718901453958, Dev Acc: 0.632
Epoch: 2, Train Acc: 0.7243537964458805, Dev Acc: 0.718
Epoch: 3, Train Acc: 0.7938206785137318, Dev Acc: 0.734
Epoch: 4, Train Acc: 0.8295638126009693, Dev Acc: 0.788
Epoch: 5, Train Acc: 0.8515751211631664, Dev Acc: 0.826
Epoch: 6, Train Acc: 0.8764135702746365, Dev Acc: 0.848
Epoch: 7, Train Acc: 0.9075121163166397, Dev Acc: 0.828
Epoch: 8, Train Acc: 0.9174071082390953, Dev Acc: 0.848
Epoch: 9, Train Acc: 0.9254846526655897, Dev Acc: 0.85
Epoch: 10, Train Acc: 0.928513731825525, Dev Acc: 0.822
Epoch: 11, Train Acc: 0.941437802907916, Dev Acc: 0.844
Epoch: 12, Train Acc: 0.9489095315024233, Dev Acc: 0.866
Epoch: 13, Train Acc: 0.952140549273021, Dev Acc: 0.836
Epoch: 14, Train Acc: 0.9539579967689822, Dev Acc: 0.858
Epoch: 15, Train Acc: 0.9513327948303716, Dev Acc: 0.86
Epoch: 16, Train Acc: 0.9559773828756059, Dev Acc: 0.686
Epoch: 17, Train Acc: 0.943659127625202, Dev Acc: 0.86
Epoch: 18, Train Acc: 0.9612277867528272, Dev Ac

### Machine Learning Model evaluation
Perform ML model evaluation by running on test dataset and predicting the labels

In [35]:
test_dataloader = DataLoader(CustomDataset(testSentences, testLabels), batch_size=64, shuffle=True)

In [36]:
def test(model, test_dataloader):
   model.eval()
   correct = 0
   total = 0
   with torch.no_grad():
       for inputs, labels in test_dataloader:
           outputs = model(inputs)
           preds = torch.argmax(outputs, dim=1)
           correct += torch.sum(preds == labels)
           total += labels.size(0)
   test_accuracy = correct.double() / total
   return test_accuracy

In [37]:
test_accuracy = test(avg_pool_blstm, test_dataloader)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.906


In [38]:
test_accuracy = test(max_pool_blstm, test_dataloader)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.902
