In [1]:
import pandas as pd
import torch
import numpy as np

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook may draw from.

    The same value is applied to Python's built-in ``random`` module, NumPy's
    global RNG and PyTorch (the default generator plus all CUDA devices).

    Args:
        seed: Integer seed handed to each generator.
    """
    # Imported locally because the notebook's import cell does not bring in
    # `random`; previously this seeding step was commented out for that reason.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented by PyTorch as safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

set_seed(100)

In [3]:
# Training file consumed by get_data below.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — confirm before running elsewhere.
path_to_datasource = '/content/drive/MyDrive/CS728_Assgn3/AnswerTypeInference_Train_Data.txt'

In [4]:
def get_data(path):
    """Parse a label/question file into three parallel lists.

    Every non-blank line is expected to look like ``COARSE:fine question text``:
    the first whitespace-delimited token is the label, the rest is the question.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``(question, coarse_label, fine_label)`` tuple of equally long lists.
        Each question has its line terminator and then its final character
        removed (presumably the trailing punctuation mark — verify against the
        data file); whitespace preceding that character is kept.

    Raises:
        ValueError: If a non-blank line holds a label but no question text.
        IndexError: If a label contains no ``:`` separator.
    """
    coarse_label = []
    fine_label = []
    question = []
    with open(path) as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) hold no sample.
            if not line.strip():
                continue
            label, ques = line.split(maxsplit=1)
            label_parts = label.split(':')
            coarse_label.append(label_parts[0])
            fine_label.append(label_parts[1])
            # Strip the terminator explicitly instead of slicing off a fixed two
            # characters, so a last line without a newline keeps its text intact.
            ques = ques.rstrip('\n')
            question.append(ques[:-1])
    return question, coarse_label, fine_label

# Parse the training file into three parallel lists.
question, coarse_label, fine_label = get_data(path_to_datasource)

# One row per question, with its coarse and fine answer-type labels alongside.
df = pd.DataFrame(
    {
        'question': question,
        'coarse_label': coarse_label,
        'fine_label': fine_label,
    }
)
df

Unnamed: 0,question,coarse_label,fine_label
0,How did serfdom develop in and then leave Russia,DESC,manner
1,What films featured the character Popeye Doyle,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY,animal
4,What is the full form of .com,ABBR,exp
...,...,...,...
5447,What 's the shape of a camel 's spine,ENTY,other
5448,What type of currency is used in China,ENTY,currency
5449,What is the temperature today,NUM,temp
5450,What is the temperature for cooking,NUM,temp


In [5]:
# Number of distinct fine-grained labels present in the training frame.
len(df['fine_label'].value_counts())

47

In [6]:
# Inspect every training question whose fine label is 'instru'.
df[df['fine_label'] == 'instru']

Unnamed: 0,question,coarse_label,fine_label
256,What instrument is Ray Charles best known for ...,ENTY,instru
457,What musical instrument did Prewitt play in Ja...,ENTY,instru
1012,What musical instrument did Sherlock Holmes play,ENTY,instru
1247,Ray Charles plays which instrument,ENTY,instru
1361,What musical instrument did Prewitt play in Ja...,ENTY,instru
1834,What kind of guitar did Jimi Hendrix play,ENTY,instru
1853,Musician Ray Charles plays what instrument,ENTY,instru
2312,What do West Indian steel bands use as instrum...,ENTY,instru
3190,Ray Charles is best known for playing what ins...,ENTY,instru
3897,What instrument does Benny Carter play,ENTY,instru


In [7]:
# Pick the device for training and inference: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Load the bert-base-uncased tokenizer and encoder; the encoder is used only as a
# feature extractor for pre-trained BERT embeddings (see get_cls_list).
# NOTE(review): the install below is unpinned and sits in the middle of the notebook —
# consider a pinned `%pip install` in a top cell.
!pip install transformers
from transformers import BertTokenizerFast, BertModel
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# NOTE(review): output_hidden_states is requested here, but get_cls_list only reads outputs[0].
emb_gen_model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

# Move the encoder to the selected device (as the last expression this also displays the module).
emb_gen_model.to(device)

In [9]:
def get_cls_list(question):
  """Embed each question with the BERT encoder and keep one vector per question.

  Questions are encoded one at a time with the module-level ``tokenizer`` and
  ``emb_gen_model`` on ``device``, with gradient tracking disabled.

  Args:
    question: Iterable of question strings.

  Returns:
    A list with one tensor per question: the vector at position 0 of the
    first sequence in the model's first output (the last-layer [CLS]
    embedding), with a leading dimension of size 1 added so that the
    entries can later be concatenated along dim 0.
  """
  embeddings = []

  # No gradients are needed: the encoder is only used to extract features.
  with torch.no_grad():
    for text in question:
      encoded = tokenizer(text, return_tensors='pt', padding=True).to(device)
      model_out = emb_gen_model(**encoded)

      # First output element -> first (only) sequence -> first token.
      cls_vector = model_out[0][0][0]
      embeddings.append(cls_vector.unsqueeze(dim=0))

  return embeddings

cls_list = get_cls_list(question)

In [10]:
# Encode the fine labels as integers, then hold out 10% of the samples for validation.

from sklearn import preprocessing
# The encoder is fitted on all training-file labels so every class gets an id.
le = preprocessing.LabelEncoder().fit(fine_label)


from sklearn.model_selection import train_test_split
rand_seed = 42

# Stratify on the fine label so both splits follow the same label distribution.
cls_list_train, cls_list_val, fine_label_train, fine_label_val = train_test_split(
    cls_list,
    fine_label,
    test_size=0.1,
    random_state=rand_seed,
    stratify=fine_label,
)

In [11]:
# NOTE: despite the "input_ids" names, these tensors hold the stacked [CLS]
# embeddings produced by get_cls_list, not token ids.

# Integer class ids for each split, produced by the fitted label encoder.
train_target = torch.tensor(le.transform(fine_label_train))
val_target = torch.tensor(le.transform(fine_label_val))

# Per-question embeddings concatenated along dim 0 into one matrix per split.
train_input_ids = torch.cat(cls_list_train, dim=0)
val_input_ids = torch.cat(cls_list_val, dim=0)

In [12]:
# Sanity checks on the training tensors. Only the last expression of a cell is
# displayed, so the size on the first line is computed but not shown.
train_input_ids.size()
# Largest encoded class id in the training split.
torch.max(train_target)

tensor(46)

In [13]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size used while training.
train_batch_size = 512

# Pair every embedding with its encoded label.
train_dataset = TensorDataset(train_input_ids, train_target)
val_dataset = TensorDataset(val_input_ids, val_target)

# Training batches are reshuffled on every pass over the data.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)

In [14]:
# Width of the classifier's single hidden layer.
hidden_dim = 1000
param = {
    # Input width: size of the last dimension of one training feature vector.
    'i_d' : train_dataset[0][0].size(-1),
    'h_d' : hidden_dim,
    # Output width: largest encoded class id in the training targets, plus one.
    # Converted to a plain int because layer sizes are documented as ints;
    # previously a 0-dim tensor was handed to nn.Linear.
    'o_d' : int(torch.max(train_target).item()) + 1,
}

In [15]:
#Defining the neural network model
import torch.nn as nn

class ATI_Net(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        param: Dict of layer sizes with keys ``'i_d'`` (input width),
            ``'h_d'`` (hidden width) and ``'o_d'`` (output width).
    """

    def __init__(self, param: dict):
        super().__init__()
        self.param = param

        in_dim = self.param['i_d']
        hid_dim = self.param['h_d']
        out_dim = self.param['o_d']

        layers = [
            nn.Linear(in_dim, hid_dim),
            nn.ReLU(),
            nn.Linear(hid_dim, out_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, input):
        """Return the raw (un-normalised) class scores for ``input``."""
        scores = self.model(input)
        return scores

# Instantiate the classifier on the selected device. `Module.to` returns the
# module itself, so the trailing expression still displays its layer summary.
model = ATI_Net(param).to(device)
model

ATI_Net(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=1000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1000, out_features=47, bias=True)
  )
)

In [16]:
# Hyper-parameters
# (the training batch size is fixed where the training DataLoader is created)

import torch.optim as opt

# Adam over every parameter of the classifier.
learn_rate = 1e-3
optim = opt.Adam(params=model.parameters(), lr=learn_rate)

# Cross-entropy summed over the batch (not averaged); the training and
# evaluation code divides the accumulated total by the dataset size itself.
criterion = nn.CrossEntropyLoss(reduction='sum')

In [17]:
  # Smoke test: run exactly one mini-batch through a full optimisation step so the
  # batch and output shapes can be inspected in the following cells.
  # NOTE: this is a real update — the model's weights are changed here, before
  # the epoch loop starts.
  for X, label in train_data_loader:
    # Move both tensors to the model's device, as the training and evaluation
    # loops do, instead of relying on X already living there.
    X = X.to(device)
    label = label.to(device)

    output = model(X)
    loss = criterion(output, label)

    optim.zero_grad()
    loss.backward()
    optim.step()
    break

In [18]:
# Shape of the feature batch used in the single optimisation step above.
X.size()

torch.Size([512, 768])

In [19]:
# Shape of the model output for that batch.
output.size()

torch.Size([512, 47])

In [20]:
def eval_model(model, d_set):
  """Evaluate ``model`` on ``d_set`` and report accuracy, loss and predictions.

  Relies on the module-level ``device``, ``criterion`` and label encoder ``le``.
  The model is switched to eval mode and is left in that mode on return.

  Args:
    model: Network to evaluate.
    d_set: Dataset yielding ``(features, label)`` pairs.

  Returns:
    A 3-tuple ``(accuracy, loss, labels)``:
      * accuracy - count of correct predictions divided by ``len(d_set)``
        (a 0-dim tensor, since the count is accumulated as a tensor);
      * loss - accumulated ``criterion`` value divided by ``len(d_set)``;
        this is a per-sample mean only while ``criterion`` sums over the batch;
      * labels - predicted class ids mapped back to label strings by ``le``.
  """
  loader = torch.utils.data.DataLoader(d_set, batch_size=512, shuffle=False)
  n_samples = len(d_set)

  batch_preds = []
  n_correct = 0
  loss_total = 0

  model.eval()
  with torch.no_grad():
    for X, label in loader:
      X = X.to(device)
      label = label.to(device)

      output = model(X)
      # Index of the highest score along the last dimension = predicted class id.
      predicted = torch.max(output, dim=-1).indices

      n_correct += torch.sum(predicted == label.flatten())
      batch_preds.append(predicted.flatten())
      loss_total += criterion(output, label).item()

  pred_labels = torch.cat(batch_preds, dim=-1)
  accuracy = n_correct / n_samples
  mean_loss = loss_total / n_samples
  return accuracy, mean_loss, le.inverse_transform(pred_labels.to('cpu'))


In [21]:
def train_eval_1_epoch(model, epoch):
  """Train ``model`` for one pass over the training loader, then report losses.

  Uses the module-level ``train_data_loader``, ``device``, ``criterion`` and
  ``optim``. After the pass, the model is evaluated on ``val_dataset`` and one
  line with the training and validation loss is printed.

  Args:
    model: Network to train (updated in place through ``optim``).
    epoch: Epoch number, used only in the printed message.
  """
  # Loss accumulated over all batches of this epoch.
  running_loss = 0

  model.train()
  for X, label in train_data_loader:
    X, label = X.to(device), label.to(device)

    optim.zero_grad()
    batch_loss = criterion(model(X), label)
    batch_loss.backward()
    optim.step()

    running_loss += batch_loss.item()

  # Only the loss component of the evaluation result is reported here.
  val_loss = eval_model(model, val_dataset)[1]
  train_loss = running_loss / len(train_dataset)

  print(f"For epoch {epoch}, training loss is {train_loss} and evaluation loss is {val_loss}.")


In [22]:
# Number of full passes over the training data.
num_epochs = 25

# Each iteration trains on every mini-batch once and prints the training and
# validation loss for that epoch.
for epoch_i in range(num_epochs):

  print(f"Epoch number {epoch_i}")
  train_eval_1_epoch(model, epoch_i)


Epoch number 0
For epoch 0, training loss is 2.9447884759852414 and evaluation loss is 2.633495330810547.
Epoch number 1
For epoch 1, training loss is 2.413878710280619 and evaluation loss is 2.2703651176704156.
Epoch number 2
For epoch 2, training loss is 2.0387452565258104 and evaluation loss is 1.979385976826315.
Epoch number 3
For epoch 3, training loss is 1.745997948691255 and evaluation loss is 1.746841577383188.
Epoch number 4
For epoch 4, training loss is 1.5169201060503879 and evaluation loss is 1.5715846037253356.
Epoch number 5
For epoch 5, training loss is 1.341296003538397 and evaluation loss is 1.4221541855361435.
Epoch number 6
For epoch 6, training loss is 1.1905650322747336 and evaluation loss is 1.3213162474579863.
Epoch number 7
For epoch 7, training loss is 1.076750433790018 and evaluation loss is 1.261642078776936.
Epoch number 8
For epoch 8, training loss is 0.9859147132099088 and evaluation loss is 1.1615552028893552.
Epoch number 9
For epoch 9, training loss is 

In [23]:
# Number of samples in the training and validation splits.
print(len(train_dataset))
print(len(val_dataset))

4906
546


In [24]:
# Combine the training and validation datasets into one; the printed length
# is the sum of the two split sizes.
train_eval = train_dataset + val_dataset
print(len(train_eval))

5452


In [25]:
# Accuracy of the trained model on the training split (loss and predictions discarded).
acc, _, _ = eval_model(model, train_dataset)
(acc)

tensor(0.9130, device='cuda:0')

In [26]:
# Accuracy on the combined training + validation data.
acc, _, _ = eval_model(model, train_eval)
acc

tensor(0.8986, device='cuda:0')

In [27]:
# Test file, parsed with the same get_data helper as the training file.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — confirm before running elsewhere.
path_to_test_data = '/content/drive/MyDrive/CS728_Assgn3/AnswerTypeInference_Test_Data.txt'
question_test, coarse_label_test, fine_label_test = get_data(path_to_test_data)

In [28]:
# One [CLS] embedding per test question, from the same encoder used for training.
cls_list_test = get_cls_list(question_test)

In [29]:
# Test dataset: encoded labels and stacked [CLS] embeddings, paired per question.
# The label encoder fitted on the training labels is reused for the test labels.
test_target = torch.tensor(le.transform(fine_label_test))
test_input_ids = torch.cat(cls_list_test, dim=0)

test_dataset = TensorDataset(test_input_ids, test_target)

In [30]:
# Evaluate on the test set; y_preds holds the predicted fine labels as strings
# (eval_model maps class ids back through the label encoder).
acc, _, y_preds = eval_model(model, test_dataset)
print("Accuracy of model on test dataset is", acc)

Accuracy of model on test dataset is tensor(0.8000, device='cuda:0')


In [31]:
# Derive each predicted coarse label from its predicted fine label, using the
# (fine, coarse) pairs observed in the training data.
lst = list(zip(fine_label, coarse_label))
mapping = set(lst)

# The fine label 'other' is paired with more than one coarse class; its LOC and
# NUM pairings are dropped so that the remaining pairing decides the coarse class.
# `discard` is used so a training file lacking one of these pairs does not
# raise KeyError.
mapping.discard(('other', 'LOC'))
mapping.discard(('other', 'NUM'))

# Sorting makes the lookup deterministic even if some other fine label were
# still paired with several coarse classes (set iteration order is not stable).
lookup = dict(sorted(mapping))

y_coarse = [lookup[pred] for pred in y_preds]

In [32]:
# Write one row per test question: predicted coarse label, predicted fine label, question text.
test_df = pd.DataFrame(zip(y_coarse, y_preds, question_test))
# No header row and no index column are written.
test_df.to_csv("test_predictions.csv",index = False, header = False)