# Quiz 1 : Classification task with CNN & BiLSTM

In [1]:
enter_name = "Kyi Thin Nu st124087"

In [2]:
import pandas as pd
import torch, torchdata, torchtext
from torch import nn
import time
import os

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Fix the PyTorch RNG seed so results are comparable after a kernel restart.
SEED = 2422
torch.manual_seed(SEED)
# Ask cuDNN for deterministic algorithms (only matters when running on CUDA).
torch.backends.cudnn.deterministic = True

cpu


In [3]:
torch.__version__

'2.1.2'

In [4]:
torchtext.__version__

'0.16.2'

## Load the given dataset

1. Create a variable to your dataset PATH *example:  ./data/*
2. Load the csv files using pandas 



In [5]:
# Directory holding the quiz CSV files; change this one constant to relocate the data.
DATA_PATH = "./data"

# Build both file paths from DATA_PATH so the constant is actually honoured.
train_data_raw = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test_data_raw = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))


In [6]:
train_data_raw.shape

(7613, 5)

In [7]:
test_data_raw.shape

(3263, 4)

In [8]:
train_data_raw.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
## Lets analyze the data a little

#print and show how many unique classes are in the target

# Distinct label values present in the training targets.
classes = set(train_data_raw.target)
# Number of distinct labels; reused later as the models' output dimension.
num_classes = len(classes)
print(num_classes)

2


In [10]:
assert num_classes > 1

In [11]:
##lets see how many columns are there
#print the columns of the train_data_raw

#write your code here
print(train_data_raw.columns)
len(train_data_raw.columns)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


5

1. Let's remove the `id`, `keyword` and `location` columns. We only want to focus on the text and the target label.
2. Let's split off part of the training data as a validation set.

In [88]:
type(train_data_raw)

pandas.core.frame.DataFrame

In [89]:
# NOTE(review): scratch cell (its execution count is out of order). The name `test` is
# rebound to a PD_DATASET in a later cell, so this frame is never used downstream.
test = train_data_raw.drop(['id'], axis = 1)

In [12]:
SPLIT_PER = 2 #percentage of split for validation set 2 = 2%
split =  int(len(train_data_raw) * (SPLIT_PER/100))

#drop the id, keyword and location columns: only the text and its label are needed
dropped_train = train_data_raw.drop(columns=['id', 'keyword', 'location'])
print(dropped_train.shape)

# Slice at an explicit positional boundary. Negative slicing breaks when split == 0:
# `[:-0]` is empty and `[-0:]` is the whole frame.
split_at = len(dropped_train) - split
train_data = dropped_train.iloc[:split_at]
valid_data = dropped_train.iloc[split_at:]

assert train_data.shape == (len(train_data_raw) - split, 2)
assert valid_data.shape == (split, 2)

(7613, 2)


In [13]:
print(train_data_raw.shape)
print("After dropping columns and spliting!")
print(train_data.shape, valid_data.shape)

(7613, 5)
After dropping columns and spliting!
(7461, 2) (152, 2)


## Let's tokenize the data

In [14]:
from torchtext.data.utils import get_tokenizer

# Tokenizer built by torchtext's get_tokenizer on top of the spaCy `en_core_web_sm` pipeline.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
 
# Quick smoke test: the token list is displayed as the cell output.
tokens = tokenizer("We are learning torchtext in AIT!")  #some test
tokens

['We', 'are', 'learning', 'torchtext', 'in', 'AIT', '!']

In [None]:
# for x

In [70]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    ``data_iter`` is any iterable of raw strings (here, the ``text`` column of the
    training DataFrame). One token list is yielded per input string, in order.
    """
    yield from map(tokenizer, data_iter)

specials = ['<unk>', '<pad>', '<bos>', '<eos>'] #create array of special tags for the vocab
# Build the vocabulary from the training texts only; the special tags are placed first.
vocab  = build_vocab_from_iterator(yield_tokens(train_data.text), specials = specials, special_first=True)

#set_default_index of the vocab to unknown tag, so out-of-vocabulary tokens map to <unk>
vocab.set_default_index(vocab["<unk>"])

In [71]:
assert len(vocab) == 26442
# assert len(vocab) == 6

In [None]:
vocab_dict = vocab.get_stoi()
# vocab_dict

In [73]:
from torchtext.vocab import FastText
# Pretrained FastText vectors for the 'simple' language.
fast_vectors = FastText(language='simple')

# One vector per vocabulary entry, looked up in vocabulary-index order and moved to `device`.
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [74]:
# `fast_embedding` is already built in the previous cell; repeating the lookup here was redundant.
#since the fasttext  has 300 embedding
assert fast_embedding.shape == (len(vocab), 300)

In [75]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and map each token to its vocabulary index."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a raw label value to ``int``."""
    return int(x)
## Copy from the lab. Note that Something has to be changed

In [76]:
text_pipeline("I love to play football")

[13, 185, 10, 683, 2229]

In [77]:
label_pipeline('0')

0

## To feed the pandas DataFrame to a DataLoader, we must first wrap it as a Dataset

In [78]:
from torch.utils.data import Dataset

class PD_DATASET(Dataset):
    """Thin ``Dataset`` adapter that exposes the rows of a pandas DataFrame."""

    def __init__(self, dataframe):
        # Only a reference is kept; the frame is not copied.
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataframe.index)

    def __getitem__(self, idx):
        """Return the row at integer position ``idx``."""
        row = self.dataframe.iloc[idx]
        return row

In [79]:
# Wrap each split so it can be consumed by a DataLoader.
train = PD_DATASET(train_data)
valid = PD_DATASET(valid_data)
# NOTE(review): test_data_raw still has all four of its original columns (see its shape above),
# unlike the two-column train/valid frames — confirm before batching it with collate_batch.
test =  PD_DATASET(test_data_raw)

In [80]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab['<pad>'] ##get the pad index from the vocab

def collate_batch(batch):
    """Turn a list of dataset rows into padded tensors for one mini-batch.

    Each item of ``batch`` is a row returned by ``PD_DATASET.__getitem__``: a pandas
    Series holding a ``text`` field and a ``target`` field.

    Returns:
        (labels, texts) where ``labels`` is an int64 tensor of shape (batch_size,) and
        ``texts`` is an int64 tensor of shape (batch_size, longest_text) padded with
        ``pad_idx``.
    """
    label_list, text_list = [], []
    for row in batch:
        # Read the fields by name. Positional unpacking followed the column order
        # (text, target), which handed the tweet text to label_pipeline.
        label_list.append(label_pipeline(row['target']))
        text_list.append(torch.tensor(text_pipeline(row['text']), dtype=torch.int64))
    # nn.CrossEntropyLoss expects integer class indices, hence int64 labels
    labels = torch.tensor(label_list, dtype=torch.int64)
    texts = pad_sequence(text_list, padding_value=pad_idx, batch_first=True)
    return labels, texts

In [81]:
batch_size = 64

# Shuffle only the training batches. Validation order is kept fixed so the per-batch
# averaged validation metrics are reproducible from one epoch to the next.
train_loader = DataLoader(train, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid, batch_size=batch_size,
                              shuffle=False, collate_fn=collate_batch)

In [85]:
iter(train_loader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x16a5c1b50>

In [None]:
# Pull a single batch to sanity-check the tensor shapes produced by collate_batch.
label, text = next(iter(train_loader))

print("Label shape: ", label.shape) # (batch_size, )
print("Text shape: ", text.shape)   # (batch_size, seq len)

## First, let's try a CNN

In [92]:
import torch.nn as nn
import torch.nn.functional as F

## Get the Professor's code from  the lab to build the CNN model

class CNN(nn.Module):
    """Text CNN with three parallel convolution branches, one per filter size.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        emb_dim: embedding width; also the width of every convolution kernel.
        output_dim: number of output logits.
        dropout: dropout probability applied to the concatenated pooled features.
        n_filters: number of feature maps produced by each branch.
        filter_sizes: kernel heights; the first three entries are used.
    """

    def __init__(self, input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

        # Each kernel spans the full embedding width, so it slides over tokens only.
        self.conv_0 = nn.Conv2d(1, n_filters, kernel_size=(filter_sizes[0], emb_dim))
        self.conv_1 = nn.Conv2d(1, n_filters, kernel_size=(filter_sizes[1], emb_dim))
        self.conv_2 = nn.Conv2d(1, n_filters, kernel_size=(filter_sizes[2], emb_dim))

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _conv_relu_pool(conv, embedded):
        """Run one branch: convolution -> ReLU -> max over the whole sequence."""
        # conv output: [batch_size, n_filters, seq_len - filter_size + 1, 1]
        activated = F.relu(conv(embedded).squeeze(3))
        # pooling window covers the whole remaining length -> [batch_size, n_filters]
        return F.max_pool1d(activated, activated.shape[2]).squeeze(2)

    def forward(self, text):
        """Map token ids of shape [batch_size, seq len] to logits [batch_size, output_dim]."""
        # [batch_size, seq len, emb_size] -> add a channel axis -> [batch_size, 1, seq len, emb_size]
        embedded = self.embedding(text).unsqueeze(1)

        branches = (self.conv_0, self.conv_1, self.conv_2)
        pooled = [self._conv_relu_pool(conv, embedded) for conv in branches]

        # concatenate the branch features, regularise, then classify
        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)

In [93]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    """Initialise one sub-module in place; meant to be passed to ``nn.Module.apply``.

    * ``nn.Linear``: Xavier-normal weights, zero bias.
    * ``nn.Conv1d`` / ``nn.Conv2d``: Kaiming-normal weights, zero bias.
    * Any other module type is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # Layers built with bias=False have no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.Conv1d, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [94]:
input_dim  = len(vocab)
emb_dim    =  300 
output_dim =  num_classes
dropout    = 0.5
n_filters  = 100 
filter_sizes = [3, 4, 5]

cnn_model = CNN(input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes).to(device)
cnn_model.apply(initialize_weights)  #apply initialize_weight
# Start from the FastText vectors, but give the model its own copy. Assigning the tensor
# directly makes the parameter share storage with `fast_embedding`, so the optimizer's
# in-place updates would overwrite the pretrained vectors for every later use.
cnn_model.embedding.weight.data = fast_embedding.clone()

In [95]:
batch_size = 3
seq_len    = 50 

In [96]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
# Plain SGD over every CNN parameter (embedding included).
optimizer = optim.SGD(cnn_model.parameters(), lr=lr)
# CrossEntropyLoss takes raw logits together with integer class indices.
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

In [97]:
def accuracy(preds, y):
    """Fraction of rows in ``preds`` whose highest-scoring class equals the label in ``y``.

    ``preds`` holds one row of class scores per sample; ``y`` holds the class indices.
    The result is returned as a 0-dim tensor.
    """
    _, predicted = torch.max(preds.data, 1)
    n_correct = (predicted == y).sum()
    return n_correct / len(y)

In [98]:
def train(model, loader, optimizer, criterion, loader_length):
    """Run one training epoch over ``loader`` and return ``(mean loss, mean accuracy)``.

    ``loader`` yields ``(label, text)`` batches. Both running totals are divided by
    ``loader_length`` (the number of batches), so the means are per batch.
    """
    model.train() #useful for batchnorm and dropout
    running_loss, running_acc = 0, 0

    for label, text in loader:
        label, text = label.to(device), text.to(device)  # (batch_size, ), (batch_size, seq len)

        # squeeze(1) only has an effect when the model emits a single output column
        predictions = model(text).squeeze(1)

        loss = criterion(predictions, label)
        batch_acc = accuracy(predictions, label)

        # standard backprop step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += batch_acc.item()

    return running_loss / loader_length, running_acc / loader_length

In [99]:
def evaluate(model, loader, criterion, loader_length):
    """Score ``model`` on ``loader`` without updating it; return ``(mean loss, mean accuracy)``.

    ``loader`` yields ``(label, text)`` batches. Both running totals are divided by
    ``loader_length`` (the number of batches).
    """
    model.eval()
    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for label, text in loader:
            label, text = label.to(device), text.to(device)  # (batch_size, ), (batch_size, seq len)

            predictions = model(text).squeeze(1)

            total_loss += criterion(predictions, label).item()
            total_acc += accuracy(predictions, label).item()

    return total_loss / loader_length, total_acc / loader_length

In [100]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into whole minutes and whole seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [None]:
# len(DataLoader) is already the number of batches; there is no need to tokenize and
# collate the whole dataset just to count them.
train_loader_length = len(train_loader)
val_loader_length   = len(valid_loader)

In [None]:
best_valid_loss = float('inf')
num_epochs      = 5

save_path = f'./models/{cnn_model.__class__.__name__}.pt'
# torch.save does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# per-epoch history, used by the plotting cells below
train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
    start_time = time.time()

    train_loss, train_acc = train(cnn_model, train_loader, optimizer, criterion, train_loader_length)
    valid_loss, valid_acc = evaluate(cnn_model, valid_loader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(cnn_model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
##Plot the training loss and the accuracy
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_losses, label = 'train loss')
ax.plot(valid_losses, label = 'valid loss')
plt.legend()
# One value is recorded per epoch, so the x-axis counts epochs, not optimizer updates.
ax.set_xlabel('epoch')
ax.set_ylabel('loss')

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_accs, label = 'train acc')
ax.plot(valid_accs, label = 'valid acc')
plt.legend()
# One value is recorded per epoch, so the x-axis counts epochs, not optimizer updates.
ax.set_xlabel('epoch')
ax.set_ylabel('acc')

# Let's try the LSTM model

In [None]:
# NOTE(review): this re-wraps the same frames as the earlier dataset cell. The existing
# train_loader / valid_loader are not rebuilt from these new objects.
train = PD_DATASET(train_data)
valid = PD_DATASET(valid_data)

In [None]:
import torch.nn as nn

class LSTM(nn.Module):
    """(Bi)LSTM text classifier: embedding -> LSTM -> linear layer on the final hidden state.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        emb_dim: embedding width.
        hid_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout between stacked LSTM layers (ignored for a single layer).
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, 
                 bidirectional, dropout, output_dim):
        super(LSTM, self).__init__()
        self.bidirectional = bool(bidirectional)
        num_directions = 2 if self.bidirectional else 1

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm      = nn.LSTM(
                            emb_dim,
                            hid_dim,
                            num_layers=num_layers,
                            bidirectional=self.bidirectional,
                            # inter-layer dropout is meaningless (and warns) with one layer
                            dropout = dropout if num_layers > 1 else 0.0,
                            batch_first = True
                        )
        # the classifier input width follows the number of directions
        self.fc        = nn.Linear(hid_dim * num_directions, output_dim)
    
    def forward(self, text, text_length):
        """Return logits [batch_size, output_dim].

        ``text`` is [batch_size, seq len] token ids; ``text_length`` holds the unpadded
        length of every row.
        """
        #text = [batch_size, seq len]
        embedded = self.embedding(text)
        #embedded = [batch_size, seq len, emb_dim]
        
        #pack the batch so the LSTM skips the padded positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'), 
                                                            enforce_sorted=False, batch_first=True)
        
        # only the final hidden state is needed, so the per-step outputs are not unpacked
        _, (hn, _) = self.lstm(packed_embedded)
        #hn = [num_layers * num_directions, batch_size, hid_dim]
        
        if self.bidirectional:
            # last layer: forward state is hn[-2], backward state is hn[-1]
            hn = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim = 1)
        else:
            hn = hn[-1, :, :]
        #hn = [batch_size, hid_dim * num_directions]
        
        return self.fc(hn)

In [None]:
# takes a module m as input, and it is expected to be an instance of nn.Module.
def initialize_weight(m):
    """Initialise one sub-module in place; meant to be passed to ``nn.Module.apply``.

    * ``nn.Linear``: Xavier-normal weights, zero bias.
    * ``nn.LSTM``: orthogonal weight matrices, zero biases.
    * Any other module type is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
        return

    if not isinstance(m, nn.LSTM):
        return

    # LSTM parameters are told apart by name (weight_* / bias_*).
    for param_name, tensor in m.named_parameters():
        if 'bias' in param_name:
            nn.init.zeros_(tensor)
        elif 'weight' in param_name:
            nn.init.orthogonal_(tensor)

In [None]:
input_dim  = len(vocab)
emb_dim    = 300
hidden_dim = 256
output_dim = num_classes
dropout    = 0.5
num_layers = 2
bidirectional = True 

# Pass every argument by keyword. The constructor order is
# (input_dim, emb_dim, hid_dim, num_layers, bidirectional, dropout, output_dim), so the
# previous positional call put output_dim into num_layers, and so on down the list.
lstm_model = LSTM(
    input_dim=input_dim,
    emb_dim=emb_dim,
    hid_dim=hidden_dim,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    output_dim=output_dim,
).to(device)
# initialize_weight (singular) is the initialiser that handles nn.LSTM parameters
lstm_model.apply(initialize_weight)
# Start from the FastText vectors, but give the model its own copy so training does not
# modify `fast_embedding` or share weights with another model.
lstm_model.embedding.weight.data = fast_embedding.clone()

In [None]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
# Adam over every LSTM parameter. This rebinds the `optimizer` / `criterion` names
# that the CNN section used.
optimizer = optim.Adam(lstm_model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() # takes raw logits and integer class indices

In [None]:
def train(model, loader, optimizer, criterion, loader_length):
    """Run one training epoch of a length-aware model; return ``(mean loss, mean accuracy)``.

    ``loader`` may yield ``(label, text)`` or ``(label, text, text_length)`` batches.
    When the lengths are absent they are recovered by counting the non-``pad_idx``
    tokens of each row. Both running totals are divided by ``loader_length``.
    """
    epoch_loss = 0
    epoch_acc  = 0
    model.train()
    
    for batch in loader:
        label, text, *extra = batch
        label = label.to(device)
        text  = text.to(device)
        
        if extra:
            text_length = extra[0]
        else:
            # clamp to 1: pack_padded_sequence rejects zero-length sequences
            text_length = (text != pad_idx).sum(dim=1).clamp(min=1)
        
        predictions = model(text, text_length).squeeze(1)
        
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc  += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [None]:
def evaluate(model, loader, criterion, loader_length):
    """Score a length-aware model without updating it; return ``(mean loss, mean accuracy)``.

    ``loader`` may yield ``(label, text)`` or ``(label, text, text_length)`` batches.
    When the lengths are absent they are recovered by counting the non-``pad_idx``
    tokens of each row. Both running totals are divided by ``loader_length``.
    """
    epoch_loss = 0
    epoch_acc  = 0
    model.eval()
    
    with torch.no_grad():
        for batch in loader:
            label, text, *extra = batch
            label = label.to(device)
            text  = text.to(device)
            
            if extra:
                text_length = extra[0]
            else:
                # clamp to 1: pack_padded_sequence rejects zero-length sequences
                text_length = (text != pad_idx).sum(dim=1).clamp(min=1)
            
            predictions = model(text, text_length).squeeze(1)
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)
            
            epoch_loss += loss.item()
            epoch_acc  += acc.item()
        
    # average loss and accuracy per batch over the whole loader
    return epoch_loss / loader_length, epoch_acc / loader_length

In [None]:
# len(DataLoader) is already the number of batches; there is no need to tokenize and
# collate the whole dataset just to count them.
train_loader_length = len(train_loader)
val_loader_length   = len(valid_loader)

In [None]:
def epoch_time(start_time, end_time):
    """Return ``(whole minutes, leftover whole seconds)`` between two timestamps in seconds."""
    # NOTE: same helper as the one defined in the CNN section; this definition shadows it.
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    return whole_minutes, int(duration - (whole_minutes * 60))

In [None]:
best_valid_loss = float('inf')
num_epochs   = 5

save_path = f'./models/lstm_{lstm_model.__class__.__name__}.pt'
# torch.save does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# per-epoch history, used by the plotting cells below
train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
    start_time = time.time()
    
    train_loss, train_acc = train(lstm_model, train_loader, optimizer, criterion, train_loader_length)
    # Validation must only score the model. Calling train() here fitted the model on
    # the validation set and made the reported validation metrics meaningless.
    valid_loss, valid_acc = evaluate(lstm_model, valid_loader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # checkpoint whenever the validation loss improves (training itself is not stopped early)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(lstm_model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal.  Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')

In [None]:
##Plot the losses and accuracy over all epochs
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_losses, label = 'train loss')
ax.plot(valid_losses, label = 'valid loss')
plt.legend()
# One value is recorded per epoch, so the x-axis counts epochs, not optimizer updates.
ax.set_xlabel('epoch')
ax.set_ylabel('loss')

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_accs, label = 'train acc')
ax.plot(valid_accs, label = 'valid acc')
plt.legend()
# One value is recorded per epoch, so the x-axis counts epochs, not optimizer updates.
ax.set_xlabel('epoch')
ax.set_ylabel('acc')

## Conclusion
1. Compare the two models on their training time and accuracy. Which one do you think did better on the disaster classification task, and why?

- Answer :

2. How do you think we could get better classification results on this dataset?

- Answer :

