# Machine Learning HW.4

## Recurrent Neural Network - Glove

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimz
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
def load_training_data(path):
    if 'training_label' in path:
        with open(path, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split() for line in lines]
        X = [line[2:] for line in lines]
        Y = [int(line[0]) for line in lines]
        return X,Y
    else:
        with open(path, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split() for line in lines]
        X = lines
        return X

def load_testing_data(path):
    with open(path, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
        seq = [''.join(line.strip('\n').split(',')[1:]).strip() for line in lines]
    X = [line.split() for line in seq]
    return X     

In [3]:
class data_processing():
    
    def __init__(self, embed_dim = 100, seq_len = 128):
        self.idx = 1
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        self.word2idx = {}
        self.word2vec = {}
        self.embedding_matrix = []
        
    # load the pre-trained Glove embedding vocab
    def load_embedding_vocab(self, path):
        with open(path, 'r', encoding = 'utf-8', errors = 'ignore') as f:
            for line in f:
                tokens = line.split()
                self.word2vec[tokens[0]] = np.asarray(tokens[1:], dtype = 'float32')
                
    # add <PAD> - index 0 and <UNK> - index len+1
    def add_embedding(self):
        self.word2idx['<PAD>'] = 0
        vector = torch.empty(1, self.embed_dim)
        self.embedding_matrix = torch.cat([vector, self.embedding_matrix], 0)
        self.word2idx['<UNK>'] = self.idx
        self.idx += 1
        vector = torch.empty(1, self.embed_dim)
        torch.nn.init.uniform_(vector)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    
    # bulid the embedding matrix for RNN
    def build_embedding_matrix(self, path):
        self.load_embedding_vocab(path)
        for word in self.word2vec:
            self.word2idx[word] = self.idx
            self.embedding_matrix.append(self.word2vec[word])
            self.idx += 1
        self.embedding_matrix = torch.tensor(np.asarray(self.embedding_matrix), dtype = torch.float)
        self.add_embedding()
        assert self.embedding_matrix.shape[0] == self.idx 
        return self.embedding_matrix
    
    # turn the input sequence to the index of vocab and pad the input sequence to the max length
    def seq_to_index(self, sequence):
        seq = [self.word2idx[word] if word in self.word2idx else self.word2idx['<UNK>'] for word in sequence]
        seq = seq[:self.seq_len]
        x = (np.ones(self.seq_len) * 0).astype('int64')
        x[:len(seq)] = seq
        return x     

In [4]:
class TextDataset():

    def __init__(self, process, X, y = None):
        self.X, self.Y = [],[]
        for item in X:
            self.X.append(process.seq_to_index(item))
        if y is not None:
            self.Y = torch.LongTensor(y)
            assert len(self.X) == len(self.Y)
        else:
            self.Y = y 
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self,index):
        return self.X[index],self.Y[index] if self.Y is not None else self.X[index]

In [5]:
class RNN(nn.Module):
    
    def __init__(self,embed_matrix, embed_grade = False, hidden_dim = 128, num_layers = 1, bidirectional = False, dropout = 0.5):
        super(RNN,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embed_matrix)
        self.embed_dim = embed_matrix.shape[1]
        self.rnn = nn.LSTM(self.embed_dim, hidden_dim, num_layers = num_layers, batch_first = True, bidirectional = bidirectional)
        self.func = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim,2 * hidden_dim),
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.Linear(hidden_dim, 2),
            nn.Sigmoid()
        )

    def forward(self,x):
        tokens = self.embedding(x)
        x_len = torch.sum(x != 0, dim = -1)
        x_pack,x_unsort = self.squeeze_embedding(tokens,x_len)
        out,(h_t,c_t) = self.rnn(x_pack)

        h_t = torch.transpose(h_t, 0, 1)[x_unsort]
        h_t = torch.transpose(h_t, 0, 1)
        out = self.func(h_t)
        return out.reshape((h_t.shape[1],2))

    def squeeze_embedding(self,tokens,x_len):
        x_sort = torch.sort(-x_len)[1].long()
        x_unsort = torch.sort(x_sort)[1].long()
        x_len = x_len[x_sort]
        tokens = tokens[x_sort]
        tokens_pack = nn.utils.rnn.pack_padded_sequence(tokens, x_len, batch_first = True)
        return tokens_pack, x_unsort

In [6]:
def evaluation(pre, label):
    acc = np.sum(np.argmax(pre.cpu().data.numpy(), axis = 1) == label.numpy())
    return acc

In [7]:
def train(model, train, valid, batch_size, epoch_num, lr, device):
    print('Begin to train.')
    optimizer = optimz.Adam(model.parameters(), lr = lr)
    criterion = nn.CrossEntropyLoss()
    best_acc = 0
    train_len,val_len = len(train) * batch_size, len(valid)
    for epoch in range(epoch_num):
        # train
        model.train()
        train_acc, train_loss = 0,0
        for idx, (data, label) in enumerate(train):
            train_pre = model(data.to(device))
            loss = criterion(train_pre, label.to(device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += evaluation(train_pre, label)
        # validation
        model.eval()
        val_acc, val_loss = 0,0
        with torch.no_grad():
            for idx, (data, label) in enumerate(valid):
                val_pre = model(data.to(device))
                loss = criterion(val_pre, label.to(device))
                val_loss += loss.item()
                val_acc += evaluation(val_pre, label)

        print('Epoch[%02d|%02d] | Train loss is : %.5f | Train acc is : %.3f | Valid loss is : %.5f | Valid acc is : %.3f '% \
            (epoch+1, epoch_num,train_loss / train_len , train_acc / train_len, val_loss / val_len, val_acc / val_len / batch_size))

        if val_acc/val_len > best_acc:
            best_acc = val_acc/val_len
            print('***** The best accuracy in validation set is %.3f'%(val_acc / val_len / batch_size))

In [8]:
def biu():
    file_path = 'ml2020spring-hw4'
    label_path = 'training_label.txt'
    nolabel_path = 'training_nolabel.txt'
    test_path = 'testing_data.txt'

    train_label_data, train_label = load_training_data(os.path.join(file_path, label_path))
    train_nolabel_data = load_training_data(os.path.join(file_path, nolabel_path))
    test_data = load_testing_data(os.path.join(file_path, test_path))

    data = train_label_data + train_nolabel_data + test_data
    print('Data load successfully.')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    lr = 1e-6
    batch_size = 32
    max_seq_len = 32
    embed_dim = 100
    epoch_num = 50
    print(f'Super parameters set successfully, device is {device}.')

    data_pro = data_processing(embed_dim = embed_dim, seq_len = max_seq_len)
    embedding = data_pro.build_embedding_matrix('glove.6B.100d.txt')
    print('Embedding matrix build successfully.')

    train_X, val_X, train_y, val_y = train_test_split(train_label_data, train_label, train_size = 0.8)
    train_set = TextDataset(data_pro, train_X, train_y)
    valid_set = TextDataset(data_pro, val_X, val_y)
    train_loader = DataLoader(train_set, batch_size = batch_size, shuffle = True)
    valid_loader = DataLoader(valid_set, batch_size = batch_size, shuffle = True)
    print('Data Loader build successfully.')

    model = RNN(embedding).to(device)
    print('RNN model initialize successfully.')
    
    train(model, train_loader, valid_loader, batch_size, epoch_num, lr, device)

In [None]:
biu()

Data load successfully.
Super parameters set successfully, device is cpu.
Embedding matrix build successfully.




Data Loader build successfully.
RNN model initialize successfully.
Begin to train.
Epoch[01|50] | Train loss is : 0.02136 | Train acc is : 0.577 | Valid loss is : 0.66359 | Valid acc is : 0.616 
***** The best accuracy in validation set is 0.616
Epoch[02|50] | Train loss is : 0.01980 | Train acc is : 0.652 | Valid loss is : 0.61350 | Valid acc is : 0.679 
***** The best accuracy in validation set is 0.679
Epoch[03|50] | Train loss is : 0.01882 | Train acc is : 0.690 | Valid loss is : 0.59438 | Valid acc is : 0.699 
***** The best accuracy in validation set is 0.699
Epoch[04|50] | Train loss is : 0.01846 | Train acc is : 0.704 | Valid loss is : 0.58464 | Valid acc is : 0.711 
***** The best accuracy in validation set is 0.711
Epoch[05|50] | Train loss is : 0.01824 | Train acc is : 0.715 | Valid loss is : 0.58069 | Valid acc is : 0.716 
***** The best accuracy in validation set is 0.716
Epoch[06|50] | Train loss is : 0.01807 | Train acc is : 0.721 | Valid loss is : 0.57563 | Valid acc is

## 