# Machine Learning HW.4

# Recurrent Neural Network - Word2Vec

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimz
from torch.utils.data import Dataset,DataLoader
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
def load_training_data(path):
    if 'training_label' in path:
        with open(path, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split() for line in lines]
        X = [line[2:] for line in lines]
        Y = [int(line[0]) for line in lines]
        return X,Y
    else:
        with open(path, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split() for line in lines]
        X = lines
        return X

def load_testing_data(path):
    with open(path, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
        seq = [''.join(line.strip('\n').split(',')[1:]).strip() for line in lines]
    X = [line.split() for line in seq]
    return X

In [3]:
class data_processing():
    '''
    Data processing class,build the embedding vocab and turn the input sequence to the index in vocab.
    '''
    def __init__(self, embed_dim = 100, seq_len = 32):
        self.embed_dim = embed_dim
        self.idx = 0
        self.seq_len = seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.embedding_matrix = []

    # train the word2vec model
    def train_word2vec(self, data, window = 5):
        self.word2vec_model = word2vec.Word2Vec(data, size = self.embed_dim, window = window, iter = 10)
    
    # add <PAD> and <UNK> to embedding matrix
    def add_embedding(self, word):
        self.word2idx[word] = self.idx
        self.idx2word[self.idx] = word
        vector = torch.empty(1, self.embed_dim)
        torch.nn.init.uniform_(vector)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
        self.idx += 1

    # bulid the embedding matrix for RNN
    def build_embedding_matrix(self):
        embedding_matrix = []
        for _,word in enumerate(self.word2vec_model.wv.vocab):
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            embedding_matrix.append(self.word2vec_model[word])
        self.embedding_matrix = torch.tensor(embedding_matrix)
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        return self.embedding_matrix

    # pad the input sequence to max length
    def pad_sequence(self, sequence):
        if len(sequence) >= self.seq_len:
            return sequence[:self.seq_len]
        else:
            for _ in range(self.seq_len - len(sequence)):
                sequence.append('<PAD>')
            return sequence

    # turn the input sequence to the index of vocab
    def seq_to_index(self, sequence):
        sequence1 = self.pad_sequence(sequence)
        seq = [self.word2idx[word] if word in self.word2idx else self.word2idx['<UNK>'] for word in sequence1]
        return seq

In [4]:
class TextDataset():

    def __init__(self, process, X, y = None):
        self.X, self.Y = [],[]
        for item in X:
            self.X.append(np.array(process.seq_to_index(item)))
        if y is not None:
            self.Y = torch.LongTensor(y)
            assert len(self.X) == len(self.Y)
        else:
            self.Y = y       
        
    def __getitem__(self, index):
        if self.Y is not None:
            return self.X[index],self.Y[index]
        else:
            return self.X[index]
    
    def __len__(self):
        return len(self.X)

In [5]:
class RNN(nn.Module):

    def __init__(self, embed_matrix, embed_grade = False, hidden_dim = 128, num_layers = 1, bidirectional = False, dropout = 0.5):
        super(RNN, self).__init__()
        # 制作 embeddin layer, 并设置是否更改词向量参数
        # self.embed = nn.Embedding(embed_matrix.size(0),embed_matrix.size(1))
        # self.embed.weight = nn.Parameter(torch.tensor(embed_matrix))
        # self.embed.weight.requires_grad = embed_grade
        # self.embed_dim = embed_matrix.shape[1]

        self.embed = nn.Embedding.from_pretrained(embed_matrix)
        self.embed_dim = embed_matrix.shape[1]
        
        self.LSTM = nn.LSTM(self.embed_dim, hidden_dim, num_layers = num_layers, batch_first = True, bidirectional = bidirectional)
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 2 * hidden_dim),
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.Linear(hidden_dim, 2),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        batch_size = x.shape[0]
        tokens = self.embed(torch.tensor(x))
        _,(ht,_) = self.LSTM(tokens)
        out = self.fc(ht)
        return out.reshape(batch_size,2)

In [6]:
def evaluation(pre, label):
    acc = np.sum(np.argmax(pre.cpu().data.numpy(), axis = 1) == label.numpy())
    return acc

In [12]:
def train(model, train, valid, batch_size, epoch_num, lr, device):
    print('Begin to train.')
    optimizer = optimz.Adam(model.parameters(), lr = lr)
    criterion = nn.CrossEntropyLoss()
    best_acc = 0
    train_len,val_len = len(train) * batch_size, len(valid)
    for epoch in range(epoch_num):
        # train
        model.train()
        train_acc, train_loss = 0,0
        for idx, (data, label) in enumerate(train):
            train_pre = model(data.to(device))
            loss = criterion(train_pre, label.to(device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += evaluation(train_pre, label)
        # validation
        model.eval()
        with torch.no_grad():
            val_acc, val_loss = 0,0
            for idx, (data, label) in enumerate(valid):
                val_pre = model(data.to(device))
                loss = criterion(val_pre, label.to(device))
                val_loss += loss.item()
                val_acc += evaluation(val_pre, label)

        print('Epoch[%02d|%02d] | Train loss is : %.5f | Train acc is : %.3f | Valid loss is : %.5f | Valid acc is : %.3f '% \
            (epoch+1, epoch_num,train_loss / train_len , train_acc / train_len, val_loss / val_len, val_acc / val_len / batch_size))

        if val_acc/val_len > best_acc:
            best_acc = val_acc/val_len
            print('***** The best accuracy in validation set is %.3f'%(val_acc / val_len / batch_size))

In [15]:
def biu():
    file_path = 'ml2020spring-hw4'
    label_path = 'training_label.txt'
    nolabel_path = 'training_nolabel.txt'
    test_path = 'testing_data.txt'

    train_label_data, train_label = load_training_data(os.path.join(file_path, label_path))
    train_nolabel_data = load_training_data(os.path.join(file_path, nolabel_path))
    test_data = load_testing_data(os.path.join(file_path, test_path))

    data = train_label_data + train_nolabel_data + test_data
    print('Data load successfully.')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    lr = 1e-6
    batch_size = 32
    max_seq_len = 32
    embed_dim = 100
    epoch_num = 10
    print(f'Super parameters set successfully, device is {device}.')

    data_pro = data_processing(embed_dim = embed_dim, seq_len = max_seq_len)
    data_pro.train_word2vec(data)
    embedding = data_pro.build_embedding_matrix()
    print('Embedding matrix build successfully.')

    train_X, val_X, train_y, val_y = train_test_split(train_label_data, train_label, train_size = 0.8)
    train_set = TextDataset(data_pro, train_X, train_y)
    valid_set = TextDataset(data_pro, val_X, val_y)
    train_loader = DataLoader(train_set, batch_size = batch_size, shuffle = True)
    valid_loader = DataLoader(valid_set, batch_size = batch_size, shuffle = True)
    print('Data Loader build successfully.')

    model = RNN(embedding).to(device)
    print('RNN model initialize successfully.')
    
    train(model, train_loader, valid_loader, batch_size, epoch_num, lr, device)

In [16]:
biu()

Data load successfully.
Super parameters set successfully, device is cpu.




Embedding matrix build successfully.
Data Loader build successfully.
RNN model initialize successfully.
Begin to train.




Epoch[01|10] | Train loss is : 0.02163 | Train acc is : 0.513 | Valid loss is : 0.68928 | Valid acc is : 0.532 
***** The best accuracy in validation set is 0.532
Epoch[02|10] | Train loss is : 0.01997 | Train acc is : 0.639 | Valid loss is : 0.60088 | Valid acc is : 0.700 
***** The best accuracy in validation set is 0.700
Epoch[03|10] | Train loss is : 0.01803 | Train acc is : 0.730 | Valid loss is : 0.55895 | Valid acc is : 0.751 
***** The best accuracy in validation set is 0.751
Epoch[04|10] | Train loss is : 0.01722 | Train acc is : 0.756 | Valid loss is : 0.54051 | Valid acc is : 0.763 
***** The best accuracy in validation set is 0.763
Epoch[05|10] | Train loss is : 0.01671 | Train acc is : 0.768 | Valid loss is : 0.52972 | Valid acc is : 0.773 
***** The best accuracy in validation set is 0.773
Epoch[06|10] | Train loss is : 0.01640 | Train acc is : 0.778 | Valid loss is : 0.52292 | Valid acc is : 0.779 
***** The best accuracy in validation set is 0.779
Epoch[07|10] | Train l

Epoch[01|10] | Train loss is : 0.02145 | Train acc is : 0.549 | Valid loss is : 0.66974 | Valid acc is : 0.601 
***** The best accuracy in validation set is 0.601
Epoch[02|10] | Train loss is : 0.01922 | Train acc is : 0.676 | Valid loss is : 0.58231 | Valid acc is : 0.716 
***** The best accuracy in validation set is 0.716
Epoch[03|10] | Train loss is : 0.01802 | Train acc is : 0.728 | Valid loss is : 0.56958 | Valid acc is : 0.739 
***** The best accuracy in validation set is 0.739
Epoch[04|10] | Train loss is : 0.01778 | Train acc is : 0.740 | Valid loss is : 0.56905 | Valid acc is : 0.740 
***** The best accuracy in validation set is 0.740
Epoch[05|10] | Train loss is : 0.01750 | Train acc is : 0.749 | Valid loss is : 0.55735 | Valid acc is : 0.751 
***** The best accuracy in validation set is 0.751
Epoch[06|10] | Train loss is : 0.01740 | Train acc is : 0.751 | Valid loss is : 0.55570 | Valid acc is : 0.753 
***** The best accuracy in validation set is 0.753
Epoch[07|10] | Train loss is : 0.01726 | Train acc is : 0.756 | Valid loss is : 0.55236 | Valid acc is : 0.756 
***** The best accuracy in validation set is 0.756
Epoch[08|10] | Train loss is : 0.01714 | Train acc is : 0.760 | Valid loss is : 0.55184 | Valid acc is : 0.757 
***** The best accuracy in validation set is 0.757
Epoch[09|10] | Train loss is : 0.01704 | Train acc is : 0.762 | Valid loss is : 0.54674 | Valid acc is : 0.760 
***** The best accuracy in validation set is 0.760
Epoch[10|10] | Train loss is : 0.01692 | Train acc is : 0.764 | Valid loss is : 0.54165 | Valid acc is : 0.762 
***** The best accuracy in validation set is 0.762