# Machine Learning HW.4

# Recurrent Neural Network

## data processing

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split

In [2]:
file_path = './ml2020spring-hw4'
data_path = ['training_label.txt', 'training_nolabel.txt', 'testing_data.txt']

In [3]:
def read_file(path_list):
    data_length = 1
    data = []
    for path in path_list:
        with open(path, 'r',encoding = 'utf-8',newline = '\n',errors = 'ignore') as f:
            for line in f:
                if '_label' in path:
                    text = line.split(' +++$+++ ')[1]
                elif 'test' in path:
                    text = line.split(',')[1]
                else:
                    text = line
                data.append(text)
                data_length += 1
    return data,data_length      

In [4]:
path_list = [os.path.join(file_path,item) for item in data_path]
data,data_length = read_file(path_list)
data_length

1578616

In [5]:
def word_to_index(data):
    word2idx = {}
    idx2word = {}
    idx = 1
    for item in data:
        for word in item.replace('\n','').split():
            if word not in word2idx:
                word2idx[word] = idx
                idx2word[idx] = word
                idx += 1
    return word2idx,idx2word,idx

def word_to_vector(path):
    word2vec = {}
    with open(path, 'r', encoding = 'utf-8', errors = 'ignore') as f:
        for line in f:
            tokens = line.split()
            word2vec[tokens[0]] = np.asarray(tokens[1:], dtype = 'float32')
    return word2vec

def seq_to_tokens(text, word2idx, max_len):
    unknown = idx
    text = text.replace('\n','').split()
    sequence = [word2idx[word] if word in word2idx else unknown for word in text]
    sequence = sequence[:max_len]
    x = (np.ones(max_len) * 0).astype('int64')
    x[:len(sequence)] = sequence
    return x
    
def building_embedding_matrix(path,word2idx,dim):
    word2vec = word_to_vector(path)
    embedding_matrix = np.empty((len(word2idx)+2, dim))
    for word in word2idx:
        if word in word2vec:
            vec = word2vec.get(word)
            if vec is not None:
                embedding_matrix[word2idx.get(word)] = vec
    return embedding_matrix

In [6]:
word2idx, idx2word,idx = word_to_index(data)
assert len(word2idx)+1 == idx

In [7]:
word2vec = word_to_vector('glove.6B.100d.txt')
len(word2vec)

400000

In [9]:
embedding_matrix = building_embedding_matrix('glove.6B.100d.txt',word2idx,100)
embedding_matrix.shape,idx

((255800, 100), 255799)

In [8]:
class TextDataSet():

    def __init__(self, path, word2idx, max_len = 128, data_type = 'train'):
        all_data = []
        with open(path, 'r', encoding = 'utf-8', errors = 'ignore') as f:
            for lines in f:
                if data_type == 'train':
                    tokens = lines.replace('\n','').split(' +++$+++ ')
                    label = int(tokens[0])
                else:
                    tokens = lines.replace('\n','').split(',')
                    label = 0
                seq = seq_to_tokens(tokens[1], word2idx, max_len)
                data = {
                    'data':seq,
                    'label':label,
                }
                all_data.append(data)
        self.data = all_data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        return self.data[index]

In [10]:
text = TextDataSet(path_list[0], word2idx)
print(type(text))

<class '__main__.TextDataSet'>


In [12]:
train_loader = DataLoader(text, batch_size = 64, shuffle = True, )
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [18]:
def squeeze_embedding(x,x_len):
    x_sort = torch.sort(-x_len)[1].long()
    x_unsort = torch.sort(x_sort)[1].long()
    x_len = x_len[x_sort]
    x = x[x_sort]

    x_pack_embed = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first = True)
    return x_pack_embed,x_unsort

In [19]:
class RNN(nn.Module):
    
    def __init__(self,embed_dim,hidden_size,out_dim,embed_matrix):
        super(RNN,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype = torch.float))
        self.rnn = nn.LSTM(embed_dim, hidden_size, batch_first = True,dropout = 0)
        self.func = nn.Sequential(
            nn.Linear(hidden_size,hidden_size*2),
            nn.ReLU(),
            nn.Linear(hidden_size*2,out_dim),
        )

    def forward(self,x):
        tokens = self.embedding(x)
        x_len = torch.sum(x != 0, dim = -1)
        x_pack,x_unsort = squeeze_embedding(tokens,x_len)
        out,(h_t,c_t) = self.rnn(x_pack)

        h_t = torch.transpose(h_t, 0, 1)[x_unsort]
        h_t = torch.transpose(h_t, 0, 1)
        out = self.func(h_t)
        return out.reshape((64,2))

    def squeeze_embedding(self,x):
        x_len = np.sum(x != 0)
        return None

In [20]:
model = RNN(100,256,2,embedding_matrix)
criditions = nn.CrossEntropyLoss()
optimzer = optim.Adam(model.parameters(), lr = 1e-4)
train_loss = 0
train_acc = 0
epoch_num = 10
for epoch in range(epoch_num):
    for idx,data in enumerate(train_loader):
        model.train()
        pre_out = model(data['data'])
        train_acc += np.sum(np.argmax(pre_out.data.numpy(), axis = 1) == data['label'].numpy())
        loss = criditions(pre_out,data['label'])
        train_loss += loss.item()
        loss.backward()
        optimzer.step()

    print('Epoch [%03d : %03d] | Train accuracy is %.5f | Train loss is %.5f' % \
            (epoch+1, epoch_num, train_acc/text.__len__(), train_loss/text.__len__() * 64))
    train_acc, train_loss = 0,0

Epoch [001 : 005] | Train accuracy is 0.64381 | Train loss is 0.00968
Epoch [002 : 005] | Train accuracy is 0.73054 | Train loss is 0.00835
Epoch [003 : 005] | Train accuracy is 0.75019 | Train loss is 0.00795
Epoch [004 : 005] | Train accuracy is 0.76012 | Train loss is 0.00773
Epoch [005 : 005] | Train accuracy is 0.76599 | Train loss is 0.00760
