# DSC 275/475: Time Series Analysis and Forecasting (Fall 2019) 
## Project 2.2 – Sequence Classification with Recurrent Neural Networks 
### Chunlei Zhou

#### Develop Helper Functions

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn
import random
from torch.nn.utils.rnn import pad_packed_sequence
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from torch import LongTensor
from torch.nn import Embedding
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

In [2]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [3]:
def findFiles(path):
    return glob.glob(path)

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters)

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Turn a line into a <line_length x 1 x n_letters>, or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

# Interpret output
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

# Get a training example (a name and its language):
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    loss.backward()
    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
    return output, loss.item()

# Just return an output given a line
def evaluate(line_tensor):
    hidden = rnn.initHidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output

# Construct a Data Frame
def data_frame(category_lines):
    data_frame = [[],[]]
    for key in all_categories:
        for value in category_lines[key]:
            data_frame[0].append(value)
            data_frame[1].append(key)
    return data_frame

# 1. Systematic processing of data

In [4]:
criterion = nn.NLLLoss()
learning_rate = 0.005
n_iters = 100000
n_hiddens = [32, 64, 128]

In [5]:
DF = data_frame(category_lines)
total_samples = len(DF[0])
randomize_order = np.arange(0, total_samples)

## 1.1 Effect of hidden state length: 

In [6]:
print('======= 1.1 Accuracy Report =======')
for n_hidden in n_hiddens:
    rnn = RNN(n_letters, n_hidden, n_categories)
    current_loss = 0
    all_losses = []
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = randomTrainingExample()
        output, loss = train(category_tensor, line_tensor)
        current_loss += loss
    '''confusion = torch.zeros(n_categories, n_categories)
    n_confusion = 20000
    for i in range(n_confusion):
        category, line, category_tensor, line_tensor = randomTrainingExample()
        output = evaluate(line_tensor)
        guess, guess_i = categoryFromOutput(output)
        category_i = all_categories.index(category)
        confusion[category_i][guess_i] += 1
    accuracy = sum(confusion.diag()) / sum(sum(confusion))
    print('n_hidden=', str(n_hidden)+':', 'Accuracy is %f' % accuracy.item())'''
    confusion = torch.zeros(n_categories, n_categories)
    n_confusion = total_samples
    for i in range(n_confusion):
        category = DF[1][i]
        line = DF[0][i]
        category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
        line_tensor = lineToTensor(line)
        output = evaluate(line_tensor)
        guess, guess_i = categoryFromOutput(output)
        category_i = all_categories.index(category)
        confusion[category_i][guess_i] += 1
    accuracy = sum(confusion.diag()) / sum(sum(confusion))
    print('n_hidden =', str(n_hidden)+':', 'Accuracy is %f' % accuracy.item())



	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  C:\cb\pytorch_1000000000000\work\torch\csrc\utils\python_arg_parser.cpp:1174.)
  p.data.add_(-learning_rate, p.grad.data)


n_hidden = 32: Accuracy is 0.546378


KeyboardInterrupt: 

## 1.2	Effect of systematic training:

In [None]:
n_epoch = 5
print('======= 1.2 Accuracy Report =======')
for n_hidden in n_hiddens:
    rnn = RNN(n_letters, n_hidden, n_categories)
    current_loss = 0
    all_losses = []
    for epoch in range(n_epoch):
        np.random.shuffle(randomize_order)
        for order in randomize_order:
            category = DF[1][order]
            line = DF[0][order]
            category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
            line_tensor = lineToTensor(line)
            output, loss = train(category_tensor, line_tensor)
            current_loss += loss
    confusion = torch.zeros(n_categories, n_categories)
    n_confusion = total_samples
    for i in range(n_confusion):
        category = DF[1][i]
        line = DF[0][i]
        category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
        line_tensor = lineToTensor(line)
        output = evaluate(line_tensor)
        guess, guess_i = categoryFromOutput(output)
        category_i = all_categories.index(category)
        confusion[category_i][guess_i] += 1
    accuracy = sum(confusion.diag()) / sum(sum(confusion))
    print('n_hidden =', str(n_hidden)+':', 'Accuracy is %f' % accuracy.item())

# 2. Batch training of data
## 2.1 batch size equals to the number of total samples

In [7]:
class RNN_Batch(nn.Module):
    def __init__(self):
        super(RNN_Batch, self).__init__()
        self.rnn = nn.RNN(
            input_size=n_letters,
            hidden_size=128,  # number of hidden units
            num_layers=1,  # number of layers
            batch_first=True,
        )
        self.out = nn.Linear(128, n_categories)

    def forward(self, x):
        r_out, h = self.rnn(x, None)
        out = self.out(r_out[:,-1,:])
        return out

In [8]:
feature = sorted(set(all_letters))
vectorized_seqs = [[feature.index(tok) for tok in seq]for seq in DF[0]]
print(DF[0][:5])
print(vectorized_seqs[:5])
embed = Embedding(len(feature), n_letters)
seq_lengths = LongTensor(list(map(len, vectorized_seqs)))
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[idx, :seqlen] = LongTensor(seq)
print(seq_tensor)
seq_tensor.shape

['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari']
[[15, 38, 45, 51, 48, 55], [18, 31, 38, 31, 49], [8, 31, 38, 35, 48], [11, 35, 48, 37, 35, 49], [18, 31, 56, 31, 48, 39]]
tensor([[15, 38, 45,  ...,  0,  0,  0],
        [18, 31, 38,  ...,  0,  0,  0],
        [ 8, 31, 38,  ...,  0,  0,  0],
        ...,
        [26, 39, 44,  ...,  0,  0,  0],
        [26, 51, 45,  ...,  0,  0,  0],
        [26, 51, 51,  ...,  0,  0,  0]])


torch.Size([20074, 19])

In [9]:
embedded_seq_tensor = embed(seq_tensor)
print(embedded_seq_tensor)
embedded_seq_tensor.shape

tensor([[[-1.6483, -0.4694,  0.7653,  ...,  2.6645,  0.6022, -0.7363],
         [ 1.4943, -0.0927, -1.8004,  ..., -1.8969,  0.3661, -1.3245],
         [-0.2795, -1.0266,  0.9308,  ..., -1.6363, -1.1754,  0.3967],
         ...,
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599],
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599],
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599]],

        [[-1.5476, -1.2363, -0.4618,  ...,  0.6502, -0.7271, -0.7366],
         [-0.4926,  0.2787,  1.1983,  ..., -0.4858, -0.3580, -0.9359],
         [ 1.4943, -0.0927, -1.8004,  ..., -1.8969,  0.3661, -1.3245],
         ...,
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599],
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599],
         [ 0.7038,  1.4780,  1.9921,  ..., -0.5935, -0.7788, -1.1599]],

        [[-0.3444,  0.7704, -0.1083,  ..., -0.0875,  1.5361, -0.8697],
         [-0.4926,  0.2787,  1.1983,  ..., -0

torch.Size([20074, 19, 57])

In [10]:
target = sorted(set(all_categories))
vectorized_y = [target.index(tok) for tok in DF[1]]
print(DF[1][:5])
print(vectorized_y[:5])
target_tensor = torch.tensor(vectorized_y,dtype = torch.long)
target_tensor.shape

['Arabic', 'Arabic', 'Arabic', 'Arabic', 'Arabic']
[0, 0, 0, 0, 0]


torch.Size([20074])

In [11]:
n_hidden = 128
batch_size = total_samples
print('======= 2.1 Accuracy Report =======')
print('Batch Size:', batch_size)

Batch Size: 20074


In [12]:
n_epoch = 5
rnn = RNN_Batch()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

In [13]:
batch = [embedded_seq_tensor,target_tensor]

In [14]:
x = batch[0]
y = batch[1]
n_epoch = 5
for epoch in range(n_epoch): 
    optimizer.zero_grad()
    output = rnn(x)
    loss = loss_func(output, y) 
    loss.backward(retain_graph=True)
    optimizer.step()
    pred = torch.max(output, 1)[1]
    accuracy = accuracy_score(y, pred)
    print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %f' % accuracy)

Epoch:  0 | train loss: 2.8896 | test accuracy: 0.011607
Epoch:  1 | train loss: 2.1124 | test accuracy: 0.468616
Epoch:  2 | train loss: 1.9845 | test accuracy: 0.468616
Epoch:  3 | train loss: 1.8752 | test accuracy: 0.468616
Epoch:  4 | train loss: 1.9038 | test accuracy: 0.468616


## 2.2 batch size = 1000, 2000, 5000 respectively

In [None]:
print('======= 2.2 Accuracy Report =======')
batch_sizes = [1000, 2000, 5000]
N = 20000
for batch_size in batch_sizes:
    print('Batch Size =', batch_size)
    rnn = RNN_Batch()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    n_epoch = int(N/batch_size)
    for epoch in range(n_epoch):
        samples = random.sample(list(randomize_order), 20000)
        sample_index = [samples[i * batch_size:(i + 1) * batch_size] for i in range((len(samples) + batch_size - 1) // batch_size )]
        batch_tensors = []
        target_tensors = []
        sample = []
        target_output = []
        for index in sample_index:
            for i in index:
                sample.append(DF[0][i])
                target_output.append(DF[1][i])
            vectorized_seqs = [[feature.index(tok) for tok in seq]for seq in sample]
            seq_lengths = LongTensor(list(map(len, vectorized_seqs)))
            seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
            for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
                seq_tensor[idx, :seqlen] = LongTensor(seq)
            embedded_seq_tensor = embed(seq_tensor)
            batch_tensors.append(embedded_seq_tensor)
            vectorized_y = [target.index(tok) for tok in target_output]
            target_tensor = torch.tensor(vectorized_y,dtype = torch.long)
            target_tensors.append(target_tensor)  
        for j in range(n_epoch):
            batch = [batch_tensors[j],target_tensors[j]]
            x = batch[0]
            y = batch[1]
            optimizer.zero_grad()
            output = rnn(x)
            loss = loss_func(output, y) 
            loss.backward(retain_graph=True)
            optimizer.step()
        test_output = rnn(embedded_seq_tensor)
        pred = torch.max(test_output, 1)[1]
        accuracy = accuracy_score(y, pred)
        print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %f' % accuracy)

# 3. Model cross-validation 

In [None]:
feature = sorted(set(all_letters))
vectorized_seqs = [[feature.index(tok) for tok in seq]for seq in DF[0]]
print(DF[0][:5])
print(vectorized_seqs[:5])
embed = Embedding(len(feature), n_letters)
seq_lengths = LongTensor(list(map(len, vectorized_seqs)))
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[idx, :seqlen] = LongTensor(seq)
print(seq_tensor)
embedded_seq_tensor = embed(seq_tensor)
print(embedded_seq_tensor)
print(embedded_seq_tensor.shape)
target = sorted(set(all_categories))
vectorized_y = [target.index(tok) for tok in DF[1]]
print(DF[1][:5])
print(vectorized_y[:5])
target_tensor = torch.tensor(vectorized_y,dtype = torch.long)
print(target_tensor.shape)

## 3.1 Five-fold Cross-Validation for RNN

In [None]:
print('======= 3.1 Accuracy Report =======')
kfold_cv = KFold(n_splits=5, random_state=None, shuffle=True)
rnn = RNN_Batch()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()
n_epoch = 5 
for epoch in range(n_epoch):
    accuracies = []
    for train_index, test_index in kfold_cv.split(np.arange(0, total_samples)):
        X_train, X_test, y_train, y_test = embedded_seq_tensor[train_index], embedded_seq_tensor[test_index], target_tensor[train_index], target_tensor[test_index]
        optimizer.zero_grad()
        output = rnn(X_train)
        loss = loss_func(output, y_train) 
        loss.backward(retain_graph=True)
        optimizer.step()
        test_output = rnn(X_test)
        pred = torch.max(test_output, 1)[1]
        accuracies.append(accuracy_score(y_test, pred))
    print("Epoch: ", epoch, "| Train Loss: %.4f" % loss.item(), '| Test accuracy: %f' % np.mean(accuracies))

## 3.2 LSTM

In [None]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()

        self.rnn = nn.LSTM(
            input_size = n_letters,
            hidden_size = 128,
            num_layers = 1,
            batch_first = True,
        )
        self.out = nn.Linear(128, n_categories)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of r_out
        out = self.out(r_out[:, -1, :])
        return out

In [None]:
print('======= 3.2 Accuracy Report =======')
lstm = LSTM()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()
n_epoch = 5
for epoch in range(n_epoch):
    optimizer.zero_grad()
    output = lstm(embedded_seq_tensor)  
    loss = loss_func(output, target_tensor) 
    loss.backward(retain_graph=True) 
    optimizer.step()
    pred = torch.max(output, 1)[1]
    accuracy = accuracy_score(target_tensor, pred)
    print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %f' % accuracy)

## 3.3 Stratified-Five-Fold Cross-Validation

In [None]:
print('======= 3.3 Accuracy Report =======')
skfold_cv = StratifiedKFold(n_splits=5)
rnn = RNN_Batch()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()
n_epoch = 5 
for epoch in range(n_epoch):
    accuracies = []
    for Train, Test in skfold_cv.split(embedded_seq_tensor,target_tensor):
        optimizer.zero_grad()
        output = rnn(embedded_seq_tensor[Train])
        loss = loss_func(output, target_tensor[Train]) 
        loss.backward(retain_graph=True)
        optimizer.step()
        test_output = rnn(embedded_seq_tensor[Test])
        pred = torch.max(test_output, 1)[1]
        accuracy = accuracy_score(target_tensor[Test], pred)
        accuracies.append(accuracy)
    print("Epoch: ", epoch, "| Train Loss: %.4f" % loss.item(), '| Test accuracy: %f' % np.mean(accuracies))