In [1]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from matplotlib import pyplot as plt
# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
lan_codes = ['en', 'es', 'pt', 'gl', 'eu', 'ca', 'fr', 'it', 'de']
Languages = ['English', 'Spanish', 'Portuguese', 'Galician', 'Basque', 'Catalan', 'French', 'Italian', 'German']

## Load the Data, Calculate Vocabulary and Perplexity

In [3]:
# Load Data
data = tools.Data()
print('Size of the vocabulary: %d characters' % len(data.vocab))

Size of the vocabulary: 509 characters


In [4]:
preplexity = data.get_perplexity()
print('Perplexity measurement is %.2f' % preplexity)

Perplexity measurement is 34.11


In [5]:
print('Percent of Invalid Characters - Train: %.5f%%' 
    % ((data.train_freq[data.vocab.index('<N>')] / data.train_freq.sum()) * 100.0))
print('Percent of Invalid Characters - Val: %.5f%%' 
    % ((data.val_freq[data.vocab.index('<N>')] / data.val_freq.sum()) * 100.0))

Percent of Invalid Characters - Train: 0.04622%
Percent of Invalid Characters - Val: 0.05987%


## Model

In [6]:
class MyRNN(nn.Module):
    def __init__(self, vocab_size, lang_size=9, char_vec_size=12, lang_vec_size=2, hidden_size=50, PAD=0):
        super(MyRNN, self).__init__()
        self.vocab_size = vocab_size
        self.lang_size = lang_size
        self.char_vec_size = char_vec_size
        self.lang_vec_size = lang_vec_size
        self.hidden_size = hidden_size
        
        self.char_encoder = nn.Embedding(self.vocab_size, self.char_vec_size)
        self.lang_encoder = nn.Embedding(self.lang_size, self.lang_vec_size)
        # the current hidden size = char_vec_size
        self.gru = nn.GRU(self.char_vec_size+self.lang_vec_size, self.hidden_size, num_layers=1)
        self.linear = nn.Linear(self.hidden_size, self.char_vec_size)
        self.decoder = nn.Linear(self.char_vec_size, self.vocab_size)
        
        # This shares the encoder and decoder weights as described in lecture.
        self.decoder.weight = self.char_encoder.weight
        self.decoder.bias.data.zero_()
        
        
        weight = torch.ones(vocab_size)
        # scores over PAD is not counted
        weight[PAD] = 0
        self.sm = nn.LogSoftmax(dim=1)
        self.crit = nn.NLLLoss(weight, size_average=False)

    def forward(self, tweets, lang, hidden=None):
        emb = torch.cat((self.lang_encoder(lang), self.char_encoder(tweets)), -1)
        output, hidden_t = self.gru(emb, hidden)
        output = F.tanh(self.linear(output))
        output = self.decoder(output)
        return output, hidden_t

    # Predefined loss function
    def loss(self, prediction, label, reduction='elementwise_mean'):
        prediction = prediction.view(-1, self.vocab_size)
        prediction = self.sm(prediction)
        loss_val = self.crit(prediction, label.view(-1))
        return loss_val

In [7]:
BATCH_SIZE = 64
FEATURE_SIZE = 15
TEST_BATCH_SIZE = 256
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005
MOMENTUM = 0.5
LOG_INTERVAL = 500
device = torch.device("cpu")

In [8]:
# this cell takes a few minutes to run
train_tweets, train_lans = tools.data_encoding(data.train, data.vocab, lan_codes)
val_tweets, val_lans = tools.data_encoding(data.val, data.vocab, lan_codes)
test_tweets, test_lans = tools.data_encoding(data.test, data.vocab, lan_codes)

In [9]:
train_loader = tools.get_data_loader(train_tweets, train_lans, BATCH_SIZE, shuffle=True)
val_loader = tools.get_data_loader(val_tweets, val_lans, BATCH_SIZE)
test_loader = tools.get_data_loader(test_tweets, test_lans , BATCH_SIZE)

In [10]:
model = MyRNN(len(data.vocab), PAD=data.vocab.index('</S>'))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)



In [None]:
tools.train(model, device, train_loader, optimizer, EPOCHS, LOG_INTERVAL,verbose=True)





In [None]:
tools.test(model, device, test_loader)

## Building Prediction Method

In [None]:
tweets_val = torch.tensor(val_tweets, dtype=torch.long, device=torch.device("cpu"))
language_val = torch.tensor(val_lans, dtype=torch.long, device=torch.device("cpu"))

In [None]:
def predict(model, device, data):
    '''
    lan - language id (0-8)
    '''
    model.eval()
    test_loss = 0
    test_ppl = 0
    first_loop = True
    with torch.no_grad():

        for lan in range(9):
            label = torch.ones(data.size(), dtype=torch.long)*lan
            data, label = data.to(device), label.to(device)
            output, hidden = model(data, label)
            
            output = F.log_softmax(output, dim=2)
            #convert to numpy
            data_np = data.numpy()
            output_np = output.numpy()

            # calculate log prob for each letter of sequence (using output matrix)     
            prob = np.zeros(data_np.shape)
            for batch in range(output_np.shape[0]):
                for char in range(output_np.shape[1]):
                    prob[batch, char] = output_np[batch, char, data_np[batch, char]]

            if lan == 0:
                total_prob = np.sum(prob, axis=1)
            else:
                total_prob = np.vstack((np.sum(prob, axis=1),total_prob))
        
        # Choose language with highest character probability
        output = np.argmax(total_prob,axis=0)
        return output

pred = predict(model, device, tweets_val[:1000,:])

In [None]:
print(f'Percent Correct: {np.sum(pred == val_lans[:1000,0])/pred.shape[0]*100}')

In [None]:
print(prob[:30])
print(val_lans[:30,0].astype(int))

## Understanding Model Output

In [246]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    test_ppl = 0
    with torch.no_grad():
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            output, hidden = model(data, label)
            return output, hidden, input_data, input_label
            test_loss += model.loss(output, label).item()
            test_ppl += math.exp(F.cross_entropy(output.view(-1, 509), label.view(-1), ignore_index=507))

    test_loss /= len(test_loader.dataset)
    test_ppl /= len(test_loader.dataset)
    print('test_ppl : ' + str(test_ppl))
    print('test_loss : ' + str(test_loss))
    
    return test_loss, test_ppl


In [247]:
print(output.size())
print(hidden.size())
print(input_data.size())
print(input_label.size())

output_np = output.numpy()
input_np = input_data.numpy()

print(train_tweets.shape)

torch.Size([64, 282, 509])
torch.Size([1, 282, 50])
torch.Size([64, 282])
torch.Size([64, 282])
(80175, 282)


In [248]:
output

tensor([[[-2.4818, -0.8084,  6.5227,  ..., -1.1392,  1.5219, -2.8674],
         [-4.5427, -1.2450,  4.9231,  ..., -1.1142,  1.2687, -1.2153],
         [-1.4200,  0.1716,  7.3109,  ..., -1.3143,  1.9762, -3.5480],
         ...,
         [-1.1582,  0.9264,  7.8384,  ..., -1.2702,  2.2328, -4.0198],
         [-1.1582,  0.9264,  7.8384,  ..., -1.2702,  2.2328, -4.0198],
         [-1.1582,  0.9264,  7.8384,  ..., -1.2702,  2.2328, -4.0198]],

        [[-1.9740, -0.1071,  7.9504,  ..., -1.2237,  2.1967, -3.6512],
         [-3.2058, -0.9986,  7.5842,  ..., -1.4399,  2.1403, -2.8306],
         [-1.5717,  0.9238,  8.6058,  ..., -1.3325,  2.6930, -3.9434],
         ...,
         [-0.0657,  3.1273,  7.8197,  ..., -0.8086,  2.4213, -5.0161],
         [-0.0657,  3.1273,  7.8197,  ..., -0.8086,  2.4213, -5.0161],
         [-0.0657,  3.1273,  7.8197,  ..., -0.8086,  2.4213, -5.0161]],

        [[-1.0748,  1.1423,  8.1526,  ..., -1.0914,  2.3910, -4.2921],
         [-3.0909, -0.5738,  7.6684,  ..., -1