In [53]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
lan_codes = ['en', 'es', 'pt', 'gl', 'eu', 'ca', 'fr', 'it', 'de']
Languages = ['English', 'Spanish', 'Portuguese', 'Galician', 'Basque', 'Catalan', 'French', 'Italian', 'German']

## Load the Data, Calculate Vocabulary and Perplexity

In [19]:
# Load Data
data = tools.Data()
print('Size of the vocabulary: %d characters' % len(data.vocab))

Size of the vocabulary: 509 characters


In [20]:
preplexity = data.get_perplexity()
print('Preplexity measurement is %.2f' % preplexity)

Preplexity measurement is 34.11


In [21]:
print('Percent of Invalid Characters - Train: %.5f%%' 
    % ((data.train_freq[data.vocab.index('<N>')] / data.train_freq.sum()) * 100.0))
print('Percent of Invalid Characters - Val: %.5f%%' 
    % ((data.val_freq[data.vocab.index('<N>')] / data.val_freq.sum()) * 100.0))

Percent of Invalid Characters - Train: 0.04622%
Percent of Invalid Characters - Val: 0.05987%


## Model (Doruk)

In [36]:
class MyRNN(nn.Module):
    def __init__(self, vocab_size, PAD, lang_size=9, char_vec_size=12, lang_vec_size=2, hidden_size=50):
        
        super(MyRNN, self).__init__()
        self.vocab_size = vocab_size
        self.lang_size = lang_size
        self.char_vec_size = char_vec_size
        self.lang_vec_size = lang_vec_size
        self.hidden_size = hidden_size
        
        self.char_encoder = nn.Embedding(self.vocab_size, self.char_vec_size)
        self.lang_encoder = nn.Embedding(self.lang_size, self.lang_vec_size)
        # the current hidden size = char_vec_size
        self.gru = nn.GRU(self.char_vec_size+self.lang_vec_size, self.hidden_size, num_layers=1)
        self.linear = nn.Linear(self.hidden_size, self.char_vec_size)
        self.decoder = nn.Linear(self.char_vec_size, self.vocab_size)
        
        # This shares the encoder and decoder weights as described in lecture.
        self.decoder.weight = self.char_encoder.weight
        self.decoder.bias.data.zero_()
        
        
        weight = torch.ones(vocab_size)
        # scores over PAD is not counted
        weight[PAD] = 0
        self.sm = nn.LogSoftmax(dim=1)
        self.crit = nn.NLLLoss(weight, size_average=False)

    def forward(self, input, hidden=None):
        emb = pack(torch.cat((self.char_encoder(input[0]), self.lang_encoder(input[1])), -1), input[2])
        output, hidden_t = self.gru(emb, hidden)
        output = unpack(output)[0]
        output = F.tanh(self.linear(output))
        output = self.decoder(output)
        return output, hidden_t

    # Predefined loss function
    def loss(self, prediction, label, reduction='elementwise_mean'):
        prediction = prediction.view(-1, self.vocab_size)
        prediction = self.sm(prediction)
        loss_val = self.crit(prediction, label.view(-1))
        return loss_val

In [76]:
BATCH_SIZE = 64
FEATURE_SIZE = 15
TEST_BATCH_SIZE = 256
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005
MOMENTUM = 0.5
LOG_INTERVAL = 10000
device = torch.device("cpu")

In [71]:
train_tweets, train_lans = tools.data_encoding(data.train, data.vocab, lan_codes)
val_tweets, val_lans = tools.data_encoding(data.val, data.vocab, lan_codes)
test_tweets, test_lans = tools.data_encoding(data.test, data.vocab, lan_codes)

In [77]:
train_loader = tools.get_data_loader(train_tweets, train_lans, BATCH_SIZE)
val_loader = tools.get_data_loader(val_tweets, val_lans, BATCH_SIZE)
test_loader = tools.get_data_loader(test_tweets, test_lans , BATCH_SIZE)

In [74]:
model = MyRNN(len(data.vocab),data.vocab.index('</S>'))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [75]:
tools.train(model, device, train_loader, optimizer, EPOCHS, LOG_INTERVAL, verbose=True)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [82]:
torch.Tensor(train_tweets.astype(int))

tensor([[506.,   9.,   1.,  ..., 507., 507., 507.],
        [506.,   0.,  14.,  ..., 507., 507., 507.],
        [506.,   5.,   8.,  ..., 507., 507., 507.],
        ...,
        [506.,  11.,   1.,  ..., 507., 507., 507.],
        [506.,  10.,   4.,  ..., 507., 507., 507.],
        [506.,  47.,  38.,  ..., 507., 507., 507.]])

## Model (John)