In [2]:
import collections
import numpy as np
import pandas
import pickle
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
import torch.nn.functional as F
import math
from metrics import Metrics
import key_tools as tools

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
def LoadData(filename):
    data = pandas.read_csv(filename, header=None, sep='\t', quoting=3)
    data.columns = ['lang', 'text']
    return data
data = LoadData('../Data/train.tsv')
val_data = LoadData('../Data/val.tsv')
test_data = LoadData('../Data/test.tsv')
data.head()

Unnamed: 0,lang,text
0,es,"Alemania vs Argentina, la tercera es la vencid..."
1,en,I have gained level 39 in The Tribez and Castl...
2,pt,"Finalmente é sexta, mas ainda tenho teste"
3,fr,"""#Marée ↗ #Mimizan 14/04/2016 UTC+2 Basse mer ..."
4,es,Que ganitas de poder ir al gym jupe..


In [4]:
# Create the vocabulary tables
all_langs = data.lang.unique()
lang2idx = dict(zip(all_langs, range(len(all_langs))))
idx2lang = dict(zip(lang2idx.values(), lang2idx.keys()))

counts = collections.Counter()
for line in data.text:
    counts.update(line)

chars = set([c for c in counts if counts[c] >= 10])
chars.update(['PAD', '<S>', '</S>', 'UNK'])
char2idx = dict(zip(chars, range(len(chars))))
idx2char = dict(zip(range(len(chars)), chars))

In [5]:
# size of vocabulary
print('size of vocabulary is {0}'.format(len(chars)))
print('number of unique characters {0}'.format(len(counts)))

total_chars = sum(counts.values())
total_oovs = sum([counts[c] for c in counts if counts[c] < 10])
print('oov rate is {0:.4f}%'.format(100.0 * total_oovs / total_chars))

size of vocabulary is 510
number of unique characters 1393
oov rate is 0.0469%


In [6]:
# WARM_UP: Compute the perplexity of a unigram model
train_counts = np.zeros(len(chars))
val_counts = np.zeros(len(chars))

for line in data.text:
    for c in line:
        idx = char2idx.get(c, char2idx['UNK'])
        train_counts[idx] += 1.0
    train_counts[char2idx['</S>']] += 1.0
for line in val_data.text:
    for c in line:
        idx = char2idx.get(c, char2idx['UNK'])
        val_counts[idx] += 1.0
    val_counts[char2idx['</S>']] += 1.0
train_counts[char2idx['PAD']] += 1
train_counts = train_counts / train_counts.sum()
train_counts[char2idx['<S>']] = 1.0  # this will be zeroed out later
val_counts = val_counts / val_counts.sum()
ppl = np.exp(-(val_counts * np.log(train_counts)).sum())
print('the perplexity is {0:.2f}'.format(ppl))

the perplexity is 34.11


In [7]:
def prepare_data(char2idx, idx2char, lang2idx, idx2lang, data, save_file, max_seq_length=1000):
    sequences = []
    languages = []
    for i in range(len(data.text)):
        line = data.text[i]
        if len(line) > max_seq_length:
            continue
        seq = [char2idx['<S>']]
        for c in line:
            idx = char2idx.get(c, char2idx['UNK'])
            seq += [idx]
        seq += [char2idx['</S>']]
        sequences += [seq]
        languages += [lang2idx[data.lang[i]]]

    pickle.dump({'chars': sequences, 'langs': languages, 'ind2voc': idx2char, 'voc2ind':char2idx, 'ind2lang': idx2lang, 'lang2ind':lang2idx}, open(save_file, 'wb'))
    
prepare_data(char2idx, idx2char, lang2idx, idx2lang, data, 'chars_train.pkl')
prepare_data(char2idx, idx2char, lang2idx, idx2lang, val_data, 'chars_val.pkl')
prepare_data(char2idx, idx2char, lang2idx, idx2lang, test_data, 'chars_test.pkl')


PAD = char2idx['PAD']

In [62]:
class MyRNN(nn.Module):
    def __init__(self, vocab_size, lang_size=9, char_vec_size=12, lang_vec_size=2, hidden_size=50, PAD=0):
        
        super(MyRNN, self).__init__()
        self.vocab_size = vocab_size
        self.lang_size = lang_size
        self.char_vec_size = char_vec_size
        self.lang_vec_size = lang_vec_size
        self.hidden_size = hidden_size
        
        self.char_encoder = nn.Embedding(self.vocab_size, self.char_vec_size)
        self.lang_encoder = nn.Embedding(self.lang_size, self.lang_vec_size)
        # the current hidden size = char_vec_size
        self.gru = nn.GRU(self.char_vec_size+self.lang_vec_size, self.hidden_size, num_layers=1)
        self.linear = nn.Linear(self.hidden_size, self.char_vec_size)
        self.decoder = nn.Linear(self.char_vec_size, self.vocab_size)
        
        # This shares the encoder and decoder weights as described in lecture.
        self.decoder.weight = self.char_encoder.weight
        self.decoder.bias.data.zero_()
        
        
        weight = torch.ones(vocab_size)
        # scores over PAD is not counted
        weight[PAD] = 0
        self.sm = nn.LogSoftmax(dim=1)
        self.crit = nn.NLLLoss(weight, size_average=False)

    def forward(self, input, hidden=None):
        emb = pack(torch.cat((self.char_encoder(input[0]), self.lang_encoder(input[1])), -1), input[2])
        output, hidden_t = self.gru(emb, hidden)
        output = unpack(output)[0]
        output = F.tanh(self.linear(output))
        output = self.decoder(output)
        return output, hidden_t

    # Predefined loss function
    def loss(self, prediction, label, reduction='elementwise_mean'):
        prediction = prediction.view(-1, self.vocab_size)
        prediction = self.sm(prediction)
        loss_val = self.crit(prediction, label.view(-1))
        return loss_val

In [63]:
verbose = False

BATCH_SIZE = 64
FEATURE_SIZE = 15
TEST_BATCH_SIZE = 256
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005

data_train = tools.Dataset('chars_train.pkl', BATCH_SIZE, PAD)
data_val = tools.Dataset('chars_val.pkl', TEST_BATCH_SIZE, PAD)
data_test = tools.Dataset('chars_test.pkl', TEST_BATCH_SIZE, PAD)

model = MyRNN(len(char2idx),PAD=PAD)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [51]:
# playing around
print(data_train.__dict__.keys())

data1, label1, _ = data_train[0]
print(len(data1))
print(data1[0][139])
print(data1[1].size())
print(data1[2])

print(data1[1][0])
print(data1[1][20])

tweet = ''
for c in data_train.data[0]:
    tweet += idx2char.get(c.item())
print(tweet)
print(len(data_train.data[0]))


dict_keys(['batch_size', 'PAD', 'data', 'langs', 'numBatches'])
3
tensor([429, 112, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190,
        190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190,
        190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190,
        190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190,
        190, 190, 190, 190, 190, 190, 190, 190])
torch.Size([141, 64])
[141, 140, 139, 138, 131, 126, 118, 116, 115, 114, 114, 106, 99, 95, 87, 86, 85, 80, 76, 75, 74, 73, 71, 70, 69, 68, 68, 67, 66, 66, 63, 62, 60, 60, 60, 56, 55, 54, 45, 42, 42, 40, 38, 37, 37, 36, 35, 34, 34, 34, 30, 29, 29, 28, 26, 24, 24, 23, 21, 20, 17, 10, 8, 5]
tensor([0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 3, 4, 5, 3, 2, 1, 1, 0, 0, 1, 0, 1, 0, 4,
        2, 2, 6, 0, 2, 1, 2, 2, 2, 1, 2, 0, 3, 1, 2, 2, 1, 3, 0, 2, 2, 0, 1, 0,
        2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 7, 0, 2, 1])
tensor([0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 3, 4, 5, 3, 2, 1, 1, 0, 0, 1,

In [57]:
for epoch in range(EPOCHS):
    if verbose: print(epoch+1)
    train_ppl = tools.train(model, data_train, epoch, optimizer, PAD)
    val_loss, val_ppl = tools.test(model, data_val, PAD)
predictions = tools.get_predictions(model, data_test)

1
tensor([[ 30,  30,  30,  ...,  30,  30,  30],
        [336, 436, 174,  ..., 278, 139, 370],
        [ 99, 429, 448,  ...,  13, 429, 429],
        ...,
        [328, 438, 124,  ..., 190, 190, 190],
        [429, 112, 190,  ..., 190, 190, 190],
        [ 56, 190, 190,  ..., 190, 190, 190]])
tensor([[0, 0, 2,  ..., 0, 2, 1],
        [0, 0, 2,  ..., 0, 2, 1],
        [0, 0, 2,  ..., 0, 2, 1],
        ...,
        [0, 0, 2,  ..., 0, 2, 1],
        [0, 0, 2,  ..., 0, 2, 1],
        [0, 0, 2,  ..., 0, 2, 1]])
OUTPUTTTTTT
tensor([[-0.3956,  0.8194, -0.1348,  ..., -0.5961,  0.6711,  0.2792],
        [-0.3956,  0.8194, -0.1348,  ..., -0.5961,  0.6711,  0.2792],
        [-0.3353,  0.6443, -0.2289,  ..., -0.4849,  0.5773,  0.0859],
        ...,
        [-0.3956,  0.8194, -0.1348,  ..., -0.5961,  0.6711,  0.2792],
        [-0.3353,  0.6443, -0.2289,  ..., -0.4849,  0.5773,  0.0859],
        [-0.3787,  0.4722, -0.1848,  ..., -0.4516,  0.5849, -0.0716]],
       grad_fn=<SelectBackward>)
torch.Size(

KeyboardInterrupt: 

In [None]:
Metrics(predictions, data_test.langs)

In [80]:
predictions = tools.get_predictions(model, data_test)

59
torch.Size([146, 2304])
[262, 262, 262, 262, 262, 262, 262, 262, 262, 448, 448, 448, 448, 448, 448, 448, 448, 448, 53, 53, 53, 53, 53, 53, 53, 53, 53, 187, 187, 187, 187, 187, 187, 187, 187, 187, 336, 336, 336, 336, 336, 336, 336, 336, 336, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 448, 176, 176, 176, 176, 176, 176, 176, 176, 176, 448, 448, 448, 448, 448, 448, 448, 448, 448, 469, 469, 469, 469, 469, 469, 469, 469, 469, 455, 455, 455, 455, 455, 455, 455, 455, 455, 454, 454, 454, 454, 454, 454, 454, 454, 454, 73, 73, 73, 73, 73, 73, 73, 73, 73, 411, 411, 411, 411, 411, 411, 411, 411, 411, 448, 448, 448, 448, 448, 448, 448, 448, 448, 176, 176, 176, 176, 176, 176, 176, 176, 176, 247, 247, 247, 247, 247, 247, 247, 247, 247, 448, 448, 448, 448, 448, 448, 448, 448, 448, 449, 449, 449, 449, 449, 449, 449, 449, 449, 176, 176, 176, 176, 176, 176, 176, 176, 176, 495, 495, 495, 495, 495, 495, 495, 495, 495, 