In [1]:
import collections
import numpy as np
import pandas
import pickle
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
import torch.nn.functional as F
import math
import sys
import os

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Corpus directory: a "Data" folder that is a sibling of the current working
# directory. Built with os.path.join so the separator is right on every platform.
dirname = os.path.join(os.path.dirname(os.getcwd()), 'Data')


def LoadData(filename):
    """Read one gzip-compressed, tab-separated corpus split into a DataFrame.

    The file is expected to have no header row and exactly two columns, which
    are named ``lang`` and ``text``.

    Args:
        filename: File name relative to ``dirname`` (an absolute path is used
            as-is, because ``os.path.join`` discards ``dirname`` in that case).

    Returns:
        pandas.DataFrame with string columns ``lang`` and ``text``.

    Raises:
        ValueError: if the file does not contain exactly two columns.
    """
    fullname = os.path.join(dirname, filename)
    data = pandas.read_csv(
        fullname,
        header=None,
        sep='\t',
        quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are ordinary text
        compression='gzip',
        # Every cell is raw text. Without these two options a text that is
        # exactly "null", "NA", "None", ... is parsed as a float NaN, which
        # breaks the character-level loops that iterate over each line.
        dtype=str,
        keep_default_na=False,
    )
    data.columns = ['lang', 'text']
    return data
# Read the train / validation / test splits, in that order.
data, val_data, test_data = (
    LoadData(split_file)
    for split_file in ('train.tsv.gz', 'val.tsv.gz', 'test.tsv.gz')
)
# Preview the first rows of the training split.
data.head()

Unnamed: 0,lang,text
0,es,"Alemania vs Argentina, la tercera es la vencid..."
1,en,I have gained level 39 in The Tribez and Castl...
2,pt,"Finalmente é sexta, mas ainda tenho teste"
3,fr,"""#Marée ↗ #Mimizan 14/04/2016 UTC+2 Basse mer ..."
4,es,Que ganitas de poder ir al gym jupe..


In [3]:
# Create the vocabulary tables

# Languages are numbered in order of first appearance in the training split.
all_langs = data.lang.unique()
lang2idx = {lang: idx for idx, lang in enumerate(all_langs)}
idx2lang = {idx: lang for lang, idx in lang2idx.items()}

# Character frequencies over the whole training split.
counts = collections.Counter()
for line in data.text:
    counts.update(line)

# The vocabulary is every character seen at least 10 times plus four special
# symbols. The specials are multi-character strings, so they can never collide
# with a real (single-character) entry of `counts`.
special_symbols = ['PAD', '<S>', '</S>', 'UNK']
chars = set(c for c in counts if counts[c] >= 10)
chars.update(special_symbols)

# Assign indices from an explicitly ordered list rather than from raw set
# iteration order. String hashing is randomized per process, so iterating the
# set directly gives a different char <-> index mapping (and a different PAD
# index) after every kernel restart, which makes saved pickles and trained
# weights from different sessions incompatible.
vocab = special_symbols + sorted(chars.difference(special_symbols))
char2idx = {symbol: idx for idx, symbol in enumerate(vocab)}
idx2char = dict(enumerate(vocab))

In [4]:
# Vocabulary size versus the number of distinct characters in the training text.
vocab_size = len(chars)
unique_chars = len(counts)
print('size of vocabulary is {0}'.format(vocab_size))
print('number of unique characters {0}'.format(unique_chars))

# Out-of-vocabulary rate: share of character occurrences that belong to
# characters seen fewer than 10 times.
total_chars = sum(counts.values())
total_oovs = sum(freq for freq in counts.values() if freq < 10)
oov_percent = 100.0 * total_oovs / total_chars
print('oov rate is {0:.4f}%'.format(oov_percent))

size of vocabulary is 510
number of unique characters 1393
oov rate is 0.0469%


In [5]:
# WARM_UP: Compute the perplexity of a unigram model
def _unigram_counts(lines):
    """Count vocabulary-index occurrences over ``lines``.

    Characters missing from ``char2idx`` are counted as 'UNK', and one '</S>'
    is counted per line.

    Args:
        lines: iterable of strings.

    Returns:
        numpy array of length ``len(chars)`` holding float counts.
    """
    unk_idx = char2idx['UNK']
    eos_idx = char2idx['</S>']
    totals = np.zeros(len(chars))
    for line in lines:
        for c in line:
            totals[char2idx.get(c, unk_idx)] += 1.0
        totals[eos_idx] += 1.0
    return totals


train_counts = _unigram_counts(data.text)
val_counts = _unigram_counts(val_data.text)

# Kept from the original cell so `train_counts` holds the same values as
# before: one pseudo-count for PAD, and '<S>' pinned to probability 1.0
# (log 1 = 0, so it contributes nothing to the sum below).
train_counts[char2idx['PAD']] += 1
train_counts = train_counts / train_counts.sum()
train_counts[char2idx['<S>']] = 1.0
val_counts = val_counts / val_counts.sum()

# Sum only over symbols that occur in the validation text. A symbol with zero
# validation mass contributes nothing, but if its training probability is also
# zero (e.g. 'UNK' when no training character is rare) the product
# 0 * log(0) is nan and would poison the whole result.
seen_in_val = val_counts > 0
ppl = np.exp(-(val_counts[seen_in_val] * np.log(train_counts[seen_in_val])).sum())
print('the perplexity is {0:.2f}'.format(ppl))

the perplexity is 34.11


In [6]:
def prepare_data(char2idx, idx2char, lang2idx, idx2lang, data, save_file, max_seq_length=1000):
    """Encode a corpus split as index sequences and pickle it to ``save_file``.

    Each kept line becomes ``[<S>, c_1, ..., c_n, </S>]`` where every character
    is mapped through ``char2idx`` (unknown characters map to 'UNK'). Lines
    longer than ``max_seq_length`` characters are dropped together with their
    language label.

    Args:
        char2idx: mapping from symbol to index; must contain '<S>', '</S>', 'UNK'.
        idx2char: inverse of ``char2idx``; stored in the pickle unchanged.
        lang2idx: mapping from language code to index.
        idx2lang: inverse of ``lang2idx``; stored in the pickle unchanged.
        data: frame with ``text`` and ``lang`` columns.
        save_file: path of the pickle file to write.
        max_seq_length: maximum number of characters a line may have.

    Returns:
        None. The pickle holds a dict with keys 'chars', 'langs', 'ind2voc',
        'voc2ind', 'ind2lang' and 'lang2ind'.

    Raises:
        KeyError: if a language in ``data.lang`` is missing from ``lang2idx``.
    """
    start_idx = char2idx['<S>']
    end_idx = char2idx['</S>']
    unk_idx = char2idx['UNK']

    sequences = []
    languages = []
    # Iterate over the column values themselves. Looking rows up with
    # data.text[i] is a label lookup and fails on any frame whose index is not
    # the default 0..n-1 (for example after filtering or sampling).
    for line, lang in zip(data.text, data.lang):
        if len(line) > max_seq_length:
            continue
        encoded = [char2idx.get(c, unk_idx) for c in line]
        sequences.append([start_idx] + encoded + [end_idx])
        languages.append(lang2idx[lang])

    payload = {
        'chars': sequences,
        'langs': languages,
        'ind2voc': idx2char,
        'voc2ind': char2idx,
        'ind2lang': idx2lang,
        'lang2ind': lang2idx,
    }
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(save_file, 'wb') as handle:
        pickle.dump(payload, handle)
    
# Serialise every split with the shared vocabulary and language tables.
for split_frame, split_file in (
    (data, 'chars_train.pkl'),
    (val_data, 'chars_val.pkl'),
    (test_data, 'chars_test.pkl'),
):
    prepare_data(char2idx, idx2char, lang2idx, idx2lang, split_frame, split_file)


# Vocabulary index of the 'PAD' symbol, used by the training cell.
PAD = char2idx['PAD']

In [11]:
# Training cell: builds the datasets, model and optimizer from the project-local
# `key_tools` module, runs EPOCHS rounds of train + validation, then collects
# predictions on the test split.
# NOTE(review): the behaviour of tools.Dataset / MyRNN / train / test /
# get_predictions is defined in key_tools, not in this notebook — the comments
# below describe only what this cell passes in and gets back.
import key_tools as tools
verbose = True  # when True, the 1-based epoch number is printed each epoch

# Hyper-parameters.
BATCH_SIZE = 64
FEATURE_SIZE = 15  # NOTE(review): not referenced anywhere in the visible cells
TEST_BATCH_SIZE = 256
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005

# One Dataset per pickle written by prepare_data; the train split uses
# BATCH_SIZE, the validation and test splits use TEST_BATCH_SIZE.
data_train = tools.Dataset('chars_train.pkl', BATCH_SIZE, PAD)
data_val = tools.Dataset('chars_val.pkl', TEST_BATCH_SIZE, PAD)
data_test = tools.Dataset('chars_test.pkl', TEST_BATCH_SIZE, PAD)

# The model is constructed with the vocabulary size and the PAD index.
model = tools.MyRNN(len(char2idx),PAD=PAD)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for epoch in range(EPOCHS):
    if verbose: print(epoch+1)
    # presumably returns the training perplexity for this epoch — TODO confirm
    train_ppl = tools.train(model, data_train, epoch, optimizer, PAD)
    # NOTE(review): val_loss / val_ppl are assigned but never printed or stored
    # in this cell, so only the last epoch's values survive the loop.
    val_loss, val_ppl = tools.test(model, data_val, PAD)
# Runs once, after the last epoch (it is outside the loop).
predictions = tools.get_predictions(model, data_test)



0


KeyboardInterrupt: 

In [None]:
# Passes the test-split predictions and data_test.langs to Metrics
# (presumably to score the predictions — TODO confirm).
# NOTE(review): `Metrics` is not defined or imported in any visible cell; unless
# it is defined elsewhere, a Restart & Run All would raise NameError here.
Metrics(predictions, data_test.langs)