In [25]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from matplotlib import pyplot as plt
# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Single source of truth: (ISO code, display name) pairs, in label order.
_LANGUAGE_TABLE = [
    ('en', 'English'),
    ('es', 'Spanish'),
    ('pt', 'Portuguese'),
    ('gl', 'Galician'),
    ('eu', 'Basque'),
    ('ca', 'Catalan'),
    ('fr', 'French'),
    ('it', 'Italian'),
    ('de', 'German'),
]
# Parallel lists derived from the table; position i in one matches position i in the other.
lan_codes = [code for code, _ in _LANGUAGE_TABLE]
Languages = [name for _, name in _LANGUAGE_TABLE]

## Load the Data, Calculate Vocabulary and Perplexity

In [27]:
# Build the project's data container and report the length of its vocabulary.
data = tools.Data()
vocabulary_size = len(data.vocab)
print('Size of the vocabulary: %d characters' % vocabulary_size)

Size of the vocabulary: 509 characters


In [28]:
# Perplexity as reported by the data object itself (see tools.Data.get_perplexity).
perplexity = data.get_perplexity()
# Alias under the original, misspelled name so any cell that still reads
# `preplexity` keeps working.
preplexity = perplexity
print('Perplexity measurement is %.2f' % perplexity)

Perplexity measurement is 34.11


In [29]:
# Share of the '<N>' vocabulary entry in each split's frequency table,
# printed as a percentage (train first, then validation).
invalid_idx = data.vocab.index('<N>')
for template, freq in (
    ('Percent of Invalid Characters - Train: %.5f%%', data.train_freq),
    ('Percent of Invalid Characters - Val: %.5f%%', data.val_freq),
):
    share = freq[invalid_idx] / freq.sum()
    print(template % (share * 100.0))

Percent of Invalid Characters - Train: 0.04622%
Percent of Invalid Characters - Val: 0.05987%


## Model

In [170]:
class RNN(nn.Module):
    """GRU sequence model over token ids, conditioned on a language id.

    Each input token id and its language id are embedded separately, the two
    embeddings are concatenated and fed to a single-layer GRU, and the GRU
    output is projected back to the token-embedding size and then to
    per-token vocabulary scores. The output projection shares its weight
    matrix with the token embedding (weight tying).

    Args:
        ignore: Vocabulary index whose class weight is set to 0 by
            ``initial_weights`` (so a weighted loss skips that class).
        vocab_size: Number of entries in the token vocabulary.
        num_langs: Number of distinct language ids.
        tweet_embed_size: Dimension of the token embedding.
        lang_embed_size: Dimension of the language embedding.
        hidden_size: Dimension of the GRU hidden state.
    """

    def __init__(self, ignore, vocab_size=509, num_langs=9,
                 tweet_embed_size=12, lang_embed_size=2, hidden_size=50):
        super(RNN, self).__init__()
        self.ignore = ignore
        # Remembered so initial_weights() always matches the decoder width.
        self.vocab_size = vocab_size
        self.embedding_tweet = nn.Embedding(vocab_size, tweet_embed_size)
        self.embedding_lang = nn.Embedding(num_langs, lang_embed_size)
        self.gru = nn.GRU(tweet_embed_size + lang_embed_size, hidden_size, num_layers=1)
        self.linear = nn.Linear(hidden_size, tweet_embed_size)
        self.decoder = nn.Linear(tweet_embed_size, vocab_size)
        # Weight tying: both matrices are (vocab_size, tweet_embed_size), so the
        # decoder reuses the embedding parameter instead of learning its own.
        self.decoder.weight = self.embedding_tweet.weight
        self.decoder.bias.data.zero_()

    def initial_weights(self):
        """Return per-class loss weights: 1 everywhere, 0 at ``self.ignore``.

        Returns:
            A float tensor of shape ``(vocab_size,)``.
        """
        weights = torch.ones(self.vocab_size)
        weights[self.ignore] = 0
        return weights

    def forward(self, tweets, lang, hidden=None):
        """Score every vocabulary entry at every input position.

        Args:
            tweets: Integer tensor of token ids.
            lang: Integer tensor of language ids with the same shape as
                ``tweets`` (the two embeddings are concatenated on the last
                axis, so all leading axes must match).
            hidden: Optional initial GRU hidden state; ``None`` lets the GRU
                start from zeros.

        Returns:
            ``(output, hidden_t)`` where ``output`` has the input's leading
            shape plus a trailing ``vocab_size`` axis and ``hidden_t`` is the
            final GRU hidden state. ``output`` holds raw decoder scores: no
            softmax or log-softmax is applied here.
        """
        # NOTE(review): the GRU is built with the default batch_first=False, so
        # it treats axis 0 as time — confirm callers pass (seq_len, batch).
        emb = torch.cat((self.embedding_tweet(tweets), self.embedding_lang(lang)), -1)
        output, hidden_t = self.gru(emb, hidden)
        # torch.tanh replaces the deprecated F.tanh; the result is identical.
        output = torch.tanh(self.linear(output))
        output = self.decoder(output)
        return output, hidden_t

In [171]:
# Training configuration.
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 0.001
# L2 penalty handed to the optimizer's `weight_decay` argument.
WEIGHT_DECAY = 0.0005
# Historical name for the same value: it is passed as `weight_decay`, not as a
# momentum term. Kept as an alias so existing cells keep working.
MOMENTUM = WEIGHT_DECAY
LOG_INTERVAL = 1000
device = torch.device("cpu")

# Seed the NumPy and PyTorch global RNGs so repeated runs are comparable.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [8]:
# Slow cell: expect a few minutes.
# Encode the train / val / test splits in that order with the same vocabulary
# and language-code list; each call yields a (tweets, languages) pair.
(train_tweets, train_lans), (val_tweets, val_lans), (test_tweets, test_lans) = (
    tools.data_encoding(split, data.vocab, lan_codes)
    for split in (data.train, data.val, data.test)
)

In [172]:
# Batched loaders over the encoded splits; only the training loader asks for shuffling.
train_loader = tools.get_data_loader(
    train_tweets,
    train_lans,
    BATCH_SIZE,
    shuffle=True,
)
val_loader = tools.get_data_loader(
    val_tweets,
    val_lans,
    BATCH_SIZE,
)

In [173]:
# The '</S>' vocabulary index is handed to the model as the class to ignore;
# model.initial_weights() gives that class a loss weight of 0.
model = RNN(data.vocab.index('</S>'))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=MOMENTUM)
# reduction='sum' is the current spelling of the deprecated size_average=False.
# NOTE(review): NLLLoss expects log-probabilities, but RNN.forward returns raw
# decoder scores — confirm tools.train/tools.test apply log_softmax first
# (otherwise nn.CrossEntropyLoss is the matching criterion).
criterion = nn.NLLLoss(model.initial_weights(), reduction='sum')

In [174]:
# Train `model` on `train_loader` for EPOCHS epochs via the project helper,
# with verbose output enabled.
tools.train(
    model,
    device,
    train_loader,
    optimizer,
    criterion,
    EPOCHS,
    LOG_INTERVAL,
    verbose=True,
)



KeyboardInterrupt: 

In [141]:
# Evaluate on the validation loader; the last argument is the '</S>' vocabulary index.
# NOTE(review): the returned pair looks like (total loss, perplexity) — confirm in tools.test.
tools.test(
    model,
    device,
    val_loader,
    criterion,
    data.vocab.index('</S>'),
)

(44577.85694885254, 187.59736003674766)

In [None]:
# Validation tweets as a LongTensor on the configured device.
# Built from `val_tweets` (previously `test_tweets`) so the rows line up with
# the `val_lans` labels they are scored against in the prediction cell.
tweets_val = torch.tensor(val_tweets, dtype=torch.long, device=device)

In [None]:
# Score only the first 1000 rows: all columns of the tweets, column 0 of the language array.
first_rows = slice(1000)
pred = tools.predict(
    model,
    device,
    tweets_val[first_rows, :],
    val_lans[first_rows, 0],
)
print(f'Percent Correct: {pred*100}')