In [87]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from matplotlib import pyplot as plt

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
lan_codes = ['en', 'es', 'pt', 'gl', 'eu', 'ca', 'fr', 'it', 'de']
Languages = ['English', 'Spanish', 'Portuguese', 'Galician', 'Basque', 'Catalan', 'French', 'Italian', 'German']

## Load the Data, Calculate Vocabulary and Perplexity

In [89]:
# Load Data
data = tools.Data()
print('Size of the vocabulary: %d characters' % len(data.vocab))

Size of the vocabulary: 509 characters


In [90]:
preplexity = data.get_perplexity()
print('Preplexity measurement is %.2f' % preplexity)

Preplexity measurement is 34.11


In [91]:
print('Percent of Invalid Characters - Train: %.5f%%' 
    % ((data.train_freq[data.vocab.index('<N>')] / data.train_freq.sum()) * 100.0))
print('Percent of Invalid Characters - Val: %.5f%%' 
    % ((data.val_freq[data.vocab.index('<N>')] / data.val_freq.sum()) * 100.0))

Percent of Invalid Characters - Train: 0.04622%
Percent of Invalid Characters - Val: 0.05987%


## Model

In [106]:
class MyRNN(nn.Module):
    def __init__(self, vocab_size, lang_size=9, char_vec_size=12, lang_vec_size=2, hidden_size=50, PAD=0):
        super(MyRNN, self).__init__()
        self.vocab_size = vocab_size
        self.lang_size = lang_size
        self.char_vec_size = char_vec_size
        self.lang_vec_size = lang_vec_size
        self.hidden_size = hidden_size
        
        self.char_encoder = nn.Embedding(self.vocab_size, self.char_vec_size)
        self.lang_encoder = nn.Embedding(self.lang_size, self.lang_vec_size)
        # the current hidden size = char_vec_size
        self.gru = nn.GRU(self.char_vec_size+self.lang_vec_size, self.hidden_size, num_layers=1)
        self.linear = nn.Linear(self.hidden_size, self.char_vec_size)
        self.decoder = nn.Linear(self.char_vec_size, self.vocab_size)
        
        # This shares the encoder and decoder weights as described in lecture.
        self.decoder.weight = self.char_encoder.weight
        self.decoder.bias.data.zero_()
        
        
        weight = torch.ones(vocab_size)
        # scores over PAD is not counted
        weight[PAD] = 0
        self.sm = nn.LogSoftmax(dim=1)
        self.crit = nn.NLLLoss(weight, size_average=False)

    def forward(self, tweets, lang, hidden=None):
        emb = torch.cat((self.lang_encoder(lang), self.char_encoder(tweets)), -1)
        output, hidden_t = self.gru(emb, hidden)
        output = F.softmax(self.linear(output))
        return output, hidden_t

    # Predefined loss function
    def loss(self, prediction, label, reduction='elementwise_mean'):
        prediction = prediction.view(-1, self.vocab_size)
        prediction = self.sm(prediction)
        loss_val = self.crit(prediction, label.view(-1))
        return loss_val

In [100]:
BATCH_SIZE = 64
FEATURE_SIZE = 15
TEST_BATCH_SIZE = 256
EPOCHS = 2
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005
MOMENTUM = 0.5
LOG_INTERVAL = 500
device = torch.device("cpu")

In [101]:
# this cell takes a few minutes to run
train_tweets, train_lans = tools.data_encoding(data.train, data.vocab, lan_codes)
val_tweets, val_lans = tools.data_encoding(data.val, data.vocab, lan_codes)
test_tweets, test_lans = tools.data_encoding(data.test, data.vocab, lan_codes)

In [102]:
train_loader = tools.get_data_loader(train_tweets, train_lans, BATCH_SIZE, shuffle=True)
val_loader = tools.get_data_loader(val_tweets, val_lans, BATCH_SIZE)
test_loader = tools.get_data_loader(test_tweets, test_lans , BATCH_SIZE)

In [103]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f97e03036d0>

In [104]:
model = MyRNN(len(data.vocab), PAD=data.vocab.index('</S>'))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [107]:
tools.train(model, device, train_loader, optimizer, EPOCHS, LOG_INTERVAL,verbose=True)



In [115]:
# Save Model
import pickle
torch.save(model, 'trained_lan_model.pkl')

PicklingError: Can't pickle <class '__main__.MyRNN'>: it's not the same object as __main__.MyRNN

## Understanding Inputs and Outputs of Model

In [123]:
output, hidden, data = tools.test2(model, device, train_loader)
print(output.size())

torch.Size([64, 282])
torch.Size([64, 282])
torch.Size([64, 282, 509])


In [156]:
letter = torch.argmax(output[:,:,:], dim=2)
px.imshow(letter)

In [148]:
# Load Data
data = tools.Data()
print('Size of the vocabulary: %d characters' % len(data.vocab))


Size of the vocabulary: 509 characters
<N>


In [149]:
print(data.vocab[0])

<N>


In [117]:
import plotly.express as px
px.imshow(output[0,:,:])

In [46]:
tweet = torch.tensor(train_tweets, dtype = torch.long)
lang = torch.tensor(train_tweets, dtype = torch.long)
model(tweet,lang)

IndexError: index out of range in self

In [45]:
import inspect
inspect.getmembers(train_loader.dataset)
print(train_loader.dataset.tensors[0].size())

tweet = torch.tensor(train_tweets[0,:], dtype = torch.long)
print(tweet.size())

torch.Size([80175, 282])
torch.Size([282])
