In [1]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Load the Data and Calculate Vocabulary

In [2]:
Languages = {
    'en':'English', 
    'es':'Spanish', 
    'pt':'Portuguese', 
    'gl':'Galician', 
    'eu':'Basque', 
    'ca':'Catalan', 
    'fr':'French', 
    'it':'Italian', 
    'de':'German'
}

lan_id = {
    'en':0, 
    'es':1, 
    'pt':2, 
    'gl':3, 
    'eu':4, 
    'ca':5, 
    'fr':6, 
    'it':7, 
    'de':8
}

In [9]:
# Load Data
Data = tools.Data()
Data.load_data()

Converting Training Data...
Converting Test Data...
Converting Validation Data...
     Stacking Tensor...

In [23]:
print('Size of the vocabulary: %d characters' % len(Data.vocab))

Size of the vocabulary: 509 characters


In [32]:
# Data is list of tensors (each tensor is single tweet)
print(f'Number of Training Samples:    {Data.train.size()[1]}')
print(f'Number of Validation Samples:  {Data.val.size()[1]}')
print(f'Number of Test Samples:        {Data.test.size()[1]}')

print(f'\n{Data.train.size()}')
print('Data Dimensions (sequence, batch, vocab)')

print(f'\n{Data.train_labels.size()}')
print('Label Dimensions (batch, language-onehot)')

Number of Training Samples:    80175
Number of Validation Samples:  11759
Number of Test Samples:        14960

torch.Size([161, 80175, 509])
Data Dimensions (sequence, batch, vocab)

torch.Size([80175, 9])
Label Dimensions (batch, language-onehot)


## Calculate Percent out of vocab for Training and Validation

In [27]:
out_of_vocab_train = torch.sum(Data.train[:, :, 0])

out_of_vocab_val = torch.sum(Data.val[:,:,0])
    
print(f'Percent of Invalid Characters - Train: {out_of_vocab_train / Data.train[:,:,0].numel() * 100 : 5.5} %')
print(f'Percent of Invalid Characters - Val:   {out_of_vocab_val / Data.val[:,:,0].numel() * 100 : 5.5} %')

Percent of Invalid Characters - Train:  0.018639 %
Percent of Invalid Characters - Val:    0.024192 %


## PREPLEXITY? Implement after model?
seems helpful: https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch

## Model

In [7]:
# playing around, just following the NLP from scratch tutorial
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

rnn = RNN(len(Data.vocab), 128, len(Languages))

In [8]:
input = Data.string_to_tensor('This is a test')
hidden = torch.zeros(1, 128)

output, next_hidden = rnn(input[0], hidden)
print(output.size())

torch.Size([1, 9])


## Model (John)

In [61]:
single_train = Data.train[:,0:1,:]
single_label = Data.train_labels[0:1,:]

RNN = tools.RNN_lan_class(vocab_size=509, hidden_size=100, output_size=3, num_lan = 9)

input_cat = RNN.forward((single_train, single_label), RNN.initHidden)

509


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [67]:
# Debug embedding

char_embed = nn.Embedding(509,14)
char_embed(single_train[0,0,:])

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [66]:
single_train[0,0,:].size()

torch.Size([509])

In [71]:
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
print(input.size())
output = embedding(input)
print(output.size())


# example with padding_idx
embedding = nn.Embedding(10, 3, padding_idx=0)
input = torch.LongTensor([[0,2,0,5]])
embedding(input)


torch.Size([2, 4])
torch.Size([2, 4, 3])


tensor([[[ 0.0000,  0.0000,  0.0000],
         [ 2.4175, -0.3560, -2.3782],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.9229,  1.4340, -1.4837]]], grad_fn=<EmbeddingBackward>)

In [82]:
type(Data.vocab)

char_embed = nn.Embedding(509,14)

char_embed(Data.vocab)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

## Data Representation Take 2


In [91]:
Data.train_pd['tweet'][1]

'Alemania vs Argentina, la tercera es la vencida: El Mundial de Brasil 2014 cierra este domingo con una reedici...'