In [1]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Load the Data and Calculate Vocabulary

In [2]:
Languages = {
    'en':'English', 
    'es':'Spanish', 
    'pt':'Portuguese', 
    'gl':'Galician', 
    'eu':'Basque', 
    'ca':'Catalan', 
    'fr':'French', 
    'it':'Italian', 
    'de':'German'
}

In [3]:
# Load Data
Data = tools.Data()
Data.load_data()

Converting Training Data...
Converting Test Data...
Converting Validation Data...
99.99149587549961555

In [4]:
print('Size of the vocabulary: %d characters' % len(Data.vocab))

Size of the vocabulary: 509 characters


In [5]:
# Data is list of tensors (each tensor is single tweet)
print(len(Data.train))
print(len(Data.val))
print(len(Data.test))

print(f'\n{Data.train[0].size()}')
# Tensor Dimensions (sequence length, batch size, vocab)

80175
11759
14960

torch.Size([115, 1, 509])


## Calculate Percent out of vocab for Training and Validation

In [6]:
num_invalid_train = 0
num_invalid_val = 0
for k in range(len(Data.train)):
    num_invalid_train += torch.sum(Data.train[k][:, :, 0]).double()

for k in range(len(Data.val)):
    num_invalid_val += torch.sum(Data.val[k][:, :, 0]).double()
    
print(f'Percent of Invalid Characters - Train: {num_invalid_train / len(Data.train) * 100 : 5.5} %')
print(f'Percent of Invalid Characters - Val:   {num_invalid_val / len(Data.val) * 100 : 5.5} %')

Percent of Invalid Characters - Train:  3.0009 %
Percent of Invalid Characters - Val:    3.8949 %


## PREPLEXITY? Implement after model?
seems helpful: https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch

## Model

In [7]:
# playing around, just following the NLP from scratch tutorial
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

rnn = RNN(len(Data.vocab), 128, len(Languages))

In [8]:
input = Data.string_to_tensor('This is a test')
hidden = torch.zeros(1, 128)

output, next_hidden = rnn(input[0], hidden)
print(output.size())

torch.Size([1, 9])


## Model (John)