In [1]:
import numpy as np
import tools
import torch
import torch.nn as nn
import torch.nn.functional as F

# Some magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Load the Data and Calculate Vocabulary

In [2]:
Languages = {
    'en':'English', 
    'es':'Spanish', 
    'pt':'Portuguese', 
    'gl':'Galician', 
    'eu':'Basque', 
    'ca':'Catalan', 
    'fr':'French', 
    'it':'Italian', 
    'de':'German'
}

lan_id = {
    'en':0, 
    'es':1, 
    'pt':2, 
    'gl':3, 
    'eu':4, 
    'ca':5, 
    'fr':6, 
    'it':7, 
    'de':8
}

In [9]:
# Load Data
Data = tools.Data()
Data.load_data()

Converting Training Data...
Converting Test Data...
Converting Validation Data...
     Stacking Tensor...

In [None]:
print('Size of the vocabulary: %d characters' % len(Data.vocab))

In [45]:
# Data is list of tensors (each tensor is single tweet)
print(f'Number of Training Samples:    {Data.train.size()[1]}')
print(f'Number of Validation Samples:  {Data.val.size()[1]}')
print(f'Number of Test Samples:        {Data.test.size()[1]}')

print(f'\n{Data.train.size()}')
# Tensor Dimensions (sequence, batch, vocab)

Number of Training Samples:    80175
Number of Validation Samples:  11759
Number of Test Samples:        14960

torch.Size([161, 80175, 509])


In [30]:
temp = torch.cat(Data.train, dim=1)

In [39]:
float(torch.sum(temp[:,:,0])/torch.numel(temp[:,:,0])*100)

0.018639350309967995

In [92]:
labels = Data.train_pd['lan'].to_numpy()
labels_int = []
for k, label in enumerate(labels):
    labels_int.append(lan_id[label])

labels_tens = torch.tensor(labels_int)

one_hot = torch.nn.functional.one_hot(labels_tens)

In [4]:
onehot = Data.label_to_tensor(Data.val_pd['lan'].to_numpy())
onehot.size()

AttributeError: 'Data' object has no attribute 'val_pd'

tensor([0, 0, 0, 0, 0, 0, 0, 1, 0])
7


## Calculate Percent out of vocab for Training and Validation

In [32]:
num_invalid_train = 0
num_invalid_val = 0
for k in range(len(Data.train)):
    num_invalid_train += torch.sum(Data.train[k][:, :, 0]).double()

for k in range(len(Data.val)):
    num_invalid_val += torch.sum(Data.val[k][:, :, 0]).double()
    
print(f'Percent of Invalid Characters - Train: {num_invalid_train / len(Data.train) * 100 : 5.5} %')
print(f'Percent of Invalid Characters - Val:   {num_invalid_val / len(Data.val) * 100 : 5.5} %')

Percent of Invalid Characters - Train:  3.0009 %
Percent of Invalid Characters - Val:    3.8949 %


## PREPLEXITY? Implement after model?
seems helpful: https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch

## Model

In [7]:
# playing around, just following the NLP from scratch tutorial
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

rnn = RNN(len(Data.vocab), 128, len(Languages))

In [8]:
input = Data.string_to_tensor('This is a test')
hidden = torch.zeros(1, 128)

output, next_hidden = rnn(input[0], hidden)
print(output.size())

torch.Size([1, 9])


## Model (John)