### Import custom modules from current folder

In [247]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import nltk
from sklearn.model_selection import train_test_split
# from simple_text_representation.classes import Text
# from simple_text_representation.models import Database
import pandas as pd
import numpy as np
# from nltk.draw.tree import draw_trees

In [3]:
# database = Database('educationalTexts', 'postgres', '', '0.0.0.0', 5432)
# path = r'http://localhost/'

In [4]:
def transformToString(text):
    textStr = ''

    for paragraph in text:
        for line in paragraph:
            textStr = textStr + line
    return textStr

### Load the data

In [5]:
dfSeventh = pd.read_csv('../Data/textsSeventhgrade.csv')
dfEighth = pd.read_csv('../Data/textsEighthgrade.csv')
dfNinth = pd.read_csv('../Data/textsNinthgrade.csv')
dfTenth = pd.read_csv('../Data/textsTenthGrade.csv')
dfEleventh = pd.read_csv('../Data/textsEleventhgrade.csv')

### Transform the data to known structures

In [6]:
def transform_csv_to_structure(df):
    text_ids = df['id'].unique()
    texts = list()
    for text_id in text_ids:
        sentence_grouped = df.where(dfSeventh['id'] == 1)
        clean_df = df[pd.notnull(sentence_grouped['id'])]
        paragraphs = list()
        df_paragraphs = clean_df.groupby('paragraph_id').apply(lambda x: "%s" % '|'.join(x['value'])).values
        for df_paragraph in df_paragraphs:
            paragraphs.append(df_paragraph.split('|'))
        texts.append(paragraphs)

    return texts

In [7]:
textOfSeventhGrade = transform_csv_to_structure(dfSeventh)
textOfEightGrade = transform_csv_to_structure(dfEighth)
textOfNineGrade = transform_csv_to_structure(dfNinth)
textOfTenthGrade = transform_csv_to_structure(dfTenth)
textOfEleventhGrade = transform_csv_to_structure(dfEleventh)

textsFormatedSG = [transformToString(textArr) for textArr in textOfSeventhGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEightGrade]
textsFormatedNG = [transformToString(textArr) for textArr in textOfNineGrade]
textsFormatedTG = [transformToString(textArr) for textArr in textOfTenthGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEleventhGrade]

### Format train and test data

In [8]:
data = np.concatenate((np.array(textsFormatedSG),
                       np.array(textsFormatedEG),
                       np.array(textsFormatedNG),
                       np.array(textsFormatedTG),
                       np.array(textsFormatedEG)  )) 
labels = np.concatenate((np.full(len(textsFormatedSG), 0),
                         np.full(len(textsFormatedEG), 1),
                         np.full(len(textsFormatedNG), 2),
                         np.full(len(textsFormatedTG), 3),
                         np.full(len(textsFormatedEG), 4)))

In [9]:
len(data)

179

In [10]:
len(labels)

179

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [12]:
len(y_train)

143

In [13]:
len(x_train)

143

### Load the trained embeddings

In [14]:
EMBEDDINGS_DIMESION = 300

In [15]:
embeddingsIndex = dict()
f = open('../SBW-vectors-300-min5.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddingsIndex[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddingsIndex))

Loaded 1000654 word vectors.


### Preprocesing the data

In [258]:
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
from torch.autograd import Variable
from math import ceil

# from torch.nn.utils.rnn import pad_sequence
# import spacy

In [152]:
# nlp = spacy.load('es')
# doc = nlp(textOfSeventhGrade[0][0][0])
# token_sentence = [token for token in doc]
# # token_sentence


In [18]:
def createEmbedingMatrix(tokenizer, vocabSize):
    embeddingMatrix = np.zeros((vocabSize, 300))
    for word, i in tokenizer.word_index.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            embeddingMatrix[i] = embeddingVector
    return embeddingMatrix

In [19]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    vocabSize = len(tokenizer.word_index) + 1
    encodedData = tokenizer.texts_to_sequences(x)
    maxLength = EMBEDDINGS_DIMESION
    paddedData = pad_sequences(encodedData, maxlen=maxLength, padding='post')
    embeddings = createEmbedingMatrix(tokenizer, vocabSize)
    
    return paddedData

In [20]:
def createTrainData(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    vocabSize = len(tokenizer.word_index) + 1
    encodedData = tokenizer.texts_to_sequences(x)
    maxLength = EMBEDDINGS_DIMESION
    paddedData = pad_sequences(encodedData, maxlen=maxLength, padding='post')
    embeddings = createEmbedingMatrix(tokenizer, vocabSize)
    
    return paddedData, embeddings, maxLength, vocabSize

In [261]:
x_train_tokenize, x_train_embeddings, x_train_max_features, x_train_vocab_size  = createTrainData(x_train)
x_test_tokenize = tokenize(x_test)


torch_train_embeddings = torch.from_numpy(x_train_embeddings)
torch_train_tokenize = torch.from_numpy(x_train_tokenize)
torch_train_tokenize = Variable(torch_train_tokenize.type(torch.LongTensor))
torch_train_tokenize
# torch_train_embeddings
# torch_train_tokenize

Variable containing:
   54     5    22  ...      0     0     0
   41   126     7  ...      0     0     0
   41   126     7  ...      0     0     0
       ...          ⋱          ...       
   41   126     7  ...      0     0     0
    6    39    12  ...      0     0     0
    6    39    12  ...      0     0     0
[torch.LongTensor of size 143x300]

### Creating the model

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f0a0e999d30>

In [132]:
# embed = torch.ones([300, 1, 500])

In [133]:
# embed.shape

In [134]:
# m = torch.ones([300, 1, 5])

In [135]:
# d = [1, 2, 3, 4, 5]

In [136]:
# "A B C D E" -> 12 13 45 23 12 -> [1.2, 2.3] [1.3, 4.3] [0.2, 2.3] [1.2, 2.3] [1.2, 2.3] 
#                               -> [d, d, d, d, d]

In [137]:
# torch.cat((embed, m), dim=2)

In [138]:
# m.shape

In [139]:
# len(torch.chunk(embed, chunks=300))

In [140]:
# d = torch.chunk(embed, chunks=300)

In [141]:
# for v in d:
#     torch.cat((embed, m),)

In [142]:
# torch.cat((embed, m), dim=2)

In [168]:
def create_weight_matrix(dict_embeddings):
    matrix_len = len(dict_embeddings)
    weights_matrix_emb = np.zeros((matrix_len, EMBEDDINGS_DIMESION))
    
    for i, word in enumerate(dict_embeddings):
        weights_matrix_emb[i] = dict_embeddings[word]
        
    return  weights_matrix_emb

In [211]:
def create_emb_layer(emb_matrix, non_trainable=False):
    num_embeddings, embedding_dim = emb_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight = nn.Parameter(emb_matrix)
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [265]:
class LSTMClassifier(nn.Module):

    def __init__(self, weights_matrix, hidden_dim, vocab_size, batch_size, tagset_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim

#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 1)

        # The linear layer that maps from hidden state space to tag space
        # + 5 in the hidden_dim after the LSTM
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden(batch_size)

    def init_hidden(self, batch_size):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

    # Recibe el batch
    # [[1,2,3,4,5,6]] [[0.4,14,5123,123,21]]
    # [[]]
    def forward(self, text):
        embeds = self.word_embeddings(text) # batch de textos
        # Append de las metricas al batch ( con cat )
        # torch.cat((embeds, metrics), 2)
        # (seq_len, batch_size, embed_size)
        print('Embeds', embeds);
        lstm_out, self.hidden = self.lstm(embeds.view(300, 32, -1), self.hidden)
        
        
        tag_space = self.hidden2tag(lstm_out.view(len(text), -1))
        
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

### Training the model

In [213]:
#batch_size - num_hidden_neurons - epochs - embedding_size - precision
testParams2 = [
  [32, 64, 10],
  [32, 128, 10],
  [32, 64, 20],
  [32, 128, 20],
  [32, 256, 20],
  [32, 64, 30],
  [32, 128, 30],
  [32, 64, 40],
  [32, 128, 40],
  [64, 64, 40],
  [64, 128, 40],
  [64, 256, 40],
  [32, 64, 50],
]

#### Transform dictionary into matrix

In [216]:
final_weigth_matrix = torch.from_numpy(create_weight_matrix(embeddingsIndex))

In [245]:
final_weigth_matrix = final_weigth_matrix.type(torch.FloatTensor)
final_weigth_matrix


 300.0000  300.0000  300.0000  ...   300.0000  300.0000  300.0000
  -0.0296    0.0113    0.0199  ...    -0.1281   -0.0049    0.0626
  -0.0130   -0.0008    0.0326  ...    -0.1325    0.0294    0.0634
             ...                ⋱                ...             
   0.0652   -0.0578   -0.0102  ...    -0.0061   -0.0227    0.0727
   0.0889   -0.1178   -0.0349  ...    -0.0530   -0.0172   -0.0181
   0.0272   -0.0494   -0.0813  ...    -0.0485    0.0100    0.0424
[torch.FloatTensor of size 1000654x300]

In [185]:
def split_train_batchs(arr_tokens, arr_targets, batch_size):
    max_length = ceil(len(arr_tokens)/batch_size)
    torch_train_batchs = list()
    y_train_batchs = list()

    for i in range(max_length):
        torch_train_batch = arr_tokens[i*batch_size:(i+1)*batch_size]
        y_train_batch = arr_targets[i*batch_size:(i+1)*batch_size]
        torch_train_batchs.append(torch_train_batch)
        y_train_batchs.append(y_train_batch)

    data_train_batch = list(zip(torch_train_batchs, y_train_batchs))

    return data_train_batch

In [263]:
def testModel(hidden_dim, epochs, texts_batch_size, train_batchs):
    model = LSTMClassifier(final_weigth_matrix, hidden_dim, x_train_vocab_size, texts_batch_size, 5)
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.1)
    
    for epoch in range(epochs):
        for batch_text, targets in train_texts_batchs:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            model.hidden = model.init_hidden(texts_batch_size)

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
#             sentence_in = prepare_sequence(sentence, word_to_ix)
#             targets = prepare_sequence(tags, tag_to_ix)

            # Step 3. Run our forward pass.
            tag_scores = model(batch_text)
            print('Tag scores: /n', tag_scores)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            print('Loss results/n', loss)
            loss.backward()
            optimizer.step()

In [266]:
for param_batch_size, num_hidden_neurons, epochs in testParams2:
    train_texts_batchs = split_train_batchs(torch_train_tokenize, y_train, param_batch_size) # tokens -> 300 x batch_size x num_text/batch_size
    print('Hidden layer:', num_hidden_neurons)
    print('Epochs:', epochs)
    print('Batch size:', param_batch_size)
    testModel(num_hidden_neurons, epochs, param_batch_size, train_texts_batchs)


Hidden layer: 64
Epochs: 10
Batch size: 32
Embeds Variable containing:
( 0 ,.,.) = 
 -1.1183e-02 -5.3800e-04  9.8043e-02  ...  -8.6297e-02  4.3756e-02  2.1946e-02
 -1.9094e-02 -5.4838e-02  6.1867e-02  ...  -1.3209e-01 -2.4071e-02  6.1029e-02
 -4.8046e-02 -7.0493e-02  9.8030e-03  ...  -3.7049e-02 -8.6930e-03  7.0250e-03
                 ...                   ⋱                   ...                
  3.0000e+02  3.0000e+02  3.0000e+02  ...   3.0000e+02  3.0000e+02  3.0000e+02
  3.0000e+02  3.0000e+02  3.0000e+02  ...   3.0000e+02  3.0000e+02  3.0000e+02
  3.0000e+02  3.0000e+02  3.0000e+02  ...   3.0000e+02  3.0000e+02  3.0000e+02

( 1 ,.,.) = 
  1.6822e-02 -8.6496e-02  5.3342e-02  ...  -1.6591e-02 -6.9230e-03 -4.8323e-02
 -5.5095e-02  6.0450e-03  8.7281e-02  ...   2.1905e-02 -8.0660e-03  4.6292e-02
 -2.6890e-03 -6.3456e-02  9.6190e-03  ...  -3.9190e-02  2.6918e-02  7.5460e-03
                 ...                   ⋱                   ...                
  3.0000e+02  3.0000e+02  3.0000e

RuntimeError: Variable data has to be a tensor, but got Variable

### Calculate the results of the model