### Import custom modules from current folder

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import nltk
from sklearn.model_selection import train_test_split
from simple_text_representation.classes import Text
from simple_text_representation.models import Database
import pandas as pd
import numpy as np
# from nltk.draw.tree import draw_trees

In [3]:
database = Database('educationalTexts', 'postgres', '', '0.0.0.0', 5432)
path = r'http://localhost/'

In [4]:
def transformToString(text):
    textStr = ''

    for paragraph in text:
        for line in paragraph:
            textStr = textStr + line
    return textStr

### Load the data

In [5]:
dfSeventh = pd.read_csv('../Data/textsSeventhgrade.csv')
dfEighth = pd.read_csv('../Data/textsEighthgrade.csv')
dfNinth = pd.read_csv('../Data/textsNinthgrade.csv')
dfTenth = pd.read_csv('../Data/textsTenthGrade.csv')
dfEleventh = pd.read_csv('../Data/textsEleventhgrade.csv')

### Transform the data to known structures

In [6]:
def transform_csv_to_structure(df):
    text_ids = df['id'].unique()
    texts = list()
    for text_id in text_ids:
        sentence_grouped = df.where(dfSeventh['id'] == 1)
        clean_df = df[pd.notnull(sentence_grouped['id'])]
        paragraphs = list()
        df_paragraphs = clean_df.groupby('paragraph_id').apply(lambda x: "%s" % '|'.join(x['value'])).values
        for df_paragraph in df_paragraphs:
            paragraphs.append(df_paragraph.split('|'))
        texts.append(paragraphs)

    return texts

In [7]:
textOfSeventhGrade = transform_csv_to_structure(dfSeventh)
textOfEightGrade = transform_csv_to_structure(dfEighth)
textOfNineGrade = transform_csv_to_structure(dfNinth)
textOfTenthGrade = transform_csv_to_structure(dfTenth)
textOfEleventhGrade = transform_csv_to_structure(dfEleventh)

textsFormatedSG = [transformToString(textArr) for textArr in textOfSeventhGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEightGrade]
textsFormatedNG = [transformToString(textArr) for textArr in textOfNineGrade]
textsFormatedTG = [transformToString(textArr) for textArr in textOfTenthGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEleventhGrade]

### Format train and test data

In [8]:
data = np.concatenate((np.array(textsFormatedSG),
                       np.array(textsFormatedEG),
                       np.array(textsFormatedNG),
                       np.array(textsFormatedTG),
                       np.array(textsFormatedEG)  )) 
labels = np.concatenate((np.full(len(textsFormatedSG), 0),
                         np.full(len(textsFormatedEG), 1),
                         np.full(len(textsFormatedNG), 2),
                         np.full(len(textsFormatedTG), 3),
                         np.full(len(textsFormatedEG), 4)))

In [9]:
len(data)

179

In [10]:
len(labels)

179

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [12]:
len(y_train)

143

In [13]:
len(x_train)

143

### Load the trained embeddings

In [14]:
EMBEDDINGS_DIMESION = 300

In [15]:
embeddingsIndex = dict()
f = open('../SBW-vectors-300-min5.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddingsIndex[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddingsIndex))

Loaded 1000654 word vectors.


### Preprocesing the data

### Creating the model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10753b270>

In [2]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 1)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, text):
        embeds = self.word_embeddings(text)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(text), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(text), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores