### Import custom modules from current folder

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import nltk
from sklearn.model_selection import train_test_split
from simple_text_representation.classes import Text
from simple_text_representation.models import Database
import numpy as np
# from nltk.draw.tree import draw_trees

In [3]:
database = Database('educationalTexts', 'postgres', '', '0.0.0.0', 5432)
path = r'http://localhost/'

In [4]:
def transformToString(text):
    textStr = ''

    for paragraph in text:
        for line in paragraph:
            textStr = textStr + line
    return textStr

In [5]:
textOfSeventhGrade = Text.getTexts(database, grade=7)
textOfEightGrade = Text.getTexts(database, grade=8)
textOfNineGrade = Text.getTexts(database, grade=9)
textOfTenthGrade = Text.getTexts(database, grade=10)
textOfEleventhGrade = Text.getTexts(database, grade=11)

textsFormatedSG = [transformToString(textArr) for textArr in textOfSeventhGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEightGrade]
textsFormatedNG = [transformToString(textArr) for textArr in textOfNineGrade]
textsFormatedTG = [transformToString(textArr) for textArr in textOfTenthGrade]
textsFormatedEG = [transformToString(textArr) for textArr in textOfEleventhGrade]

### Format train and test data

In [6]:
data = np.concatenate((np.array(textsFormatedSG),
                       np.array(textsFormatedEG),
                       np.array(textsFormatedNG),
                       np.array(textsFormatedTG),
                       np.array(textsFormatedEG)  )) 
labels = np.concatenate((np.full(len(textsFormatedSG), 0),
                         np.full(len(textsFormatedEG), 1),
                         np.full(len(textsFormatedNG), 2),
                         np.full(len(textsFormatedTG), 3),
                         np.full(len(textsFormatedEG), 4)))

In [7]:
len(data)

175

In [8]:
len(labels)

175

### Preprocesing the data

In [9]:
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
def getLongestText(texts):
    longest = -1

    for text in texts:
        longest = len(text) if (len(text) > longest) else longest
    
    return longest

In [38]:
# Try to build a the tokenizer for each sentence, instead for each words in a text.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
vocabSize = len(tokenizer.word_index) + 1
encodedData = tokenizer.texts_to_sequences(data)
maxLength = 500
paddedData = pad_sequences(encodedData, maxlen=maxLength, padding='post')

12715


### Load the trained embeddings

In [12]:
embeddingsIndex = dict()
f = open('../SBW-vectors-300-min5.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddingsIndex[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddingsIndex))

Loaded 1000654 word vectors.


In [13]:
embeddingMatrix = np.zeros((vocabSize, 300))
for word, i in tokenizer.word_index.items():
    embeddingVector = embeddingsIndex.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector

In [14]:
embeddingMatrix.shape

(12715, 300)

### F1 Score for the model

In [15]:
from keras import backend as K

In [16]:
def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

In [17]:
def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [18]:
def f1(y_true, y_pred):
    currentPrecision = precision(y_true, y_pred)
    currentRecall = recall(y_true, y_pred)
    return 2*((currentPrecision*currentRecall)/(currentPrecision+currentRecall+K.epsilon()))

### Creating the model

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding, LSTM
np.random.seed(7)

In [34]:
model = Sequential()
embeddingLayer = Embedding(vocabSize, 300, weights=[embeddingMatrix], input_length=maxLength, trainable=False)
model.add(embeddingLayer)
model.add(LSTM(200))
model.add(Dropout(0.1))
model.add(Dense(5, activation='softmax'))

In [35]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 300)          3814500   
_________________________________________________________________
lstm_4 (LSTM)                (None, 200)               400800    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 1005      
Total params: 4,216,305
Trainable params: 401,805
Non-trainable params: 3,814,500
_________________________________________________________________


In [36]:
model.compile(loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['acc', precision, recall, f1])

In [37]:
model.fit(paddedData, labels,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.2,
  shuffle=True)

Train on 140 samples, validate on 35 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1302b3da0>

### Cross Validation

In [314]:
from sklearn.model_selection import KFold

### Old way

In [275]:
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

np.random.seed(0)

In [276]:
# Number of features
numberOfFeatures = 1024

# Generate features matrix and target vector
features, target = make_classification(n_samples = 174,
                                       n_features = numberOfFeatures,
                                       n_informative = 3,
                                       n_redundant = 0,
                                       n_classes = 2,
                                       weights = [.5, .5],
                                       random_state = 0)

In [277]:
# Create function returning a compiled network
def createNetwork():
    
    model = Sequential()
    embeddingLayer = Embedding(vocabSize, 300, weights=[embeddingMatrix], input_length=maxLength, trainable=False)
    model.add(embeddingLayer)
    model.add(Dropout(0.1))
    model.add(LSTM(200))
    model.add(Dropout(0.1))
    model.add(Dense(5, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
      optimizer='adam',
      metrics=['acc'])

    return model

In [284]:
# Wrap Keras model so it can be used by scikit-learn
neuralNetwork = KerasClassifier(build_fn=createNetwork, 
                                 epochs=3, 
                                 batch_size=22, 
                                 verbose=1,
                                 validation_split=0.2,
                                 shuffle=True)

In [285]:
# Evaluate neural network using three-fold cross-validation
cross_val_score(neuralNetwork, paddedData, labels, cv=3)

Train on 92 samples, validate on 24 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 93 samples, validate on 24 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 93 samples, validate on 24 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


array([0., 0., 0.])

### Test

In [25]:
txtStr = transformToString(textOfTenthGrade[0])
len(txtStr)

8319

In [26]:
tokenizerTest = Tokenizer()
tokenizerTest.fit_on_texts([txtStr])
vocabSizeTest = len(tokenizerTest.word_index) + 1
encodedDataTest = tokenizerTest.texts_to_sequences(data)
paddedDataTest = pad_sequences(encodedDataTest, maxlen=maxLength, padding='post')

In [27]:
paddedData.shape

(175, 500)

In [28]:
result = model.predict(paddedData)

In [29]:
result

array([[0.9302939 , 0.4392127 , 0.27340063, 0.22505705, 0.0183492 ],
       [0.9302997 , 0.43920457, 0.27339292, 0.22504601, 0.0183496 ],
       [0.9302939 , 0.4392127 , 0.27340063, 0.22505705, 0.0183492 ],
       [0.3600198 , 0.7308806 , 0.6234632 , 0.7994102 , 0.00160389],
       [0.9302939 , 0.43921277, 0.27340066, 0.22505707, 0.0183492 ],
       [0.9302939 , 0.4392127 , 0.27340063, 0.22505705, 0.0183492 ],
       [0.9302939 , 0.4392127 , 0.27340063, 0.22505707, 0.0183492 ],
       [0.9302939 , 0.4392127 , 0.27340066, 0.22505707, 0.01834919],
       [0.9302939 , 0.4392127 , 0.27340063, 0.22505705, 0.01834919],
       [0.9302939 , 0.43921277, 0.27340066, 0.22505707, 0.0183492 ],
       [0.9302939 , 0.4392127 , 0.27340066, 0.22505707, 0.0183492 ],
       [0.9302939 , 0.4392127 , 0.27340063, 0.22505705, 0.0183492 ],
       [0.3923057 , 0.74577045, 0.62846714, 0.79817814, 0.00154904],
       [0.93029547, 0.4392104 , 0.27339852, 0.22505398, 0.01834941],
       [0.9302939 , 0.43921277, 0.