In [1]:
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import TimeDistributed
from keras.models import load_model
from keras.utils import Sequence
import keras.backend as K

import numpy as np
import pandas as pd
import math
from ast import literal_eval
import time

from TextAnalisys.wordListToVecListConverter import WordListToVecListConverter
from TextAnalisys.wordToVecConverter import WordToVecConverterOneHotEncoder
from modelTrainer import ModelTrainer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def convertToVectorizedData(frame):
    """Turn the tokenized documents in ``frame`` into per-document vector lists.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``"text"`` column whose cells are string literals of
        word lists (they are parsed with ``ast.literal_eval``) and a
        ``"label"`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and two columns: ``"vector"``
        (the result of ``WordListToVecListConverter.convert`` for each
        document) and ``"label"`` (the labels of ``frame`` in row order).
    """
    converter = WordListToVecListConverter()
    wordLists = frame["text"].apply(literal_eval).values

    # The word-level encoder is fitted on the whole corpus before it is
    # handed to the list-level converter.
    wordConverter = WordToVecConverterOneHotEncoder()
    wordConverter.fit(wordLists)
    converter.setWordToVecConverter(wordConverter)

    vectorizedDataFrame = pd.DataFrame()
    vectorizedDataFrame["vector"] = [converter.convert(wordList) for wordList in wordLists]
    # Assign labels positionally. Assigning the Series itself would align on
    # index, and the new frame's 0..n-1 index need not match ``frame.index``
    # (which would silently yield NaN or misplaced labels).
    vectorizedDataFrame["label"] = frame["label"].values
    return vectorizedDataFrame

In [3]:
def cleanDocumentRepresentationsVectors(documentRepresentations):
    """Drop, in place, every row whose ``"vector"`` entry is empty.

    Parameters
    ----------
    documentRepresentations : pandas.DataFrame
        Frame with a ``"vector"`` column holding one sequence (list or numpy
        array) per document. The frame is modified in place.

    Returns
    -------
    None
    """
    # A length test works for both lists and numpy arrays. Comparing an array
    # with ``[]`` never evaluates to True, so empty documents would survive.
    # Entries without a length (e.g. None) are left alone.
    emptyDocumentIndexes = [
        idx
        for idx, vector in documentRepresentations["vector"].items()
        if hasattr(vector, "__len__") and len(vector) == 0
    ]
    documentRepresentations.drop(emptyDocumentIndexes, inplace=True)

In [4]:
def padSequence(sequence, necessaryNumberOfWordsInDocument):
    """Bring every document to exactly ``necessaryNumberOfWordsInDocument`` words.

    Short documents are left-padded with zeros; long documents keep their
    first ``necessaryNumberOfWordsInDocument`` words (trailing words are cut).

    Parameters
    ----------
    sequence : sequence of numpy.ndarray
        One array per document, indexed by word along axis 0. The rank of the
        first document decides the output layout: 2-D documents produce a
        ``(documents, words, wordVectorLength)`` int32 array, anything else a
        ``(documents, words)`` float array.
    necessaryNumberOfWordsInDocument : int
        Target number of words per document.

    Returns
    -------
    numpy.ndarray or None
        The padded batch, or ``None`` when ``sequence`` is empty.
    """
    if len(sequence) == 0:
        return None

    if len(sequence[0].shape) == 2:
        wordVectorLength = sequence[0].shape[-1]
        res = np.zeros(shape=(len(sequence), necessaryNumberOfWordsInDocument, wordVectorLength),
                       dtype=np.int32)
    else:
        res = np.zeros(shape=(len(sequence), necessaryNumberOfWordsInDocument))

    for i, document in enumerate(sequence):
        # Slicing truncates by any amount. np.delete with a repeated -1 index
        # removes a single row only, which broke documents that were more
        # than one word too long.
        kept = document[:necessaryNumberOfWordsInDocument]
        numberOfKeptWords = kept.shape[0]
        if numberOfKeptWords > 0:
            # Right-align the words; the leading rows stay zero (pre-padding).
            res[i, necessaryNumberOfWordsInDocument - numberOfKeptWords:] = kept
    return res

In [5]:
def GeneratorInfinite(X, y, batch_size, necessaryNumberOfWordsInDocument):
    """Yield ``(X_batch, y_batch)`` pairs forever, restarting after each pass.

    ``X`` is cut into consecutive slices of ``batch_size`` documents (the last
    slice of a pass holds whatever remains). Each slice of ``X`` goes through
    ``pad_sequences`` with ``necessaryNumberOfWordsInDocument`` as the target
    length and ``truncating='post'``; the matching slice of ``y`` is yielded
    unchanged.
    """
    # Rounding up covers both the divisible and the non-divisible case.
    batchesPerPass = math.ceil(len(X) / batch_size)
    batchNumber = 0
    while True:
        start = batchNumber * batch_size
        # The final batch of a pass runs to the end of the data.
        if batchNumber < batchesPerPass - 1:
            stop = start + batch_size
        else:
            stop = len(X)
        paddedDocuments = pad_sequences(X[start:stop],
                                        necessaryNumberOfWordsInDocument,
                                        truncating='post')
        yield (paddedDocuments, y[start:stop])
        # Wrap around to the first batch once the pass is complete.
        batchNumber = (batchNumber + 1) % batchesPerPass

In [6]:
class GeneratorFinite(Sequence):
    """Indexable batch provider: item ``idx`` is the ``idx``-th padded batch.

    The number of batches is ``len(X) / batchSize`` rounded up; the last batch
    holds the remaining documents. Batches of ``X`` go through
    ``pad_sequences`` with ``truncating='post'``; batches of ``y`` are plain
    slices.
    """

    # Class-level defaults; every instance overwrites them in __init__.
    __X = None
    __y = None
    __batchSize = None
    __necessaryNumberOfWordsInDocument = None
    __len = None

    def __init__(self, X, y, batchSize, necessaryNumberOfWordsInDocument):
        self.__X = X
        self.__y = y
        self.__batchSize = batchSize
        self.__necessaryNumberOfWordsInDocument = necessaryNumberOfWordsInDocument
        # Rounding up covers both the divisible and the non-divisible case.
        self.__len = math.ceil(len(self.__X) / self.__batchSize)

    def __len__(self):
        """Return the number of batches."""
        return self.__len

    def __getitem__(self, idx):
        """Return the ``(X_batch, y_batch)`` tuple for batch number ``idx``."""
        start = idx * self.__batchSize
        # Every batch but the last is full; the last one runs to the end.
        if idx < self.__len__() - 1:
            stop = start + self.__batchSize
        else:
            stop = len(self.__X)
        paddedDocuments = pad_sequences(self.__X[start:stop],
                                        self.__necessaryNumberOfWordsInDocument,
                                        truncating='post')
        return (paddedDocuments, self.__y[start:stop])

In [7]:
def train_validate_test_split(df, train_percent=0.7, validate_percent=0.1, seed=None):
    """Randomly partition ``df`` into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    train_percent : float
        Fraction of rows for the training set (truncated to a whole row count).
    validate_percent : float
        Fraction of rows for the validation set (truncated likewise).
    seed : int or None
        Passed to ``np.random.seed``; note that this seeds NumPy's global
        generator. ``None`` gives a different shuffle on every call.

    Returns
    -------
    (train, validate, test) : tuple of pandas.DataFrame
        Disjoint row subsets; ``test`` receives every row not taken by the
        other two.
    """
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    # ``perm`` holds index labels, so label-based ``.loc`` is the correct
    # replacement for the removed ``.ix`` indexer.
    train = df.loc[perm[:train_end]]
    validate = df.loc[perm[train_end:validate_end]]
    test = df.loc[perm[validate_end:]]
    return train, validate, test

In [8]:
# DataFrame.from_csv is deprecated and absent from current pandas; read_csv
# with these two arguments reproduces its defaults (first column as index,
# dates parsed).
frame = pd.read_csv("databasePositiveNegativeSeparatedWords.csv", index_col=0, parse_dates=True)

  """Entry point for launching an IPython kernel.


In [9]:
# Fixed seed so the train/validation/test partition is the same on every run.
SPLIT_SEED = 42

data = convertToVectorizedData(frame)
# Removes rows with an empty "vector" from `data` in place.
cleanDocumentRepresentationsVectors(data)
train, validation, test = train_validate_test_split(data, seed=SPLIT_SEED)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


In [10]:
# Length every document is padded/truncated to; also the Embedding input_length.
constantNumberOfWordsPerDocument = 40
# Number of documents per batch handed out by the generators.
batchSize = 5000
# First argument of the Embedding layer (its input dimension).
# NOTE(review): presumably an upper bound on the encoded word ids — confirm
# against WordToVecConverterOneHotEncoder.
max_features = 100000

In [11]:
# Endless batch generator over the training split; given to the trainer as
# its training generator.
trainGenerator = GeneratorInfinite(train["vector"].values, 
                                   train["label"].values, 
                                   batchSize, 
                                   constantNumberOfWordsPerDocument)

In [12]:
# Single-pass view of the same training split; the trainer later receives it
# via setTrainFiniteGeneratorForTestModel.
trainFiniteGenerator = GeneratorFinite(train["vector"].values, 
                                       train["label"].values, 
                                       batchSize, 
                                       constantNumberOfWordsPerDocument)

In [13]:
# Single-pass batches over the test split.
testGenerator = GeneratorFinite(test["vector"].values, 
                                test["label"].values, 
                                batchSize, 
                                constantNumberOfWordsPerDocument)

In [14]:
# Single-pass batches over the validation split.
# NOTE(review): no later cell in this notebook uses this generator.
validationGenerator = GeneratorFinite(validation["vector"].values, 
                                validation["label"].values, 
                                batchSize, 
                                constantNumberOfWordsPerDocument)

In [15]:
# Embedding -> two stacked LSTMs -> dropout -> a single tanh output unit.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    Embedding(max_features, 128, input_length=constantNumberOfWordsPerDocument),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1),
    Activation('tanh'),
])

model.compile(optimizer='adam',
              loss='mean_absolute_error')

In [16]:
# Uncomment to continue from a previously saved model instead of the one built above.
#model = load_model("sentimentModel")

# The first argument (10) matches the "Number of steps before testing step"
# line printed by trainer.train().
# NOTE(review): "sentimentModel" is presumably the path the trainer saves to — confirm in ModelTrainer.
trainer = ModelTrainer(10, "sentimentModel")
trainer.setModel(model)
# NOTE(review): the second generator is the *test* split, and validationGenerator
# is never used. If ModelTrainer selects or saves models based on this loss,
# the later test-set evaluation is optimistic — consider passing
# validationGenerator here instead.
trainer.setGenerators(trainGenerator, testGenerator)
trainer.setTrainFiniteGeneratorForTestModel(trainFiniteGenerator)

In [17]:
# Runs until interrupted manually (the recorded output ends in KeyboardInterrupt).
# NOTE(review): presumably there is no built-in stopping criterion — confirm in ModelTrainer.
trainer.train(showModelOutputDuringTrainingSteps=True)

Number of steps before testing step: 10
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Number of steps made: 10
Current loss on train data: 0.9905693893864069
Current loss: 0.9897886681148897
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Number of steps made: 20
Current loss on train data: 0.9545933449214211
Current loss: 0.9578045168853777


KeyboardInterrupt: 

In [18]:
def custom_metric(y_true, y_pred):
    """Share of predictions, along the last axis, whose absolute error is below 1."""
    absoluteError = K.abs(y_true - y_pred)
    withinUnitDistance = K.less(absoluteError, 1)
    return K.mean(withinUnitDistance, axis=-1)

In [19]:
# Re-compile the already trained model with the same loss and optimizer name,
# this time attaching custom_metric so evaluation reports it next to the loss.
model.compile(loss='mean_absolute_error',
              metrics=[custom_metric],
              optimizer='adam')

In [20]:
# Evaluate on the test split; the result lists the loss first, then custom_metric.
model.evaluate_generator(testGenerator, use_multiprocessing=True)

[0.7990858593728677, 0.6028037350069718]

In [4]:
# NOTE(review): this cell carries execution count 4 although it sits last, so it
# was run in a different kernel session than the cells above; on a fresh
# Restart & Run All it replaces the model evaluated in the previous cell.
# NOTE(review): if the saved model was compiled with custom_metric, load_model
# presumably needs custom_objects={"custom_metric": custom_metric} — confirm.
model = load_model("sentimentModel")