In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import numpy as np
import collections
import pandas as pd
import string
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

The first step is to get all the functions into the module. Apparently, we need to put the stuff together. The first function is the buildModel function, which will build the LSTM model.

In [3]:
def buildModel(X_train,y_train,X_test,y_test,batch_size):
    print('Build model...')
    model = Sequential()
    #model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
    model.add(Dense(1))
    model.add(Activation('softmax'))

    model.compile(loss='categorical-crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10,
              validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test,
                                batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

Then, we need some basic function including load_review_dataset, create_dictionary, transform_text to get the dataset ready.

In [4]:
def load_review_dataset(path):
    with open(path,'r',encoding='utf8') as f:
        content = f.readlines()
    res = []
    for line in content:
        a = line.strip('.').lower().split()
        b = []
        exclude = set(string.punctuation)
        for word in a:
            word = ''.join(ch for ch in word if ch not in exclude)
            b.append(word)
        res.append(b)
    # res = [line.strip('.').lower().split() for line in content]
    return res

def create_dictionary(messages):
    tempDict = collections.defaultdict(int)
    for message in messages:
        setWords = set(message)
        for word in setWords:
            tempDict[word] += 1
    resDict = collections.defaultdict(int)
    i = 0
    for key,val in tempDict.items():
        if(val>=6):
            resDict[key] = i
            i +=1
    return resDict

def transform_text(messages, word_dictionary):
    numRows = len(messages)
    print(numRows)
    numCols = max(word_dictionary.values())+1
    print(numCols)
    resArray = np.zeros((numRows,numCols))
    for i in range(len(messages)):
        message = messages[i]
        for word in message:
            if(word in word_dictionary):
                col = word_dictionary[word]
                resArray[i][col] +=1
    return resArray

Now we need to load the dataset including the train_review, train_rate,test_review, test_rate using the function loadTrainTestDataSet().

In [5]:
def loadTrainTestDataSet():
    train_review_path = './data/train.review.txt'
    train_rate_path = './data/train.rating.txt'
    dev_review_path = './data/dev.review.txt'
    dev_rate_path = './data/dev.rating.txt'
    test_review_path = './data/test.review.txt'
    test_rate_path = './data/test.rating.txt'
    review_path = './data/review_full.txt'
    rate_path = './data/rate_full.txt'
    messages = load_review_dataset(review_path)
    word_dict = create_dictionary(messages)

    train_messages = load_review_dataset(train_review_path)
    train_resArray = transform_text(train_messages, word_dict)
    dev_messages = load_review_dataset(dev_review_path)
    dev_resArray = transform_text(dev_messages, word_dict)
    test_messages = load_review_dataset(test_review_path)
    test_resArray = transform_text(test_messages, word_dict)

    # resArray = transform_text(messages, word_dict)
    # print('res array shape ',resArray.shape)
    # glove = getAverageGlove(messages)
    # print('glove', glove.shape)
    # To construct the training dataset
    # numOfDataPoints = len(resArray)
    # numOfTraining = int(numOfDataPoints * 0.8)
    trainingDataX = train_resArray[:, ]
    testDataX = test_resArray[:, ]
    trainRateY = pd.read_csv(train_rate_path, header=None)
    testRateY = pd.read_csv(test_rate_path, header=None)
    # print(len(traRateY))
    trainingRateY = trainRateY[:][0]
    testRateY = testRateY[:][0]
    return (trainingDataX,trainingRateY,testDataX,testRateY)

Assemble the loading dataset and building model together. 

In [6]:
def main():
    X_train,y_train,X_test,y_test = loadTrainTestDataSet()
    batch_size = 32
    buildModel(X_train, y_train, X_test, y_test, batch_size)

In [7]:
main()

21000
4223
6814
4223
6813
4223
Build model...


  """


ValueError: The first layer in a Sequential model must get an `input_shape` or `batch_input_shape` argument.