In [1]:
import gzip
import numpy as np
import pandas as pd
import scipy.optimize
import random
from collections import defaultdict
import nltk
from sklearn import svm
import string
from sklearn import linear_model
import pickle

In [2]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)
data = [d for d in parseData("train.json")]

In [3]:
reviews = [d['reviewText'] for d in data]
y = [d['categoryID'] for d in data]

In [5]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)

In [6]:
wordCount = defaultdict(int)
idf = defaultdict(int)
punctuation = set(string.punctuation)
avglen = 0
for d in data:
    r = ''.join([c for c in d['reviewText'].lower() if not c in punctuation])
    avglen += len(r.split())
    seen = []
    for w in r.split():
        wordCount[w] += 1
        if w not in seen:
            idf[w] += 1
            seen.append(w)
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
avglen /= len(data)

In [75]:
words = [x[1] for x in counts[:5000]]
wordId = dict(zip(words, range(1,len(words) + 1)))
wordSet = set(words)

In [18]:
X = []
for d in data:
    r = ''.join([c for c in d['reviewText'].lower() if not c in punctuation])
    split = r.split()
    vec = []
    for w in split:
        if w in wordId:
            vec.append(wordId[w])
    X.append(vec)

In [27]:
# yvec = []
# for d in y:
#     z = np.zeros(5)
#     z[d] = 1
#     yvec.append(z)
from keras.utils import np_utils
yvec = np_utils.to_categorical(y)

In [69]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
top_words = 5000
max_review_length = 3512
X_train = X[:180000]
y_train = yvec[:180000]
y_test = yvec[180000:]
X_test = X[180000:]
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 3512, 32)          160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3512, 32)          3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1756, 32)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 505       
Total params: 216,809
Trainable params: 216,809
Non-trainable params: 0
_________________________________________________________________
None
Train on 100000 samples, validate on 100000 samples
Epoch 1/3

KeyboardInterrupt: 

In [15]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [28]:
yvec[1]

array([ 1.,  0.,  0.,  0.,  0.], dtype=float32)

In [30]:
from keras.models import load_model

model.save('my_modeltest2.h5')  # creates a HDF5 file 'my_model.h5'

In [31]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

KeyboardInterrupt: 

In [58]:
def readGz(f):
    for l in gzip.open(f):
        yield eval(l)
        
test = []
predictions = open("lstm_prediction_category1.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    test.append(l)
predictions.close()

In [54]:
l = data[1000]
r = ''.join([c for c in l['reviewText'].lower() if not c in punctuation])
split = r.split()
vec = []
vec1 = []
for w in split:
    if w in wordId:
        vec.append(wordId[w])
vec1.append(vec)
vec1 = sequence.pad_sequences(vec1, maxlen=max_review_length)
output = model.predict(vec1)
index = 0
cat = 0
m = output[0][0]
for i in output[0]:
    if i > m:
        i = m
        cat = index
    index += 1
print(cat)

1


In [43]:
    r = ''.join([c for c in l['reviewText'].lower() if not c in punctuation])
    split = r.split()
    vec = []
    vec1 = []
    for w in split:
        if w in wordId:
            vec.append(wordId[w])
    vec1.append(vec)
    vec1 = sequence.pad_sequences(vec1, maxlen=max_review_length)
    output = model.predict(vec1)
    index = 0
    cat = 0
    m = output[0][0]
    for i in output[0]:
        if i > m:
            i = m
            cat = index
        index += 1
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

0

In [59]:
test[1]

{'helpful': {'nHelpful': 0, 'outOf': 0},
 'rating': 4.0,
 'reviewHash': 'R657711680',
 'reviewText': 'Cute product. Loved the fit. Fast shipping! I would recommended breaking it in before wearing. Hurts a bit on your first few wears. I also recommend wearing sandal socks cause your toes hurt a bit in these shoes.',
 'reviewTime': '06 8, 2014',
 'reviewerID': 'U670561057',
 'summary': 'Cute but a bit uncomfortable',
 'unixReviewTime': 1402185600}

In [60]:
test_in = []
for d in test:
    r = ''.join([c for c in d['reviewText'].lower() if not c in punctuation])
    split = r.split()
    vec = []
    for w in split:
        if w in wordId:
            vec.append(wordId[w])
    test_in.append(vec)

In [62]:
test_in = sequence.pad_sequences(test_in, maxlen=max_review_length)

In [63]:
output = model.predict_classes(test_in)

In [64]:
output[1]

0

In [67]:
predictions = open("lstm_prediction_category1.txt", 'w')
predictions.write('reviewerID-reviewHash,category\n')
for i,l in zip(output,test):
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(i) + "\n")
predictions.close()

In [76]:
wordId['the']

1