In [1]:
import os
import pickle
from collections import defaultdict
import string

##########################
#### Helper Functions ####
##########################
def getCleanDataPath(filename, baseDir=None):
    """Return the path of ``filename`` inside the project's "Clean Data" folder.

    Args:
        filename: Name of the file inside the "Clean Data" folder.
        baseDir: Optional project root directory. When omitted, the original
            hard-coded Windows location is used, so existing one-argument
            calls return exactly what they did before.

    Returns:
        The joined path as a string.
    """
    if baseDir is None:
        # Default kept for backward compatibility with the original notebook.
        baseDir = os.path.join("C:\\", "Users", "liqhtninq", "Documents", "Sentiment Analysis Model")
    return os.path.join(baseDir, "Clean Data", filename)

def getModelPath(filename, baseDir=None):
    """Return the path of ``filename`` inside the project's "Models" folder.

    Args:
        filename: Name of the file inside the "Models" folder.
        baseDir: Optional project root directory. When omitted, the original
            hard-coded Windows location is used, so existing one-argument
            calls return exactly what they did before.

    Returns:
        The joined path as a string.
    """
    if baseDir is None:
        # Default kept for backward compatibility with the original notebook.
        baseDir = os.path.join("C:\\", "Users", "liqhtninq", "Documents", "Sentiment Analysis Model")
    return os.path.join(baseDir, "Models", filename)

def readPkl(filename):
    """Load and return the object pickled in the file at ``filename``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files from a trusted source.
    """
    with open(filename, mode='rb') as pklFile:
        return pickle.load(pklFile)

def getWordCounts(data):
    """Count word occurrences across all texts, most frequent first.

    Each text is lower-cased, stripped of the characters in
    ``string.punctuation`` and split on whitespace before counting.

    Args:
        data: Iterable of strings.

    Returns:
        List of ``(count, word)`` tuples sorted in descending order; words
        with equal counts are ordered by the word itself, also descending.
    """
    # Build the deletion table once instead of rebuilding a punctuation set
    # for every character of every text.
    stripPunctuation = str.maketrans('', '', string.punctuation)
    wordCount = defaultdict(int)
    for d in data:
        for w in d.lower().translate(stripPunctuation).split():
            wordCount[w] += 1
    # Every (count, word) tuple is unique, so a single reverse sort gives the
    # same order as sorting ascending and then reversing.
    return sorted(((n, w) for w, n in wordCount.items()), reverse=True)

def assignIds(counts, numWords):
    """Map the words of the first ``numWords`` entries of ``counts`` to ids.

    Args:
        counts: Sequence of entries whose element at index 1 is the word
            (e.g. the ``(count, word)`` tuples from ``getWordCounts``).
        numWords: Maximum number of leading entries to use.

    Returns:
        Dict from word to its position (0-based) within those entries.

    NOTE(review): ids start at 0, which is also the default padding value of
    Keras ``pad_sequences`` -- confirm whether the first word being
    indistinguishable from padding is intended.
    """
    wordToId = {}
    for position, entry in enumerate(counts[:numWords]):
        wordToId[entry[1]] = position
    return wordToId

def parseFeaturesFromText(data, wordId):
    """Encode each text as the list of ids of its known words.

    Each text is lower-cased, stripped of the characters in
    ``string.punctuation`` and split on whitespace. Words that are not keys
    of ``wordId`` are dropped; the remaining words keep their order.

    Args:
        data: Iterable of strings.
        wordId: Mapping from word to id.

    Returns:
        List with one list of ids per input text.
    """
    # Build the deletion table once instead of rebuilding a punctuation set
    # for every character of every text.
    stripPunctuation = str.maketrans('', '', string.punctuation)
    X = []
    for d in data:
        tokens = d.lower().translate(stripPunctuation).split()
        X.append([wordId[w] for w in tokens if w in wordId])
    return X

def getMaxLength(data):
    """Return the length of the longest item in ``data`` (0 if it is empty)."""
    return max((len(item) for item in data), default=0)

def saveAsPkl(file, data):
    """Pickle ``data`` into the file at path ``file``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten; it is closed again before the function returns.
    """
    with open(file, mode='wb') as pklFile:
        pickle.dump(data, pklFile)

In [2]:
#####################
#### Define Data ####
#####################

# File names of the two pickles read from the "Clean Data" folder.
# NOTE(review): "Reiviews" is spelled this way in both names; presumably it
# matches the files on disk -- confirm before correcting the spelling.
featuresFileName = 'Amazon Electronics Reiviews Text.pkl'
targetFileName = 'Amazon Electronics Reiviews Ratings.pkl'

# Load both pickled objects; later cells slice them in parallel, so they are
# expected to be index-aligned sequences (assumption -- verify against the data).
features = readPkl(getCleanDataPath(featuresFileName))
targets = readPkl(getCleanDataPath(targetFileName))

In [12]:
##############################################
#### Parse the data into something usable ####
##############################################
# Number of leading samples to use, and how many of the most frequent words
# receive an id (all other words are dropped during encoding).
n_samples = 100000
num_words = 2000

# Keep the raw texts under their own name so that X only ever refers to the
# encoded id sequences and the cell's data lineage stays explicit.
X_text = features[:n_samples]
y = targets[:n_samples]
assert len(X_text) == len(y), "features and targets must contain the same number of samples"

counts = getWordCounts(X_text)
wordIds = assignIds(counts, num_words)
X = parseFeaturesFromText(X_text, wordIds)

In [13]:
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
########################################
#### Split data into train and test ####
########################################

# Fixed seed so the train/test partition -- and therefore the validation
# metrics reported during training -- is the same on every run.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=split_seed)
# Longest encoded review over the whole data set, so that the train and test
# matrices are padded to the same width.
max_review_length = getMaxLength(X)

X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [17]:
######################
#### Define Model ####
###################### 
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
import tensorflow as tf

### Model Parameters ###
# Output dimension of the Embedding layer (the "vecor" spelling is kept
# because buildLSTMCNNModel reads this exact name).
embedding_vecor_length = 512
# Settings of the three Conv1D / MaxPooling1D stages, one entry per stage, in order.
num_cnn_filters = [32,16,8]
cnn_kernel_size = [8,4,4]
pool_size = [4,2,2]
# Number of units of the LSTM layer that follows the convolution stages.
LSTM_length = 512
# Width and activation of the final Dense layer, and the loss passed to compile().
# NOTE(review): one sigmoid unit with binary cross-entropy presumes 0/1
# targets -- confirm the ratings pickle is already binarised.
n_categories = 1
activation_function = 'sigmoid'
loss_function = 'binary_crossentropy'
# Units of the three ReLU Dense layers placed between the LSTM and the output layer.
dense_layers=[512,256,128]
# Dropout value passed to the LSTM layer.
lstm_dropout = 0.05
# Single optimizer instance created at module level and handed to compile().
optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
### Models ### 
def buildLSTMCNNModel():
    """Build and compile the Embedding -> Conv1D/MaxPooling1D -> LSTM -> Dense model.

    The architecture is read from the module-level configuration:
    ``num_words``, ``embedding_vecor_length`` and ``max_review_length`` size
    the Embedding layer; ``num_cnn_filters``, ``cnn_kernel_size`` and
    ``pool_size`` describe one Conv1D + MaxPooling1D stage per entry;
    ``dense_layers`` gives one ReLU Dense layer per entry; ``n_categories``
    and ``activation_function`` define the output layer.

    The stage lists are iterated rather than indexed at fixed positions, so
    adding or removing entries changes the depth of the network. For the
    three-entry lists this produces the same layers as before. The three
    convolution lists are expected to have equal length (``zip`` stops at
    the shortest).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(num_words, embedding_vecor_length, input_length=max_review_length))
    for n_filters, kernel, pool in zip(num_cnn_filters, cnn_kernel_size, pool_size):
        model.add(Conv1D(filters=n_filters, kernel_size=kernel, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=pool))
    model.add(LSTM(LSTM_length, dropout=lstm_dropout))
    for units in dense_layers:
        model.add(Dense(units, activation='relu'))
    model.add(Dense(n_categories, activation=activation_function))
    model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])
    return model

# Build and train the model with operations pinned to the first GPU.
with tf.device("/gpu:0"):
    model = buildLSTMCNNModel()
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    # Keep the returned History object so the per-epoch metrics can be
    # inspected or plotted in a later cell.
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=256)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 2917, 512)         1024000   
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 2917, 32)          131104    
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 729, 32)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 729, 16)           2064      
_________________________________________________________________
max_pooling1d_24 (MaxPooling (None, 364, 16)           0         
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 364, 8)            520       
_________________________________________________________________
max_pooling1d_25 (MaxPooling (None, 182, 8)            0         
__________

In [20]:
########################
#### Save The Model ####
########################

# File name, inside the "Models" folder, under which the trained model is stored.
model_name = "LSTM_CNN_Electronics.pkl"

# NOTE(review): this pickles the Keras model object directly. Keras documents
# model.save() / load_model() as the supported way to persist a model, and
# pickling may fail or be fragile across versions -- confirm, and consider
# switching (that would change the on-disk format and any code that loads it).
saveAsPkl(getModelPath(model_name), model)