In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from string import punctuation
from keras.models import Sequential
from sklearn.metrics import mean_squared_error
from keras import regularizers, layers
from keras.layers import Embedding, Dense, Dropout, Flatten, BatchNormalization, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

f = open('datasetSplit.txt')
text = f.read()
f.close()
text = text.split()
idx_label = {} # key is train/val/test, value is the sentence number
idx_label['1'] = []
idx_label['2'] = []
idx_label['3'] = []
for i in text[1:]:
    try:
        x = i.split(',')
        label = x[1]
        idx = x[0]
        idx_label[label].append(idx)
    except ValueError:
        print(i)


f = open('datasetSentences.txt')
text_sentences = f.read()
f.close()
text_sentences = text_sentences.split('\n')
idx_sentence = {} # key is the number of sentence, value is the sentence text
for s in text_sentences[1:]:
    try:
        ss = s.split('\t')
        idx = ss[0]
        sentence = ss[1]
        idx_sentence[idx] = sentence
    except IndexError:
        print(s)


f = open('sentiment_labels.txt')
text = f.readlines()
f.close()
idx_sentiment = {} # key is the number of sentence, value is the sentiment
for s in text[1:]:
    try:
        ss = s.split('|')
        idx = ss[0]
        sentiment = ss[1].split('\n')
        idx_sentiment[idx] = float(sentiment[0])
    except IndexError:
        print(s)


f = open('dictionary.txt')
text = f.read()
f.close()
lines = text.split('\n')
phrase_sentiment = {}
for line in lines:
    try:
        ss = line.split('|')
        phrase = ss[0]
        idx = ss[1]
        phrase_sentiment[phrase] = idx
    except IndexError:
        print(line)


Xtrain = []
Xval = []
Xtest = []
for idx in idx_label['1']: # training set
    Xtrain.append(idx_sentence[idx])
for idx in idx_label['2']: # training set
    Xval.append(idx_sentence[idx])
for idx in idx_label['3']: # training set
    Xtest.append(idx_sentence[idx])


ytrain = {}
yval = {}
ytest = {}
notin = []
for i in Xtrain:
    if i not in phrase_sentiment:
        notin.append(i)
        continue
    senti_idx = phrase_sentiment[i]
    sentiment = idx_sentiment[senti_idx]
    ytrain[i] = sentiment
    
for i in Xval:
    if i not in phrase_sentiment:
        notin.append(i)
        continue
    senti_idx = phrase_sentiment[i]
    sentiment = idx_sentiment[senti_idx]
    yval[i] = sentiment

for i in Xtest:
    if i not in phrase_sentiment:
        notin.append(i)
        continue
    senti_idx = phrase_sentiment[i]
    sentiment = idx_sentiment[senti_idx]
    ytest[i] = sentiment


x_train = []
y_train = []
for k in ytrain:
    y = ytrain[k]
    x_train.append(k)
    y_train.append(y)

x_val = []
y_val = []
for k in yval:
    y = yval[k]
    x_val.append(k)
    y_val.append(y)

x_test = []
y_test = []
for k in ytest:
    y = ytest[k]
    x_test.append(k)
    y_test.append(y)

max_sentence_len = 0
for x in x_train:
    length = len(x)
    max_sentence_len = length if length > max_sentence_len else max_sentence_len

tokenizer = Tokenizer(oov_token='unk')
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
maxlen = max_sentence_len
# maxlen = 30

seq_train = tokenizer.texts_to_sequences(x_train)
seq_val = tokenizer.texts_to_sequences(x_val)
seq_test = tokenizer.texts_to_sequences(x_test)

training_data = pad_sequences(seq_train, padding='post', maxlen=maxlen)
val_data = pad_sequences(seq_val, padding='post', maxlen=maxlen)
test_data = pad_sequences(seq_test, padding='post', maxlen=maxlen)

model = Sequential()
model.add(layers.Embedding(vocab_size, 100, input_length=max_sentence_len))
model.add(layers.Conv1D(128,5))
model.add(layers.MaxPooling1D(3,3, padding='same'))
model.add(layers.Conv1D(64,5))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(layers.Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])
model.summary()



# the output of evaluate:
# [0.06994478800267066, 0.007670182166826462]
# training loss: 0.0094
# training acc: 0.0042
# validation loss: 0.0503
# validation acc: 0.0033








Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 261, 100)          1474300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 257, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 86, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 82, 64)            41024     
_________________________________________________________________
flatten_1 (Flatten)          (None, 5248)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 5248)              0         
___________________

In [5]:
filepath = "saved-a42-cnn.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='auto', period=1)

history = model.fit(training_data, 
                    y_train,
                    validation_data=(val_data, y_val),
                    batch_size=30, 
                    epochs=100,
                    callbacks=[checkpoint])



In [6]:
model.load_weights("saved-a42-cnn.hdf5")

In [8]:
pred = model.predict(test_data)

In [10]:
MSE_scaled = mean_squared_error(y_test, pred)

In [11]:
MSE_scaled

0.04641011154552032

In [39]:
model = Sequential()
model.add(layers.Embedding(vocab_size, 100, input_length=max_sentence_len))
model.add(Dense(256, input_dim=max_sentence_len,activation='relu',kernel_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(64,activation='relu',kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 261, 100)          1474300   
_________________________________________________________________
dense_9 (Dense)              (None, 261, 256)          25856     
_________________________________________________________________
batch_normalization_2 (Batch (None, 261, 256)          1024      
_________________________________________________________________
dropout_5 (Dropout)          (None, 261, 256)          0         
_________________________________________________________________
dense_10 (Dense)             (None, 261, 64)           16448     
_________________________________________________________________
dropout_6 (Dropout)          (None, 261, 64)           0         
_________________________________________________________________
dense_11 (Dense)             (None, 261, 1)            65        
Total para

In [29]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, input_length=maxlen)
model.add(embedding_layer)

model.add(Conv1D(128, 5, activation='relu'))
model.add(layers.MaxPooling1D(3,3, padding='same'))
model.add(layers.Conv1D(64,5))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(layers.Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 100)           1474300   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 26, 128)           64128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 5, 64)             41024     
_________________________________________________________________
flatten_2 (Flatten)          (None, 320)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 320)               0         
___________________________

In [24]:
model.predict(test_data)

array([[0.72671306],
       [0.5388591 ],
       [0.68009716],
       ...,
       [0.2591312 ],
       [0.26152107],
       [0.5186733 ]], dtype=float32)