In [23]:
import pandas as pd
import numpy as np

In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D, Conv1D, Concatenate
from keras import initializers, regularizers, constraints, optimizers, layers


from keras.preprocessing import sequence
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.initializers import Constant

from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split



In [25]:
# Parse the 'reviews_cleaned' file as CSV into a DataFrame; the cells below
# read its 'review' and 'sentiment' columns.
data = pd.read_csv(filepath_or_buffer='reviews_cleaned')

In [26]:
# Hold out 5% of the rows as the test split. Stratifying on the label column
# keeps the class proportions the same in both splits, and the fixed seed makes
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.05,
    stratify=data['sentiment'],
    random_state=42,
)

In [27]:
# Quick look at the held-out labels as a NumPy array (the notebook truncates
# the repr, so only the first and last few entries are shown).
y_test.values

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
# Vocabulary cap handed to the Tokenizer, and the width of each word vector
# used later by the embedding layer.
num_words = 43000
embedding_dims = 300

# Fit the vocabulary on the training reviews only so that nothing from the
# test split leaks into it; the '<UNK>' token stands in for words outside
# the vocabulary.
tokenizer = Tokenizer(num_words=num_words, oov_token='<UNK>')
tokenizer.fit_on_texts(X_train)

# word -> integer index mapping learned from the training text
word_index = tokenizer.word_index

# Turn every review into a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

# Every sequence is padded to the length of the longest *training* sequence.
maxlen = max(len(tokens) for tokens in list_tokenized_train)

# Padded training matrix: zeros are appended at the end of short sequences.
x_train = sequence.pad_sequences(list_tokenized_train, padding='post',
                                 truncating='post', maxlen=maxlen)

# Padded test matrix: same length as training, so longer test reviews are
# cut at the end.
x_test = sequence.pad_sequences(list_tokenized_test, padding='post',
                                truncating='post', maxlen=maxlen)

# Convert the label Series to plain NumPy arrays. np.asarray is used instead
# of `.values` so that this cell stays re-runnable: on a second run the names
# already hold ndarrays, which have no `.values` attribute.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [29]:
# Build an embedding matrix from the pre-trained GloVe vectors.
#
# NOTE: this code is currently disabled -- it is wrapped in a triple-quoted
# string, so none of it executes and `embedding_matrix` is never defined.
# Because the string is the only expression in the cell, the notebook echoes
# its repr as the cell output.

"""
embeddings_index = dict()
f = open('glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

num_words = min(num_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dims))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
"""



'\nembeddings_index = dict()\nf = open(\'glove.6B.300d.txt\', encoding="utf8")\nfor line in f:\n    values = line.split()\n    word = values[0]\n    coefs = np.asarray(values[1:], dtype=\'float32\')\n    embeddings_index[word] = coefs\nf.close()\n\nnum_words = min(num_words, len(word_index) + 1)\nembedding_matrix = np.zeros((num_words, embedding_dims))\nfor word, i in word_index.items():\n    if i >= num_words:\n        continue\n    embedding_vector = embeddings_index.get(word)\n    if embedding_vector is not None:\n        embedding_matrix[i] = embedding_vector\n'

In [42]:
class TextCNN(Model):
    """Text classifier built from parallel 1-D convolutions over word embeddings.

    Each kernel size gets its own ``Conv1D`` (128 filters, ReLU) followed by a
    global max-pool; the pooled feature vectors are concatenated and fed to a
    single ``Dense`` output layer.

    Parameters
    ----------
    maxlen : int
        Length of every (padded) input sequence; checked in ``call``.
    max_features : int
        Vocabulary size, i.e. the ``input_dim`` of the embedding layer.
    embedding_dims : int
        Size of each word vector.
    weight : list of arrays or None
        Passed unchanged as the ``weights`` argument of ``Embedding`` (Keras
        expects a list, e.g. ``[embedding_matrix]``). ``None`` means the
        embedding starts from its random initialisation.
    kernel_sizes : sequence of int
        Convolution window widths, one convolution branch per entry.
    class_num : int
        Number of units in the output layer.
    last_activation : str
        Activation of the output layer.
    trainable_embedding : bool or None
        Whether the embedding layer is updated during training. ``None``
        (the default) freezes the embedding only when pre-trained ``weight``
        is supplied; a randomly initialised embedding is trained, since
        freezing random vectors would leave the model with meaningless
        inputs.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 weight=None,
                 kernel_sizes=(2, 3, 4, 5, 6),
                 class_num=1,
                 last_activation='sigmoid',
                 trainable_embedding=None):
        super(TextCNN, self).__init__()
        if trainable_embedding is None:
            # Pre-trained vectors stay frozen; random ones must be learned.
            trainable_embedding = weight is None
        self.maxlen = maxlen
        self.max_features = max_features
        self.weight = weight
        self.embedding_dims = embedding_dims
        # Copy so the instance never shares a (possibly mutable) caller object.
        self.kernel_sizes = list(kernel_sizes)
        self.class_num = class_num
        self.last_activation = last_activation
        self.trainable_embedding = trainable_embedding
        self.embedding = Embedding(self.max_features, self.embedding_dims,
                                   weights=self.weight,
                                   input_length=self.maxlen,
                                   trainable=self.trainable_embedding)
        # One convolution + global-max-pool branch per kernel size.
        self.convs = [Conv1D(128, kernel_size, activation='relu')
                      for kernel_size in self.kernel_sizes]
        self.max_poolings = [GlobalMaxPooling1D() for _ in self.kernel_sizes]
        # Layers are created once here rather than on every forward pass.
        self.concat = Concatenate()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass.

        ``inputs`` must be a rank-2 tensor whose second dimension equals
        ``maxlen``; otherwise ``ValueError`` is raised. Returns the output of
        the final ``Dense`` layer.
        """
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of TextCNN must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of TextCNN must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        # The embedding could be made multichannel, as in the original paper.
        embedding = self.embedding(inputs)
        pooled = [pool(conv(embedding))
                  for conv, pool in zip(self.convs, self.max_poolings)]
        # Concatenate needs at least two inputs, so a single branch is used as is.
        x = pooled[0] if len(pooled) == 1 else self.concat(pooled)
        output = self.classifier(x)
        return output

In [None]:
# Training configuration: the embedding's vocabulary size matches the
# tokenizer's word cap.
max_features = num_words
batch_size = 50
epochs = 4

# Build the network without pre-trained embedding weights and set it up for
# binary classification.
model = TextCNN(
    maxlen=maxlen,
    max_features=max_features,
    embedding_dims=embedding_dims,
    weight=None,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Train the model. The held-out split is passed as validation data, so the
# per-epoch validation metrics are computed on the same rows that the
# evaluation cells below use.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Load the parameters stored in 'model_weights.h5' into the model, replacing
# whatever weights it currently holds.
model.load_weights(filepath='model_weights.h5')

In [None]:
# Score the held-out reviews; `result` holds the raw model outputs, which the
# next cell thresholds into class labels.
print('Test...')
result = model.predict(x=x_test)

In [None]:
# Threshold the model outputs at 0.5 and flatten the (n, 1) prediction array
# into a 1-D vector of 0/1 integers so it lines up with `y_test`.
y_pred = (result > 0.5).astype(int).ravel()

# scikit-learn's metric functions take (y_true, y_pred) in that order.
# Passing them the other way round swaps precision with recall and
# transposes the confusion matrix.
print(classification_report(y_test, y_pred))
print('Confusion matrix:')
confusion_matrix(y_test, y_pred)