In [1]:
# Standard library
import re
import string

# Third-party
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding, SimpleRNN

In [2]:
# Vocabulary cap shared by the tokenizer, the embedding matrix and the Embedding layer
vocabulary_size = 10_000

In [3]:
# Read the review CSV into a DataFrame
path = "../Clean/lemma_allresult.csv"
dataset = pd.read_csv(filepath_or_buffer=path)

# Quick sanity check: (rows, columns)
dataset.shape

(10000, 2)

In [4]:
# Text column that feeds both the vocabulary fit and the integer encoding
reviews = dataset['cleaned_review']

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(reviews)

# Integer-encode every review, then bring each sequence to a fixed length of 853
sequences = tokenizer.texts_to_sequences(reviews)
feat = pad_sequences(sequences=sequences, maxlen=853)

# Labels as a plain NumPy array
target = dataset['Label'].values

In [5]:
# 80/20 train/test split with a fixed seed so the partition is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    feat,
    target,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Number of training labels after the split
np.shape(Y_train)

(8000,)

In [None]:
# Parse the pre-trained GloVe file into a {token: vector} lookup.
# Each line is read as a token followed by its whitespace-separated vector components.
embeddings_index = {}
# The context manager guarantees the file is closed even if a malformed line
# makes the parsing raise part-way through.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Build the weight matrix for the Embedding layer: row `index` receives the GloVe
# vector of the word the tokenizer maps to `index`. Words that have no GloVe
# vector keep their all-zero row.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Skip out-of-range indices instead of breaking out of the loop, so the
    # result does not depend on word_index being iterated in ascending index order.
    if index >= vocabulary_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Conv1D + SimpleRNN binary classifier on top of a frozen, GloVe-initialised embedding
model_glove = Sequential([
    Embedding(
        input_dim=vocabulary_size,
        output_dim=100,
        input_length=853,
        weights=[embedding_matrix],
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    SimpleRNN(100),
    Dense(1, activation='sigmoid'),  # single sigmoid output unit
])
model_glove.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Display the summary of the compiled model
model_glove.summary()

In [None]:
# Train for 10 epochs; 40% of the training split is held out for validation
model_glove.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.4,
)

In [None]:
# Loss and accuracy on the 20% test split
score, acc = model_glove.evaluate(
    x=X_test,
    y=Y_test,
    batch_size=32,
    verbose=1,
)

In [None]:
# Second labelled CSV; the cells below score the trained model on it
test_path = "../Clean/lemma10000_master_result.csv"
test_dataset = pd.read_csv(filepath_or_buffer=test_path)

In [None]:
# The tokenizer is deliberately NOT re-fitted here: the word indices must stay
# the ones produced by the earlier fit, which the embedding matrix was built from.
test_reviews = test_dataset['cleaned_review']

test_sequences = tokenizer.texts_to_sequences(test_reviews)
test_feat = pad_sequences(sequences=test_sequences, maxlen=853)
test_target = test_dataset['Label'].values

In [None]:
# Model outputs (sigmoid activations) for every row of the second CSV
result = model_glove.predict(x=test_feat)

In [None]:
# The model ends in a single sigmoid unit, so `result` is an (n, 1) column of
# probabilities. Flatten it and threshold at 0.5 to obtain explicit 1-D 0/1
# labels instead of relying on implicit coercion of a 2-D boolean array.
predicted_labels = (result.ravel() > 0.5).astype(int)
cm = confusion_matrix(y_true=test_target, y_pred=predicted_labels)

cm