In [4]:
import pandas as pd
import numpy as np
import io

In [16]:
from keras import Model, models

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input ,Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout

from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from keras.layers import Conv1D, MaxPooling1D, Embedding

import spacy
import string

In [6]:
data = pd.read_csv('../archives/reviews_cleaned.csv')

In [7]:
nlp = spacy.load('en_core_web_sm', parser=False, entity=False)

In [8]:
def cleaning(docs, nlp=nlp):
    """
    Takes a pandas series or list of texts and returns the list with the text cleaned
    """
    text = []
    for doc in nlp.pipe(docs):
        words = []
        for token in doc:
            if token.pos_ == 'PRON':
                words.append(token.text)
                continue
            if token.pos_ == 'PART':
                words.append(token.lemma_.lower())
                continue
            if token.is_alpha and token.lemma_ != '-PRON-':
                words.append(token.lemma_.lower())
        text.append(' '.join(map(str, words)))
    return text

In [9]:
def remove_punctuation(text):
    """Function that remove punctuation from a text"""
    return "".join([x for x in text if x not in string.punctuation])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned'], data['sentiment'], 
                                                    random_state = 42, test_size =0.05, stratify= data['sentiment'])

In [11]:
max_features = 50000
maxlen = 5000
embedding_dims = 300



tokenizer = Tokenizer(num_words=max_features, oov_token = '<UNK>')

tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

list_tokenized_train = tokenizer.texts_to_sequences(X_train)

maxlen = max([len(x) for x in list_tokenized_train])

list_tokenized_test = tokenizer.texts_to_sequences(X_test)


x_train = sequence.pad_sequences(list_tokenized_train, padding = 'post', 
                                 truncating = 'post', maxlen=maxlen)

x_test = sequence.pad_sequences(list_tokenized_test, padding = 'post', 
                                 truncating = 'post', maxlen=maxlen)

num_words = len(word_index) + 1


In [None]:
# Creating a embedding using glove

embeddings_index = dict()

with open('../archives/glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        

embedding_matrix = np.zeros((num_words, embedding_dims))

for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector




In [None]:
kernel_sizes=[3, 4, 5]
class_num = 1
last_activation='sigmoid' 
sequence_length = data.shape[1]

batch_size = 32
epochs = 10

convs = []
max_poolings = []


for kernel_size in kernel_sizes:
    convs.append(Conv1D(256, kernel_size, activation='relu'))
    max_poolings.append(GlobalMaxPooling1D())

inputs = Input(shape=(x_train.shape[1],), dtype='int32')

embedding = Embedding(max_features, embedding_dims, weights=[embedding_matrix], input_length=maxlen)(inputs)

convs2 = []
for i in range(len(kernel_sizes)):
    c = convs[i](embedding)
    c = max_poolings[i](c)
    convs2.append(c)
x = Concatenate()(convs2)
output = Dense(class_num, activation=last_activation)(x)


model = Model(inputs=inputs, outputs=output)

model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

In [13]:
model = models.load_model('../archives/model_textcnn_imbd.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [12]:
result = model.predict(x_test)

In [13]:
y_pred = (result > 0.5)

print(classification_report(y_pred, y_test))
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)

              precision    recall  f1-score   support

       False       0.91      0.93      0.92      1225
        True       0.93      0.91      0.92      1275

    accuracy                           0.92      2500
   macro avg       0.92      0.92      0.92      2500
weighted avg       0.92      0.92      0.92      2500

Confusion matrix:


array([[1139,   86],
       [ 111, 1164]], dtype=int64)

In [28]:
example = ["Joaquin Phoenix gives a tour de force performance, fearless and stunning in its emotional depth and physicality. \
            It's impossible to talk about this without referencing Heath Ledger's Oscar-winning performance from The Dark Knight, \
            widely considered the definitive live-action portrayal of the Joker, so let's talk about it. \
            The fact is, everyone is going to be stunned by what Phoenix accomplishes, because it's what many thought impossible \
            a portrayal that matches and potentially exceeds that of The Dark Knight's Clown Prince of Crime"]



In [29]:
example = remove_punctuation(example[0])

In [30]:
example

'Joaquin Phoenix gives a tour de force performance fearless and stunning in its emotional depth and physicality             Its impossible to talk about this without referencing Heath Ledgers Oscarwinning performance from The Dark Knight             widely considered the definitive liveaction portrayal of the Joker so lets talk about it             The fact is everyone is going to be stunned by what Phoenix accomplishes because its what many thought impossible             a portrayal that matches and potentially exceeds that of The Dark Knights Clown Prince of Crime'

In [31]:
example = cleaning([example])

In [32]:
example

['joaquin phoenix give a tour de force performance fearless and stunning in emotional depth and physicality impossible to talk about this without reference heath ledgers oscarwinne performance from the dark knight widely consider the definitive liveaction portrayal of the joker so let talk about it the fact be everyone be go to be stun by what phoenix accomplish because what many think impossible a portrayal that match and potentially exceed that of the dark knights clown prince of crime']

In [33]:
tokenized_example = tokenizer.texts_to_sequences(example)

In [34]:
example_test = sequence.pad_sequences(tokenized_example, padding = 'post', 
                                         truncating = 'post', maxlen=maxlen)

In [35]:
example_result = model.predict(example_test)

In [37]:
example_result

array([[0.9997079]], dtype=float32)

In [36]:
if example_result[0] > 0.5:
    print('The review is positive')
else:
    print('The review is negative')

The review is positive
