# Import the libraries

In [None]:
import re
import ftfy
import nltk
import itertools
import numpy as np
import pandas as pd
from math import exp
import pickle as pkl
from numpy import sign
from pathlib import Path
from nltk import PorterStemmer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.models import model_from_json, Model, Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import  classification_report, confusion_matrix, accuracy_score
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D

# Read data from folder

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(1234) 

In [None]:
# Load the review dataset; later cells read its 'Class' and 'Reviews' columns.
messages = pd.read_csv('input/cleaned_data1.csv')

In [None]:
# Preview the first rows.
messages.head()

In [None]:
# Descriptive statistics grouped by the 'Class' column.
messages.groupby('Class').describe()

In [None]:
# Hyperparameters used further down:
#   max_length    - `maxlen` passed to pad_sequences
#   nb_max_words  - `num_words` of the Tokenizer and first argument of Embedding
#   embedding_dim - second argument of the Embedding layers
max_length = 140
nb_max_words = 10000
embedding_dim = 300

In [None]:
# Pull out the review texts per label: Class 1 -> positive, Class 0 -> negative.
positive = messages.loc[messages['Class'] == 1, 'Reviews']
negative = messages.loc[messages['Class'] == 0, 'Reviews']

In [None]:
# Wrap each selection in a DataFrame and write it to its own CSV file.
# No header row is written; the index is kept as the first column.
positive = pd.DataFrame(positive)
positive.to_csv(r'input/positive.csv', header=None)

negative = pd.DataFrame(negative)
negative.to_csv(r'input/negative.csv', header=None)

In [None]:
# Preview the positive reviews.
positive.head()

In [None]:
# Paths of the per-class CSV files written above by `to_csv`.
# They must include the '.csv' extension, otherwise `pd.read_csv` looks for
# files that were never created.
pos_review_file_path = 'input/positive.csv'
neg_review_file_path = 'input/negative.csv'

In [None]:
# Read the per-class files back. They have no header row, so columns are
# numbered; column 0 is used as the index.
df_pos_review = pd.read_csv(pos_review_file_path, index_col=0,header = None)
df_neg_review = pd.read_csv(neg_review_file_path, index_col=0, header = None)

In [None]:
# Give the remaining column (numbered 1) a readable name.
df_pos_review = df_pos_review.rename(columns={1: 'Review'})
df_neg_review = df_neg_review.rename(columns={1: 'Review'})

In [None]:
# Preview the positive reviews.
df_pos_review.head()

In [None]:
# Preview the negative reviews.
df_neg_review.head()

# Data PreProcessing

In [None]:
# Load the dictionary that expandContractions() uses for its replacements.
# The `with` block guarantees the file handle is closed after loading.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
with open('input/cword_dict.pkl', 'rb') as fh:
    cList = pkl.load(fh)

In [None]:
# Show the loaded dictionary.
print(cList)

In [None]:
# One alternation over all dictionary keys.
# - re.escape: keys are looked up literally via cList[match.group()], so they
#   must also be matched literally rather than interpreted as regex syntax.
# - longest key first: regex alternation takes the first alternative that
#   matches, so a short key must not pre-empt a longer key that starts with it.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(cList, key=len, reverse=True)
))

In [None]:
# Display the compiled pattern.
c_re

In [None]:
def expandContractions(text, c_re=c_re):
    """Return *text* with every match of *c_re* replaced by its cList entry.

    text: string to process.
    c_re: compiled pattern whose matches are keys of the module-level cList
          (defaults to the pattern compiled when this function was defined).
    """
    return c_re.sub(lambda hit: cList[hit.group(0)], text)

In [None]:
def clean_review(reviews):
    """Clean raw review texts and return them as a list of strings.

    Each review is converted to ``str``. Reviews that start with a URL or
    are 10 characters or shorter are dropped. For the rest, in order:

    1. remove @mentions, #hashtags, ``<Emoji:...>`` tags and
       pic.twitter.com links;
    2. repair broken Unicode with ``ftfy``;
    3. expand contractions via ``expandContractions``;
    4. replace every character that is not alphanumeric, space or tab
       with a space and collapse whitespace;
    5. tokenize, drop English stop words and Porter-stem each token.

    reviews: iterable of review texts (any objects convertible with str()).
    Returns: list of cleaned strings, in input order, skipped reviews omitted.
    """
    # Loop invariants: build the stop-word set and the stemmer once. A set
    # gives constant-time membership tests instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned_review = []
    for review in reviews:
        review = str(review)
        # Skip reviews that begin with a URL and reviews that are too short.
        if re.match(r"(\w+:\/\/\S+)", review) is not None or len(review) <= 10:
            continue
        review = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", review).split())
        review = ftfy.fix_text(review)
        review = expandContractions(review)
        review = ' '.join(re.sub(r"([^0-9A-Za-z \t])", " ", review).split())
        word_tokens = nltk.word_tokenize(review)
        # Stem token by token: the stemmer works on single words, so passing
        # the whole sentence would leave the individual tokens unstemmed.
        stemmed_tokens = [stemmer.stem(w) for w in word_tokens
                          if w not in stop_words]
        cleaned_review.append(' '.join(stemmed_tokens))
    return cleaned_review

In [None]:
# Materialise the 'Review' columns as plain Python lists.
arr_pos_review = list(df_pos_review['Review'])
arr_neg_review = list(df_neg_review['Review'])

In [None]:
# Echo the raw negative reviews for a quick visual check.
arr_neg_review

In [None]:
# Run both review lists through the cleaning function.
cleaned_pos_text = clean_review(arr_pos_review)
cleaned_neg_text = clean_review(arr_neg_review)

In [None]:
# Fit one tokenizer on both classes so they share a single vocabulary.
tokenizer = Tokenizer(num_words=nb_max_words)
tokenizer.fit_on_texts(cleaned_pos_text + cleaned_neg_text)

In [None]:
# Convert each cleaned review into a sequence of integer word indices.
sequences_pos = tokenizer.texts_to_sequences(cleaned_pos_text)
sequences_neg = tokenizer.texts_to_sequences(cleaned_neg_text)

In [None]:
# Inspect the first positive sequence.
sequences_pos[0]

In [None]:
# Word -> index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
print('Found %s unique tokens' % len(word_index))

In [None]:
# Bring every sequence to max_length entries.
# data_d holds the positive reviews, data_r the negative ones.
data_d = pad_sequences(sequences_pos, maxlen=max_length)
data_r = pad_sequences(sequences_neg, maxlen=max_length)
print('Shape of data_d tensor:', data_d.shape)
print('Shape of data_r tensor:', data_r.shape)

In [None]:
# Inspect the first padded positive sequence.
data_d[0]

In [None]:
print(data_d.shape, data_r.shape)

In [None]:
type(data_d)

In [None]:
# Positives first, then negatives; the label arrays below rely on this order.
data = np.concatenate((data_d, data_r))

In [None]:
# One label per row: 1.0 for each positive review, 0.0 for each negative one.
labels_d = np.ones(data_d.shape[0])
labels_r = np.zeros(data_r.shape[0])
print(labels_d.shape, labels_r.shape)

In [None]:
# Same positives-then-negatives order as `data`.
labels = np.concatenate((labels_d, labels_r))

In [None]:
print(data.shape, labels.shape)

# Build model

In [None]:
# Hold out 20% of the samples as the test set; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size = 0.2, random_state=42)

In [None]:
# Sanity-check the shapes of the four arrays.
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

In [None]:
# CNN + LSTM classifier: an embedding layer, three Conv1D/MaxPooling1D stages
# (32, 64, 128 filters, kernel size 5) with dropout in between, an LSTM with
# 100 units, and a single sigmoid output for the binary label.
model = Sequential()
# input_length follows max_length so the model always matches the length
# that pad_sequences produced, instead of repeating the literal 140.
model.add(Embedding(nb_max_words, embedding_dim, input_length=max_length))
model.add(Dropout(0.2))
model.add(Conv1D(32, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Train Model

In [None]:
# Adam optimizer with an explicit learning rate and decay.
adam = Adam(lr=0.001,
    decay=1e-06
)

In [None]:
# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)

In [None]:
model.summary()

In [None]:
# Train for 2 epochs. The held-out test split doubles as validation data,
# so the validation curves plotted below are computed on the test set.
hist = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=2,
    batch_size=100,
    shuffle=True
)

In [None]:
# Training vs. validation accuracy per epoch.
# Older Keras releases record the metric as 'acc', newer ones as 'accuracy';
# use whichever key the history actually contains.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'
plt.plot(hist.history[acc_key])
plt.plot(hist.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(hist.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Test and Evaluate

In [None]:
# Predict on the test set, flatten to 1-D and round to hard 0/1 labels.
Y_pred = np.round(model.predict(X_test).flatten())

In [None]:
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
# Format the unrounded percentage: rounding to an integer first would make
# the two decimals printed by '%.2f' always read '.00'.
print("Accuracy: %.2f%%" % (accuracy * 100))

In [None]:
# Classification report for the test-set predictions.
print(classification_report(Y_test, Y_pred))

In [None]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix'):
    """Draw *cm* as an annotated heat map and show the figure.

    cm:      2-D array of integer counts; the first index is plotted on the
             y axis ('Actual Label'), the second on the x axis
             ('Predicted Label').
    classes: tick labels, used for both axes.
    title:   figure title.
    """
    plt.figure(figsize=(7, 7))
    plt.imshow(cm, interpolation='nearest', cmap='binary')
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=30)
    plt.yticks(tick_positions, classes, rotation=30)

    # Counts above half of the largest count are written in white, the
    # others in black.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix of the test-set predictions against the true labels.
cm = confusion_matrix(y_pred=Y_pred, y_true=Y_test)

In [None]:
# confusion_matrix orders its rows and columns by ascending label value, so
# index 0 is class 0 (negative reviews) and index 1 is class 1 (positive).
cm_plot_labels = ['Negative_Sentiment', 'Positive_Sentiment']

In [None]:
# Render the confusion matrix with the class names as tick labels.
plot_confusion_matrix(cm,cm_plot_labels)

In [None]:
# Persist the architecture as JSON ...
model_structure = model.to_json()
f = Path("model/model_structure.json")
f.write_text(model_structure)

In [None]:
# ... and the trained weights as a separate HDF5 file.
model.save_weights("model/model_weights.h5")

# Bidirectional LSTM

In [None]:
from keras.layers import Bidirectional

In [2]:
# CNN + bidirectional LSTM classifier: an embedding layer, one
# Conv1D/MaxPooling1D stage, a bidirectional LSTM with 100 units, and a
# single sigmoid output for the binary label.
bi_model = Sequential()
# input_length follows max_length so the model always matches the length
# that pad_sequences produced, instead of repeating the literal 140.
bi_model.add(Embedding(nb_max_words, embedding_dim, input_length=max_length))
bi_model.add(Dropout(0.2))
bi_model.add(Conv1D(32, 5, activation='relu'))
bi_model.add(MaxPooling1D(pool_size=2))
bi_model.add(Dropout(0.5))
bi_model.add(Bidirectional(LSTM(100)))
bi_model.add(Dense(1, activation='sigmoid'))

NameError: name 'Sequential' is not defined

# Train Model

In [None]:
# New Adam optimizer instance for the bidirectional model, with the same
# learning rate and decay as before.
adam = Adam(lr=0.001,
    decay=1e-06
)

In [None]:
# Binary cross-entropy matches the single sigmoid output of the model.
bi_model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)

In [None]:
bi_model.summary()

In [None]:
# Train for 4 epochs. This rebinds `hist`, so the plots below show the
# bidirectional model. The test split is again used as validation data.
hist = bi_model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=4,
    batch_size=100,
    shuffle=True
)

In [None]:
# Training vs. validation accuracy per epoch.
# Older Keras releases record the metric as 'acc', newer ones as 'accuracy';
# use whichever key the history actually contains.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'
plt.plot(hist.history[acc_key])
plt.plot(hist.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(hist.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Test and Evaluate

In [None]:
# Predict on the test set, flatten to 1-D and round to hard 0/1 labels.
Y_pred = np.round(bi_model.predict(X_test).flatten())

In [None]:
# Fraction of test samples the bidirectional model classified correctly.
biaccuracy = accuracy_score(Y_test, Y_pred)
# Format the unrounded percentage: rounding to an integer first would make
# the two decimals printed by '%.2f' always read '.00'.
print("Accuracy: %.2f%%" % (biaccuracy * 100))

In [None]:
# Classification report for the bidirectional model's test-set predictions.
print(classification_report(Y_test, Y_pred))

In [None]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix'):
    """Show *cm* as a heat map with the count written into every cell.

    This cell redefines the function of the same name from earlier in the
    notebook.

    cm:      2-D array of integer counts; the first index is plotted on the
             y axis ('Actual Label'), the second on the x axis
             ('Predicted Label').
    classes: tick labels, used for both axes.
    title:   figure title.
    """
    plt.figure(figsize=(7, 7))
    plt.imshow(cm, interpolation='nearest', cmap='binary')
    plt.title(title)
    plt.colorbar()

    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=30)
    plt.yticks(ticks, classes, rotation=30)

    # White text for counts above half of the largest count, black otherwise.
    cutoff = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for r in range(n_rows):
        for c in range(n_cols):
            if cm[r, c] > cutoff:
                ink = "white"
            else:
                ink = "black"
            plt.text(c, r, format(cm[r, c], 'd'),
                     horizontalalignment="center",
                     color=ink)

    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix of the bidirectional model's test-set predictions.
cm = confusion_matrix(y_pred=Y_pred, y_true=Y_test)

In [None]:
# confusion_matrix orders its rows and columns by ascending label value, so
# index 0 is class 0 (negative reviews) and index 1 is class 1 (positive).
cm_plot_labels = ['Negative_Sentiment', 'Positive_Sentiment']


In [None]:

# Render the confusion matrix with the class names as tick labels.
plot_confusion_matrix(cm,cm_plot_labels)


In [None]:
# Serialise the architecture of the bidirectional model. It must come from
# `bi_model`; using `model` here would store the first network's structure
# under the bi_model file name.
model_structure = bi_model.to_json()
f = Path("model/bi_model_structure.json")
f.write_text(model_structure)

In [None]:
# Save the trained weights of the bidirectional model. They must come from
# `bi_model`; using `model` here would store the first network's weights
# under the bi_model file name.
bi_model.save_weights("model/bi_model_weights.h5")

In [None]:
# Accuracies as whole-number percentages:
# b = bidirectional model, l = first (CNN + LSTM) model.
b = round(biaccuracy*100)
l = round(accuracy*100)

In [None]:
# Bar labels and heights, in matching order.
li_x = ['LSTM', 'BiLSTM']
li_y = [l, b]

In [None]:
# Bar chart comparing the two models' accuracy.
import seaborn as sns
print(li_y)
sns.barplot(x=li_x, y=li_y)

# Thanks