In [None]:
# Import the necessary libraries

In [None]:
import nltk
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords 
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# load dataset

In [None]:
# Location of the raw sentiment CSV.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
SENTIMENT_CSV = "D:/Imarticus/Jupyter notebook/Paper2/Sentiment.csv"

df = pd.read_csv(SENTIMENT_CSV)
df.head()

In [None]:
# (rows, columns) of the frame read from the CSV.
df.shape

In [None]:
# Distinct label values present in the "sentiment" column.
df["sentiment"].unique()

### 1) Total Number of Positive and Negative Sentiments

In [None]:
# Count each label with a vectorized comparison rather than iterating row by row.
sentiment_column = df["sentiment"]
p_count = int((sentiment_column == "Positive").sum())
n_count = int((sentiment_column == "Negative").sum())

print("Positive count : ", p_count)
print("Negative count : ", n_count)
print("Total positive and negative count : ", p_count + n_count)

In [None]:
# Keep every row whose label is not "Neutral".
is_not_neutral = df["sentiment"] != "Neutral"
df = df[is_not_neutral]

In [None]:
# (rows, columns) after the "Neutral" rows were filtered out.
df.shape

### 2) Build a Sequential LSTM Model

In [None]:
# Keep only the label and the tweet text.
# .copy() makes df_new an independent frame: the preprocessing cells assign to
# df_new['text'], and doing that on a bare column selection of df raises
# SettingWithCopyWarning and leaves it unclear which frame was modified.
df_new = df[["sentiment", "text"]].copy()
df_new.head()

In [None]:
# Data Preprocessing

In [None]:
def remove_tags(string):
    """Strip the retweet marker from a tweet and lower-case it.

    Args:
        string: Raw tweet text.

    Returns:
        The text with every standalone upper-case ``RT`` token removed and
        the remainder converted to lower case.
    """
    # \b anchors limit the match to the standalone "RT" token; without them
    # the letters are also cut out of ordinary words ("SMART" -> "SMA").
    result = re.sub(r'\bRT\b', '', string)
    return result.lower()
# Clean every tweet; remove_tags is passed directly, no lambda wrapper needed.
df_new['text'] = df_new['text'].apply(remove_tags)

In [None]:
# Removing stopwords
# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))
df_new['text'] = df_new['text'].apply(
    lambda tweet: ' '.join(token for token in tweet.split() if token not in stop_words)
)

In [None]:
# Lemmatizing text
# Module-level tokenizer and lemmatizer instances, read by lemmatize_text.
# NOTE(review): WordNetLemmatizer presumably requires the NLTK "wordnet" corpus
# to be installed locally — confirm on a fresh environment.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every lemma is followed by a single space, so a non-empty result ends
    with a trailing space and an input without tokens yields ``""``.
    """
    return "".join(
        lemmatizer.lemmatize(token) + " "
        for token in w_tokenizer.tokenize(text)
    )
# Replace each tweet with its lemmatized form, then show a random sample of 15 rows.
df_new['text'] = df_new['text'].apply(lemmatize_text)
df_new.sample(15)

In [None]:
# Encoding Labels
# Pull the cleaned text and the string labels out as arrays, then let
# LabelEncoder turn the labels into integer class ids.
labels = df_new["sentiment"].values
reviews = df_new["text"].values

encoder = LabelEncoder().fit(labels)
encoded_labels = encoder.transform(labels)

In [None]:
# Train Test Split
# Stratify on the encoded labels so both splits keep the class ratio, and fix
# random_state so the split (and every metric computed from it) is identical
# on each Restart & Run All.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    reviews,
    encoded_labels,
    stratify=encoded_labels,
    random_state=42,
)

In [None]:
# Tokenizing Sentences
vocab_size = 3000        # passed to Tokenizer(num_words=...) and the Embedding input size
oov_tok = '<OOV>'        # explicit placeholder for out-of-vocabulary words (was an empty string)
embedding_dim = 100      # width of each word vector in the Embedding layer
max_length = 200         # every sequence is padded/truncated to this many tokens
padding_type = 'post'    # pad at the end of short sequences
trunc_type = 'post'      # cut from the end of long sequences

# Fit the vocabulary on the training split only, so the test split is encoded
# with a vocabulary it did not contribute to.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# padding_type / trunc_type are now actually passed through; previously they
# were declared but ignored, so truncation silently used the Keras default 'pre'.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)

### Building Model

In [None]:
# Binary sentiment classifier: embedding -> bidirectional LSTM -> dense head.
model = keras.Sequential([
    # mask_zero=True marks index 0 (the value pad_sequences fills with) as
    # padding, so the LSTM skips those timesteps instead of reading the padded
    # tail of each length-200 sequence as real tokens.
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length, mask_zero=True),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: output is thresholded at 0.5 downstream.
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

In [None]:
# Train for a fixed number of epochs, holding out 10% of the training data
# for validation; the returned History object is kept in `history`.
num_epochs = 5
history = model.fit(
    x=train_padded,
    y=train_labels,
    validation_split=0.1,
    epochs=num_epochs,
    verbose=1,
)

In [None]:
# Score the held-out split: threshold the model output at 0.5 to get class ids.
prediction = model.predict(test_padded)
pred_labels = (prediction.ravel() >= 0.5).astype(int).tolist()

print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

### 3) Based on the model, check the sentiment for the following two sentences

a. 'He is a great leader.'

b. 'He is a terrible leader.'


In [None]:
sentence = ["He is a great leader.",
            "He is a terrible leader."]

# Run the sentences through the same cleaning steps as the training text
# (remove_tags -> stop-word removal -> lemmatize_text). Without this, stop
# words that were removed from the training text reach the model as
# out-of-vocabulary tokens it never saw in those positions.
cleaned_sentences = [
    lemmatize_text(
        ' '.join(word for word in remove_tags(text).split() if word not in stop_words)
    )
    for text in sentence
]

sequences = tokenizer.texts_to_sequences(cleaned_sentences)
padded = pad_sequences(sequences, padding='post', maxlen=max_length)

# Threshold the sigmoid output at 0.5 to obtain integer class ids.
prediction = model.predict(padded)
pred_labels = (prediction.ravel() >= 0.5).astype(int).tolist()

# Translate class ids back to label strings with the fitted encoder instead of
# assuming that 1 means 'Positive'.
predicted_sentiments = encoder.inverse_transform(pred_labels)

for text, s in zip(sentence, predicted_sentiments):
    print(text)
    print("Predicted sentiment : ",s)