In [1]:
# Mount Google Drive into the Colab runtime. Later cells read the saved
# models and CSV files from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Runtime dependencies for this notebook. Versions are not pinned, so a fresh
# runtime may resolve to different releases than the ones recorded below.

# Model wrapper used to load the DistilBERT predictor, plus plotting libraries.
# NOTE(review): seaborn is installed here but not imported in the visible cells.
!pip install ktrain
!pip install seaborn
!pip install plotly
# Orca binary and its system libraries — presumably needed by plotly's
# fig.write_image() static export used further down; confirm before removing.
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4
# Text-processing stack: spaCy (+ small English model), NLTK, and the
# cleaning helpers imported by the preprocessing functions.
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install nltk
!pip install cleantext
!pip install emoji
!pip install contractions

Collecting ktrain
  Downloading ktrain-0.27.3.tar.gz (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 98 kB/s 
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 41.9 MB/s 
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 46.0 MB/s 
Collecting cchardet
  Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 63.3 MB/s 
Collecting syntok
  Downloading syntok-1.3.1.tar.gz (23 kB)
Collecting seqeval==0.0.19
  Downloading seqeval-0.0.19.tar.gz (30 kB)
Collecting transformers<=4.3.3,>=4.0.0
  Downloading transformers-4.3.3-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 33.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.

In [3]:
import re
import emoji
import spacy # python -m spacy download en_core_web_sm
import ktrain
import cleantext
import contractions
import pandas as pd
import numpy as np

import plotly
import plotly.express as px 
import plotly.io as pio 
import plotly.offline as py 
import plotly.graph_objs as go 
import plotly.tools as tls 
import plotly.figure_factory as ff 
from plotly.subplots import make_subplots

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download("stopwords")

import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import load_model

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def cleaned_review(text):
    """Normalise a raw review string before it is fed to the sentiment models.

    Steps, in order: strip HTML tags, lowercase, expand contractions, remove
    emojis, remove a handful of stray control / private-use characters, then
    let ``cleantext.clean`` strip punctuation and collapse extra spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased review.
    """
    # Removing html tags
    text = re.sub('<.*?>', '', text)

    # Lower case
    text = text.lower()

    # Expanding contractions (e.g. "don't" -> "do not")
    text = contractions.fix(text)

    # Removing emojis. emoji.get_emoji_regexp() was removed in emoji 2.0, so
    # prefer replace_emoji() when the installed release provides it and fall
    # back to the regexp on older releases.
    if hasattr(emoji, 'replace_emoji'):
        text = emoji.replace_emoji(text, replace='')
    else:
        text = re.sub(emoji.get_emoji_regexp(), r"", text)

    # Remove stray special characters. The text is already lowercased at this
    # point, so the word-anchored patterns must be lowercase as well or they
    # can never match.
    stray_sequences = (
        ('\x91the', 'the'),
        ('\x97', ''),
        ('\x84the', 'the'),
        ('\uf0b7', ''),
        ('¡¨', ''),
        ('\x95', ''),
        ('\x8ei\x9eek', ''),
        ('\xad', ''),
        ('\x84bubble', 'bubble'),
    )
    for stray, replacement in stray_sequences:
        text = text.replace(stray, replacement)

    # Removing extra spaces and punctuations
    text = cleantext.clean(text, all=False, extra_spaces=True, lowercase=True, punct=True)

    return text

def lemmatized_stopwords(text):
    """Drop English stopwords (keeping negations) and lemmatise the remainder.

    The negations 'no', 'nor' and 'not' are kept because they are removed
    from the stopword set before filtering. Lemmatisation uses the
    module-level spaCy pipeline ``nlp``, which must be loaded before this
    function is called.

    Parameters
    ----------
    text : str
        Already-cleaned review text.

    Returns
    -------
    str
        Space-joined lemmas of the tokens that survived stopword removal.
    """
    # NLTK's English stopword list, minus the negations we want to keep
    english_stops = set(stopwords.words('english'))
    for negation in ('no', 'nor', 'not'):
        english_stops.remove(negation)

    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)

    parsed = nlp(" ".join(kept_tokens))
    lemmas = (tok.lemma_ for tok in parsed)

    return " ".join(lemmas)

In [5]:
# Load the three pre-trained sentiment models from the shared Google Drive
# (the drive must already be mounted at /content/drive).

# ktrain predictor; the directory name indicates a DistilBERT model.
ktrain_predictor = ktrain.load_predictor("/content/drive/Shareddrives/NLP/NLP Assignment/Lim Ming Jun/Saved_DistilBert_Model")
# Keras models restored with load_model — presumably an LSTM and a CNN
# classifier, judging by the paths; confirm against the training notebooks.
lstm = load_model('/content/drive/Shareddrives/NLP/NLP Assignment/Lim Chia Chung/lstm')
cnn = load_model('/content/drive/Shareddrives/NLP/NLP Assignment/Leong Yit Wee/CNN')

In [6]:
def ktrainResult(single_review):
    """Classify one review with the module-level ``ktrain_predictor``.

    Parameters
    ----------
    single_review : str
        Cleaned review text.

    Returns
    -------
    tuple
        ``(label, top_probability, probabilities)`` where ``label`` is the
        predictor's class label (e.g. 'pos' or 'neg'), ``top_probability`` is
        the largest class probability rounded to four decimals, and
        ``probabilities`` is the per-class probability list.
    """
    # First pass yields the label, second pass the class probabilities
    label = ktrain_predictor.predict(texts=single_review)
    probabilities = ktrain_predictor.predict(texts=single_review, return_proba=True)

    # Round-trip through a 4-decimal string to truncate the precision
    top_probability = float(format(np.amax(probabilities), ".4f"))

    return label, top_probability, list(probabilities)

# Fitted tokenizers keyed by corpus path, so the corpus is read and tokenised
# once per kernel instead of once per prediction.
_IMDB_TOKENIZER_CACHE = {}


def _imdb_tokenizer(csv_path='/content/drive/Shareddrives/NLP/NLP Assignment/cleaned_IMDB.csv'):
    """Return a Tokenizer fitted on the 'cleaned_review' column of ``csv_path``."""
    if csv_path not in _IMDB_TOKENIZER_CACHE:
        corpus = pd.read_csv(csv_path)['cleaned_review'].values.tolist()
        fitted = Tokenizer()
        fitted.fit_on_texts(corpus)
        _IMDB_TOKENIZER_CACHE[csv_path] = fitted
    return _IMDB_TOKENIZER_CACHE[csv_path]


def KerasPreprocessingSingleReview(model, review):
    """Score a single preprocessed review with a Keras binary classifier.

    The review is converted to an integer sequence with a tokenizer fitted on
    the cleaned IMDB corpus, padded/truncated to 1430 tokens, and passed to
    ``model.predict``.

    Parameters
    ----------
    model : Keras model
        Assumed to emit a single probability per sample (one sigmoid unit)
        that is interpreted as P(positive) — TODO confirm against training.
    review : str
        Cleaned, lemmatised review text.

    Returns
    -------
    tuple
        ``(label, neg_score, pos_score)``: ``label`` is 'neg' or 'pos', and
        the two scores are probabilities rounded to five decimals.
    """
    tokenizer = _imdb_tokenizer()

    sequences = tokenizer.texts_to_sequences([review])
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=1430)

    # First (and only) row of the prediction batch
    score = model.predict(padded)[0]
    result = int(score.round().item())

    model_result = ['neg', 'pos']
    # .item() extracts the Python scalar; float() on an ndarray is deprecated
    # in recent NumPy releases. The subtraction stays in the array's dtype so
    # the reported values are unchanged.
    neg_score = (1 - score).item()
    pos_score = score.item()

    return model_result[result], round(neg_score, 5), round(pos_score, 5)

def KerasPreprocessingMultipleReviews(model, review):
    """Predict a 0/1 sentiment label for every review in ``review``.

    Uses the module-level ``tokenizer``, which must already be fitted before
    this function is called.

    Parameters
    ----------
    model : Keras model
        Classifier whose ``predict`` output is thresholded at 0.5.
    review : iterable of str
        Cleaned, lemmatised review texts.

    Returns
    -------
    list of int
        1 where the predicted value is above 0.5, otherwise 0.
    """
    # One integer sequence per review text
    sequences = [tokenizer.texts_to_sequences([text])[0] for text in review]

    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=1430)
    probabilities = model.predict(padded)

    return [1 if prob > 0.5 else 0 for prob in probabilities]

<div align="center"><h1> Movie Sentiment Analysis App ✌ </h1></div>

<div align="center">This is a simple movie sentiment analysis app that helps you identify whether a movie review is <b>positive</b> or <b>negative</b>.</div>

## 1. Single Movie Review Analysis

### 1.1 Accept user input

In [7]:
user_input = input('Enter single review below:  ')

# Use spaCy for text lemmatization (lemmatized_stopwords reads this global)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Distil Bert: cleaned once, the same way the batch pipeline prepares
# 'cleaned_bert_review' for the predictor
cleaned_bert_user_input = cleaned_review(user_input)
cleaned_user_input = cleaned_bert_user_input

# LSTM and CNN: reuse the cleaned text, then drop stopwords and lemmatise
clean1 = cleaned_bert_user_input
clean2 = lemmatized_stopwords(clean1)

Enter single review below:  Hi


### 1.2 Perform sentiment analysis


In [8]:
# Run the single review through each of the three loaded models.
# (`lstm` and `cnn` are ordinary notebook-level names; no `global`
# declaration is needed to read them at cell scope.)

# Distil Bert
bert_sentiment, bert_score, neg_pos_score = ktrainResult(cleaned_user_input)

# LSTM
lstm_sentiment, lstm_neg_score, lstm_pos_score = KerasPreprocessingSingleReview(lstm, clean2)

# CNN
cnn_sentiment, cnn_neg_score, cnn_pos_score = KerasPreprocessingSingleReview(cnn, clean2)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# DistilBERT result. Index 1 is reported as the positive class and index 0 as
# the negative class — assumes the predictor's class order is [neg, pos].
print(f'Positive: {round(float(neg_pos_score[1]), 5)}')
print(f'Negative: {round(float(neg_pos_score[0]), 5)}')
print(f'This sentence is {bert_sentiment.upper()}')

Postive : 0.55546
Negative: 0.44454
This sentence is POS


In [10]:
# LSTM result
print(f'Positive: {lstm_pos_score}')
print(f'Negative: {lstm_neg_score}')
print(f'This sentence is {lstm_sentiment.upper()}')

Postive : 0.18058
Negative: 0.81942
This sentence is NEG


In [11]:
# CNN result
print(f'Positive: {cnn_pos_score}')
print(f'Negative: {cnn_neg_score}')
print(f'This sentence is {cnn_sentiment.upper()}')

Postive : 0.05483
Negative: 0.94517
This sentence is NEG


In [12]:
# Grouped bar chart: positive vs. negative probability per model for the
# single review scored above.
algorithm = ['DistilBERT', 'LSTM', 'CNN']
pos = [neg_pos_score[1], lstm_pos_score, cnn_pos_score]
neg = [neg_pos_score[0], lstm_neg_score, cnn_neg_score]

fig = go.Figure()
fig.add_trace(go.Bar(name='Positive', x=algorithm, y=pos))
fig.add_trace(go.Bar(name='Negative', x=algorithm, y=neg))

# Side-by-side bars rather than stacked
fig.update_layout(
    barmode='group',
    title='Comparison of DistilBERT, LSTM & CNN',
    xaxis_title='Models',
    yaxis_title='Probability',
)

fig.show()
# Static export of the same figure to the shared drive
fig.write_image("/content/drive/Shareddrives/NLP/NLP Assignment/Figures/Comparison of DistilBERT, LSTM & CNN.png")

## 2. Multiple Movie Reviews Analysis

In [13]:
# Load the scraped movie reviews used for the batch evaluation and preview
# the first five rows.
df = pd.read_csv('/content/drive/Shareddrives/NLP/NLP Assignment/MovieScraper/scrap_movie_reviews.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Review,Sentiment
0,0,That's one of the best marvel movie! I love th...,NEGATIVE
1,1,"Stupid, lazy and boring, I guessed the ending ...",NEGATIVE
2,2,Disturbingly Cringe. It's all bad jokes and cr...,NEGATIVE
3,3,"F9 was fun, and after last year that is all I ...",NEGATIVE
4,4,For 29 years Candyman fans have been waiting f...,NEGATIVE


In [14]:
# Text for DistilBERT: cleaned only
df['cleaned_bert_review'] = df['Review'].apply(cleaned_review)

# Text for LSTM / CNN: cleaned, stopwords removed, lemmatised
# (lemmatized_stopwords reads the global `nlp` pipeline)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
df['cleaned_review'] = df['cleaned_bert_review'].apply(lemmatized_stopwords)

# Move the label column to the end, then discard the first column
sentiment_df = df.pop('Sentiment')
df['Sentiment'] = sentiment_df
df = df.iloc[:, 1:]

In [15]:
# Distil Bert: the predictor emits 'pos' / 'neg', so map the ground truth to
# the same label strings
df["Sentiment_BERT"] = df['Sentiment'].replace({"POSITIVE": "pos", "NEGATIVE": "neg"})

y_true_bert = df["Sentiment_BERT"].tolist()
y_pred_bert = ktrain_predictor.predict(df['cleaned_bert_review'].tolist())

# LSTM: Keras models are compared against 1 (positive) / 0 (negative)
df["Sentiment"] = df['Sentiment'].replace({"POSITIVE": 1, "NEGATIVE": 0})

# The word -> index mapping must come from the same corpus that
# KerasPreprocessingSingleReview uses (cleaned_IMDB.csv). Fitting the
# tokenizer on the scraped reviews would assign different indices to every
# word, so the models' embedding layers would look up unrelated vectors.
# NOTE(review): presumably this is also the corpus the models were trained
# on — confirm against the training notebooks.
tokenizer = Tokenizer()
imdb_corpus = pd.read_csv('/content/drive/Shareddrives/NLP/NLP Assignment/cleaned_IMDB.csv')['cleaned_review'].values.tolist()
tokenizer.fit_on_texts(imdb_corpus)

movie_review = df['cleaned_review'].values.tolist()

y_true_lstm = df["Sentiment"].tolist()
y_pred_lstm = KerasPreprocessingMultipleReviews(lstm, movie_review)

# CNN
y_true_cnn = df["Sentiment"].tolist()
y_pred_cnn = KerasPreprocessingMultipleReviews(cnn, movie_review)

In [16]:
def model_report(y_pred, y_true, name):
    """Summarise one model's accuracy, recall, precision and F1 in a one-row frame.

    Recall, precision and F1 are always computed for the POSITIVE class so the
    rows of the comparison table are directly comparable: DistilBERT labels
    are the strings 'pos' / 'neg', while the other models use 1 / 0.

    Parameters
    ----------
    y_pred : list
        Predicted labels.
    y_true : list
        Ground-truth labels, in the same encoding as ``y_pred``.
    name : str
        Model name shown in the report; 'DistilBERT' selects the string
        label encoding.

    Returns
    -------
    pandas.DataFrame
        One row with columns Model, Accuracy_score, Recall_score,
        Precision_score and F1_score.
    """
    # Positive-class label in this model's encoding (1 is sklearn's default)
    pos_label = 'pos' if name == 'DistilBERT' else 1

    accuracy  = accuracy_score(y_true, y_pred)
    recall    = recall_score(y_true, y_pred, pos_label=pos_label)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    f1score   = f1_score(y_true, y_pred, pos_label=pos_label)

    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recall],
                       "Precision_score" : [precision],
                       "F1_score"        : [f1score]
                      })
    return df

# Outputs for every model. model_report takes (y_pred, y_true, name);
# passing the arguments in the wrong order swaps precision and recall.
model1 = model_report(y_pred_bert, y_true_bert, "DistilBERT")
model2 = model_report(y_pred_lstm, y_true_lstm, "LSTM")
model3 = model_report(y_pred_cnn, y_true_cnn, "CNN")

# Concat all models into one frame with a fresh 0..n-1 index
model_performances = pd.concat([model1, model2, model3], axis = 0, ignore_index=True)

# Render the metrics (rounded to 4 decimals) as a plotly table
table  = ff.create_table(model_performances.round(4))

py.iplot(table)
table.write_image("/content/drive/Shareddrives/NLP/NLP Assignment/Figures/Model Report.png")

In [18]:
def output_tracer(metric, color):
    """Build a horizontal bar trace of ``metric`` for every model.

    Reads the module-level ``model_performances`` frame: the 'Model' column
    supplies the bar labels and the ``metric`` column the bar lengths.

    Parameters
    ----------
    metric : str
        Column of ``model_performances`` to plot; also used as the trace name.
    color : str
        Fill colour of the bars.

    Returns
    -------
    plotly.graph_objs.Bar
    """
    bar_outline = dict(width = 0.7)
    bar_marker = dict(line = bar_outline, color = color)

    return go.Bar(
        x = model_performances[metric],
        y = model_performances["Model"],
        orientation = "h",
        name = metric,
        marker = bar_marker,
    )

# Grid styling shared by both axes
axis_style = dict(gridcolor = 'rgb(255, 255, 255)',
                  zerolinewidth = 1,
                  ticklen = 5,
                  gridwidth = 2)

layout = go.Layout(
    title = "Model Performances",
    plot_bgcolor  = "rgb(243,243,243)",
    paper_bgcolor = "rgb(243,243,243)",
    xaxis = dict(axis_style, title = "Metrics"),
    yaxis = dict(axis_style),
    # Wide left margin leaves room for the model names on the y axis
    margin = dict(l = 250),
    height = 780,
)

# One horizontal bar trace per metric
trace1, trace2, trace3, trace4 = [
    output_tracer(metric, color)
    for metric, color in (
        ("Accuracy_score", "#6699FF"),
        ('Recall_score', "red"),
        ('Precision_score', "#33CC99"),
        ('F1_score', "lightgrey"),
    )
]

# Traces are passed in reverse order (F1 first, accuracy last)
data = [trace4, trace3, trace2, trace1]
fig = go.Figure(data = data, layout = layout)
py.iplot(fig)
fig.write_image("/content/drive/Shareddrives/NLP/NLP Assignment/Figures/Model Performances.png")