## Import Libraries

In [1]:
import pandas as pd
import pickle
import joblib

import contractions
import string 
import numpy as np
import re

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# from transformers.modeling_bert import BertModel
# from transformers import BertTokenizer, BertConfig

from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences

from ipywidgets import widgets
from ipylab import JupyterFrontEnd
from IPython.display import Javascript, display


2022-11-05 16:01:34.936083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Clean Text Preprocessing

In [2]:
# Preprocessing steps applied to each review:
# - remove punctuation, e.g. "WHAT is this? LOVED. COME ON! !@#!"
# - expand contractions, e.g. "I've" -> "I have"
# - remove stop words
# - remove numbers

# Remove emojis 
def remove_emojis(data):
    """Return ``data`` with every run of characters matched by the emoji pattern deleted.

    NOTE: several ranges below are very wide (for example U+24C2-U+1F251 and
    U+10000-U+10FFFF), so characters that are not emojis but fall inside them,
    such as CJK ideographs, are deleted as well.
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (same range listed twice in the pattern)
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
                      "]+", re.UNICODE)
    return emoji_pattern.sub('', data)


# change to lower_case
def lower_case(review):
    """Return ``review`` converted to lower case."""
    lowered = review.lower()
    return lowered

# Expand contractions, such as I'm = I am, shouldn't = should not
def change_contractions(review):
    """Expand contractions token by token (e.g. "I'm" -> "I am").

    The review is split on whitespace, every token is passed through
    ``contractions.fix``, and the results are joined back with single spaces.
    """
    return ' '.join(contractions.fix(token) for token in review.split())

# Remove Punctuations
def remove_punctuations(review):
    """Replace every character that is not an ASCII letter or digit with a space.

    Whitespace characters are also outside the kept set, so each one becomes a
    plain space.
    """
    return re.sub('[^a-zA-Z0-9]', ' ', review)

# Remove numbers. Digits can mislead the classifier: a harmless sentence such as
# "the 2 girls in the book are so cute" could be pulled towards a 2-star (negative) prediction,
# because we noticed that users who write negative reviews tend to include numbers in them.
# Example: "I give 2 stars because the book has no connection at all, and the text has a lot of grammatical errors."
# Classification is supervised learning: the model is trained on a corpus labelled with the sentiment category.
# So if the number 2 appears in a positive comment, that comment may still end up classified as negative,
# depending on how strongly the number 2 is associated with negative reviews in the corpus.
# That would make the predictions wrong, so to reduce confusion for the model we remove numbers.
def remove_numbers(review):
    """Return ``review`` with the ASCII digits 0-9 removed; all other characters are kept."""
    return ''.join(char for char in review if char not in string.digits)

# Remove extra whitespaces
def remove_extra_whitespace(reviews):
    """Collapse every run of whitespace to one space and trim both ends."""
    tokens = reviews.split()
    return " ".join(tokens)

# Words in the whitelist are never removed, because they carry meaning that matters in our sentences.
# Number words such as one, two, three and four should also be kept: for instance, if a user comments
# "One star", that signals a negative review straight away.
def remove_stopwords(text):
    """Drop English stop words and one-character tokens from ``text``.

    ``text`` is split on whitespace. A token is kept when it is longer than one
    character and is either not an NLTK English stop word or is one of the
    whitelisted words. The kept tokens are returned joined by single spaces.
    """
    whitelist = {"not", "no", "cannot", "do", "must", "should", "would", "could"}
    # Build the set of words to discard once, so each token costs one hash
    # lookup instead of a linear scan of the stop-word list plus the whitelist.
    blocked = set(stopwords.words('english')) - whitelist
    kept = [word for word in text.split() if word not in blocked and len(word) > 1]
    return " ".join(kept)

def get_wordnet_pos(text):
    """Return one WordNet POS constant per item tagged by ``nltk.pos_tag(text)``.

    The first character of each POS tag selects the constant (J -> ADJ,
    N -> NOUN, V -> VERB, R -> ADV); any other first character falls back to
    ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    wordnet_tags = []
    for tagged in nltk.pos_tag(text):
        # tagged[1] is the POS tag string; only its first letter is used.
        pos_label = tagged[1]
        wordnet_tags.append(first_letter_to_pos.get(pos_label[0], wordnet.NOUN))
    return wordnet_tags

def lemmaSentence(reviews):
    """Lemmatize ``reviews`` token by token and return the lemmas joined by spaces.

    The text is tokenized with ``word_tokenize``; each token is lemmatized with
    the POS constant that ``get_wordnet_pos`` returns for its position.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(reviews)
    pos_tags = get_wordnet_pos(tokens)
    lemmas = (lemmatizer.lemmatize(token, pos) for token, pos in zip(tokens, pos_tags))
    return ' '.join(lemmas)

def _clean_review(review):
    """Run one review string through every preprocessing step, in pipeline order."""
    steps = (
        lower_case,
        change_contractions,
        remove_emojis,
        remove_punctuations,
        remove_numbers,
        remove_stopwords,
        remove_extra_whitespace,
        lemmaSentence,
    )
    for step in steps:
        review = step(review)
    return review


def clean_text(data, single_input=True):
    """Preprocess a single review string or the ``concat_review`` column of a DataFrame.

    Parameters
    ----------
    data : str or pandas.DataFrame
        A review string when ``single_input`` is True; otherwise a DataFrame
        that has a ``concat_review`` column of review strings.
    single_input : bool, default True
        Selects which of the two input kinds ``data`` is.

    Returns
    -------
    str or pandas.DataFrame
        The cleaned string, or a new DataFrame whose ``concat_review`` column
        holds the cleaned reviews and whose other columns are unchanged.

    Notes
    -----
    Both modes share one step list, so a single string is always cleaned
    exactly like the DataFrame rows. The input DataFrame is not modified:
    ``DataFrame.assign`` returns a new frame, which keeps the raw data
    available and makes re-running the calling cell idempotent.
    """
    if single_input:
        return _clean_review(data)
    return data.assign(concat_review=data['concat_review'].apply(_clean_review))

## Helper Functions

In [3]:
def print_pred_result(model, original_text, cleaned_text, detected_polarity):
    """Print the input text, its preprocessed form, the model name and the polarity label.

    Parameters
    ----------
    model : str
        Display name of the model that produced the prediction.
    original_text : str or None
        The text as entered by the user. ``None`` is treated like an empty string.
    cleaned_text : str or None
        The preprocessed text. ``None`` is treated like an empty string.
    detected_polarity : number
        The model output. Values above 0.5 are reported as "Positive", anything
        else as "Negative". It is ignored when ``original_text`` is empty.
    """
    # Normalise missing values before any string concatenation; otherwise a
    # None input raises TypeError before the empty-input branch is reached.
    if original_text is None:
        original_text = ""
    if cleaned_text is None:
        cleaned_text = ""

    print("Original text sequence: " + original_text)
    print("Preprocessed text sequence: " + cleaned_text)
    print("Model: " + model)

    if original_text == "":
        # Nothing was entered, so no polarity is reported.
        polarity_label = ""
    elif detected_polarity > 0.5:
        polarity_label = "Positive"
    else:
        polarity_label = "Negative"
    print("Detected polarity: " + polarity_label)

# Import Trained Models

## Fitting TF-IDF with Train Dataset

In [4]:
# Load the imbalanced training split and preprocess its `concat_review` column.
train_data = pd.read_csv('./train_test_dataset/train_df_imbalanced.csv')
clean_train_data_df = clean_text(train_data, single_input=False)

# Balance the classes: drop a seeded random sample of polarity == 1 rows so that
# both polarities end up with the same number of rows.
# NOTE(review): this assumes polarity 1 is the larger class (differences > 0) - confirm.
polarity_counts = clean_train_data_df["polarity"].value_counts()
differences = polarity_counts[1] - polarity_counts[-1]
positive_rows = clean_train_data_df[clean_train_data_df["polarity"] == 1]
surplus_positive_index = positive_rows.sample(differences, random_state=42).index
train_balanced_df = clean_train_data_df.drop(surplus_positive_index)

In [5]:
# Fit the TF-IDF vocabulary on the balanced, preprocessed training reviews.
# NOTE(review): the pickled classifiers loaded below were presumably trained on
# features from an identically configured and fitted vectorizer - confirm that
# this re-fit reproduces the same vocabulary.
corpus = np.array(train_balanced_df["concat_review"].values)
tfidf_vect = TfidfVectorizer(
    min_df=5,
    max_features=10000,
    ngram_range=(1, 2),  # unigrams and bigrams
    lowercase=False,
    tokenizer=word_tokenize,
)
tfidf_vect.fit_transform(corpus)

<15233x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 547653 stored elements in Compressed Sparse Row format>

## Naive Bayes (TF-IDF)

In [6]:
# Restore the trained Naive Bayes classifier.
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code - only load model files that come from a trusted source.
nb_model_path = './trained_models/Naive_Bayes_TFIDF_model.pkl'
with open(nb_model_path, 'rb') as model_file:
    NB_trained_model = joblib.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Linear SVC (TF-IDF)

In [7]:
# Restore the trained Linear SVC classifier.
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code - only load model files that come from a trusted source.
linear_svc_model_path = './trained_models/Linear_SVC_TFIDF_model.pkl'
with open(linear_svc_model_path, 'rb') as model_file:
    LinearSVC_trained_model = joblib.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Logistic Regression (TF-IDF): restore the trained classifier.
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code - only load model files that come from a trusted source.
logistic_regression_model_path = './trained_models/Logistic_Regression_TFIDF_model.pkl'
with open(logistic_regression_model_path, 'rb') as model_file:
    LogisticRegression_trained_model = joblib.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## LSTM

In [9]:
# Restore the trained Keras LSTM network from its HDF5 file.
lstm_model_path = './trained_models/LSTM_model.h5'
LSTM_trained_model = load_model(lstm_model_path)

# Restore the tokenizer saved alongside the LSTM model.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
lstm_tokenizer_path = './trained_models/LSTM_tokenizer.pickle'
with open(lstm_tokenizer_path, 'rb') as tokenizer_file:
    LSTM_tokenizer = pickle.load(tokenizer_file)

2022-11-05 16:02:34.789617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Google BERT

In [10]:
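# NOTE: the BERT pipeline below is disabled; its transformers imports are commented out in the import cell.
# class SentimentData(Dataset):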
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = dataframe.text
#         self.targets = self.data.polarity
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         text = " ".join(text.split())

#         inputs = self.tokenizer.encode_plus(
#             text,
#             None,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             pad_to_max_length=True,
#             return_token_type_ids=True
#         )
#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']
#         token_type_ids = inputs["token_type_ids"]


#         return {
#             'ids': torch.tensor(ids, dtype=torch.long),
#             'mask': torch.tensor(mask, dtype=torch.long),
#             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#             'targets': torch.tensor(self.targets[index], dtype=torch.float)
#         }

# class BertClass(torch.nn.Module):
#     def __init__(self):
#         super(BertClass, self).__init__()
#         self.l1 = BertModel.from_pretrained("bert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.2)
#         self.classifier = torch.nn.Linear(768, 2)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.ReLU()(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier(pooler)
#         return output

# def BERT_predict(input_loader):
#   preds = []
#   for _, data in tqdm(enumerate(input_loader, 0)):
#     ids = data['ids'].to(device, dtype = torch.long)
#     mask = data['mask'].to(device, dtype = torch.long)
#     token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#     targets = data['targets'].to(device, dtype = torch.long)
#     pred = BERT_trained_model(ids, mask, token_type_ids)
#     big_val, big_idx = torch.max(pred.data, dim=1)
#     if big_idx == 1:
#       preds.append(1)
#     else:
#       preds.append(0)
#   return preds
  
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# BERT_trained_model = torch.load('./trained_models/BERT_model.bin', map_location=device)
# config = BertConfig.from_pretrained("bert-base-uncased")
# BERT_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

# Polarity Detection on Text Sequence

In [21]:
# Text box for the review to classify; the next cell reads its current value
# from `original_text_input.value`.
original_text_label = widgets.Label("Text Sequence")
original_text_input = widgets.Text()
display(original_text_label, original_text_input)

# Disabled: optional button that would run all cells below through ipylab.
# app = JupyterFrontEnd()
# app.commands.list_commands()
#
# submit_button = widgets.Button(description="Detect polarity!")
# display(submit_button)
#
# def run_all(ev):
#     app.commands.execute('notebook:run-all-below')
#
# submit_button.on_click(run_all)



Label(value='Text Sequence')

Text(value='')

In [12]:
# Take the text currently typed in the widget and preprocess it with the same
# pipeline used for the training reviews (single-string mode of clean_text).
original_text = original_text_input.value
cleaned_text = clean_text(original_text, single_input=True)

### Naive Bayes Polarity Detection (TF-IDF)

In [13]:
# Vectorise the single cleaned review with the fitted TF-IDF vocabulary.
# The batch is named `review_batch` rather than `input` so that the built-in
# input() function is not shadowed for the rest of the kernel session.
review_batch = np.array([cleaned_text])
tfidf_input = tfidf_vect.transform(review_batch)

# predict() returns one label per row; [0] takes the label of the only review.
NB_pred = NB_trained_model.predict(tfidf_input)[0]
print_pred_result("[Naive Bayes, TF-IDF]", original_text, cleaned_text, NB_pred)

Original text sequence: This book is really lovely and easy to read
Preprocessed text sequence: book really lovely easy read
Model: [Naive Bayes, TF-IDF]
Detected polarity: Negative


### Linear SVC Polarity Detection (TF-IDF)

In [14]:
# Vectorise the single cleaned review with the fitted TF-IDF vocabulary.
# The batch is named `review_batch` rather than `input` so that the built-in
# input() function is not shadowed for the rest of the kernel session.
review_batch = np.array([cleaned_text])
tfidf_input = tfidf_vect.transform(review_batch)

# predict() returns one label per row; [0] takes the label of the only review.
LinearSVC_pred = LinearSVC_trained_model.predict(tfidf_input)[0]
print_pred_result("[Linear SVC, TF-IDF]", original_text, cleaned_text, LinearSVC_pred)

Original text sequence: This book is really lovely and easy to read
Preprocessed text sequence: book really lovely easy read
Model: [Linear SVC, TF-IDF]
Detected polarity: Positive


### Logistic Regression Polarity Detection (TF-IDF)

In [15]:
# Vectorise the single cleaned review with the fitted TF-IDF vocabulary.
# The batch is named `review_batch` rather than `input` so that the built-in
# input() function is not shadowed for the rest of the kernel session.
review_batch = np.array([cleaned_text])
tfidf_input = tfidf_vect.transform(review_batch)

# predict() returns one label per row; [0] takes the label of the only review.
LinearLogistic_pred = LogisticRegression_trained_model.predict(tfidf_input)[0]
print_pred_result("[Logistic Regression, TF-IDF]", original_text, cleaned_text, LinearLogistic_pred)

Original text sequence: This book is really lovely and easy to read
Preprocessed text sequence: book really lovely easy read
Model: [Logistic Regression, TF-IDF]
Detected polarity: Negative


### LSTM Polarity Detection

In [16]:
# LSTM: turn the cleaned review into a token-id sequence and pad it to 1000 steps.
# NOTE(review): maxlen presumably has to match the sequence length used in training - confirm.
encoded_reviews = LSTM_tokenizer.texts_to_sequences([cleaned_text])
token_text = pad_sequences(encoded_reviews, maxlen=1000)

# Only one review is scored, so take the first value of the first output row.
lstm_scores = LSTM_trained_model.predict(token_text)
LSTM_pred = lstm_scores[0][0]
print_pred_result("[LSTM]", original_text, cleaned_text, LSTM_pred)

Original text sequence: This book is really lovely and easy to read
Preprocessed text sequence: book really lovely easy read
Model: [LSTM]
Detected polarity: Positive


### Google BERT Polarity Detection

In [17]:
# # Google BERT
# input = {'text': [cleaned_text], 'polarity': [1]}
# df = pd.DataFrame(input)
# BERT_input = DataLoader(SentimentData(df, BERT_tokenizer, 256))
# BERT_pred = BERT_predict(BERT_input)[0]

# print_pred_result("[Google BERT]", original_text, cleaned_text, BERT_pred)