In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import array
# import string
# from string import punctuation
# from itertools import chain

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_selection import SelectPercentile, chi2, f_regression, f_classif
from sklearn.utils import shuffle

! pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
# from sklearn import svm

# nltk.download('stopwords')
# from nltk.stem import WordNetLemmatizer
# from nltk import ngrams

import re
import string

! pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


! pip install wordcloud
from wordcloud import WordCloud, STOPWORDS

! pip install tensorflow

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
import io
import json

## Loading dataset and exploration

In [None]:
df = pd.read_csv("csv's/TrainingDataset.csv")

In [None]:
df.columns

In [None]:
df.shape

In [None]:
print(df.isnull().sum())

In [None]:
df.head(2)

In [None]:
# Partition the reviews by star rating: above 3 counts as positive, below 3 as negative.
# Rows rated exactly 3 land in neither subset.
pos, neg = df[df['rating'].gt(3)], df[df['rating'].lt(3)]
print(pos.shape, neg.shape)

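# Largely imbalanced
The two classes are far from balanced, so we may have to oversample the minority class or undersample the majority class before training.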

In [None]:
# Number of rows in each rating group, in the same order as the bar labels below.
counts = [len(pos), len(neg)]

# Bar chart of the class sizes, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.bar(['Positive Ratings (>3)', 'Negative Ratings (<3)'], counts, color=['green', 'red'])
ax.set_title('Count of Positive and Negative Ratings')
ax.set_ylabel('Count')
ax.set_xlabel('Rating Categories')
plt.show()

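## Preprocessing:
- Ratings are converted to binary labels via a sentiment map: 1–2 stars → 0 (negative), 4–5 stars → 1 (positive). 3-star reviews are left unlabelled (NaN)
- The resulting `sentiment` column is added to the original dataframe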

In [None]:
# Star rating -> binary sentiment label: 1-2 stars are negative (0), 4-5 stars are positive (1).
sentiment_map = {1: 0, 2: 0, 4: 1, 5: 1}

# Only non-3-star rows are mapped; 3-star rows have no entry in the map.
temp = df.loc[df['rating'] != 3]
y = temp['rating'].map(sentiment_map)

# Assignment aligns on the index, so rows absent from y (the 3-star reviews) receive NaN.
df.loc[:, 'sentiment'] = y

In [None]:
df.head(2)

## Preparing dataframe for preprocessing

In [None]:
# Select the raw review text column (a Series); this is the input to the cleaning pipeline.
# The previous empty-DataFrame initialisation was overwritten immediately and has been removed.
reviews_to_clean = df['reviewText']

In [None]:
print(reviews_to_clean.head(3))

In [None]:
print(type(reviews_to_clean))

In [None]:
print(reviews_to_clean.shape)

In [None]:
stop_words = stopwords.words('english')

## Cleaning HTML aspects
- Removes HTML tags and `http://` / `https://` URLs from the review text

In [None]:
def clean_html(text):
    """Strip URLs and HTML tags from a string.

    Args:
        text: Raw text that may contain ``http://`` / ``https://`` links and
            ``<...>`` markup.

    Returns:
        ``text`` with every URL and every HTML tag removed. Surrounding words
        and whitespace are left as they were.
    """
    # Remove URLs wherever they occur. The match stops at whitespace, quotes or
    # angle brackets so that (a) the words following a link are kept and (b) a
    # link inside an attribute such as href="..." does not swallow the closing
    # part of its tag before the tag itself is stripped below.
    text = re.sub(r'https?://[^\s<>"\']+', '', text)
    # Remove HTML tags
    text = re.sub('<.*?>+', '', text)
    return text

## Creating tokens

In [None]:
def tokenize_df(text):
    """Tokenise a single value taken from the review column.

    Args:
        text: The value to tokenise.

    Returns:
        * for a string: the list of tokens from ``word_tokenize`` that consist
          solely of alphabetic characters;
        * for an int or float: the value itself, unchanged;
        * for anything else: an empty list.
    """
    if isinstance(text, str):
        # Tokens containing digits or punctuation are discarded here.
        return [word for word in word_tokenize(text) if word.isalpha()]
    if isinstance(text, (int, float)):
        # Numeric input is passed straight through rather than tokenised.
        return text
    return []

## Removing stopwords

In [None]:
def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a list of tokens (case-insensitive).

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional iterable of lower-case stop words. When omitted,
            the NLTK English stop-word list is loaded, which matches the
            previous behaviour. Passing a precomputed collection avoids
            reloading the list on every call.

    Returns:
        A list with the tokens whose lower-cased form is not a stop word, in
        their original order and original casing.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stop_word_set = set(stop_words)
    return [token for token in tokens if token.lower() not in stop_word_set]

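## Lemmatization:
- Lemmatization reduces each token to its dictionary base form (e.g. "cars" → "car") so that variants of the same word are treated as one term. The spelling or semantics of some words may have been affected by tokenisation or punctuation removal; this step was done to try to preserve the value of those words.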

In [None]:
def lemmatize_text(tokens):
    """Lemmatise every token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of tokens, or ``None``.

    Returns:
        A list of lemmatised tokens. ``None`` input yields an empty list and
        ``None`` entries inside ``tokens`` are skipped.
    """
    if tokens is None:
        return []
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        if word is None:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

## Cleaning text utilizing prior functions

In [None]:
def clean_text(text):
    """Run the full cleaning pipeline on one review.

    Steps, in order: strip URLs/HTML, tokenise, lower-case, remove stop words,
    delete ASCII punctuation characters from each token, lemmatise, and join
    the tokens back together with single spaces.

    Args:
        text: The raw review value.

    Returns:
        The cleaned review as a single string, or ``''`` when ``text`` is not
        a string.
    """
    if not isinstance(text, str):
        return ''
    # Translation table that deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = tokenize_df(clean_html(text))
    words = remove_stopwords([word.lower() for word in words])
    words = [word.translate(punctuation_table) for word in words]
    return ' '.join(lemmatize_text(words))

In [None]:
cleaned_reviews = reviews_to_clean.apply(clean_text)

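## Normalise whitespace and keep non-string values:
- The cell below collapses repeated whitespace and trims each review. Any value that is not a string (e.g. a float or integer) is passed through unchanged at this step; it is converted to a string in the following step.
- Chose not to drop these values since they may hold value in our analysis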

In [None]:
# Collapse runs of whitespace into a single space and trim both ends of every review.
# Series.apply returns a new Series, so the result has to be assigned back;
# without the assignment this step has no effect on cleaned_reviews.
cleaned_reviews = cleaned_reviews.apply(
    lambda x: re.sub(r"\s+", " ", x).strip() if isinstance(x, str) else x
)

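## Filling missing values:
- This was done to account for any rows that had no data: they are filled with an empty string (rather than dropped) and every value is cast to `str`. This avoids NaN representations when the text is passed to text_fit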

In [None]:
cleaned_reviews = cleaned_reviews.fillna('').astype(str)

In [None]:
cleaned_reviews.to_csv('cleaned_reviews.csv', index=False)

In [None]:
print(cleaned_reviews.isnull().sum())

In [None]:
cleaned_reviews.head(3)

In [None]:
cleaned_reviews.shape

In [None]:
def print_cloud(data):
    """Build a word cloud from a collection of review strings.

    Args:
        data: Iterable of strings (one per review).

    Returns:
        The generated ``WordCloud`` object. As a side effect, the total number
        of whitespace-separated words across all reviews is printed.
    """
    text = " ".join(review for review in data)
    # Count words, not characters: len(text) would report the string length,
    # which contradicts the message.
    print ("There are {} words in the combination of all reviews.".format(len(text.split())))
    stopwords_set = set(stopwords.words('english'))
    wordcloud = WordCloud(stopwords=stopwords_set, background_color="white").generate(text)
    return wordcloud

## Visualisation
- Small visualisation to check if cleaning went as expected

In [None]:
wordcloud = print_cloud(cleaned_reviews)
fig = plt.figure(1)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
X = cleaned_reviews
print(type(X))
print(X.shape)

In [None]:
y = df['sentiment']
print(type(y))
print(y.shape)
print(y.isnull().sum()) 

## Dropping NaN 
- Dropped those rows with NaN representations in sentiments based on the assumption their rating was a 3/5. Justified since the data loss is negligible when compared to that which remains

In [None]:
# Keep only the rows that have a sentiment label, and select the matching
# reviews by index label so X and y stay aligned.
labelled_index = y.dropna().index
y = y.loc[labelled_index]
X = X.loc[labelled_index]
print(X.shape, y.shape, sep='\n')

In [None]:
def text_fit(X, y, model, clf_model, coef_show=1):
    """Vectorise text, fit a classifier and print recall / F1 on a held-out split.

    Args:
        X: Collection of documents (strings).
        y: Binary labels aligned with ``X``.
        model: Vectoriser exposing ``fit_transform``, ``transform`` and
            ``get_feature_names_out``.
        clf_model: Estimator exposing ``fit`` and ``predict``; it must also
            expose ``coef_`` when ``coef_show == 1``.
        coef_show: When ``1``, print the 20 largest and 20 smallest
            coefficients together with their feature names.

    Returns:
        None. All results are printed.
    """
    # Split the raw documents first and fit the vectoriser on the training
    # part only. Fitting it on all rows before splitting lets the test
    # documents shape the feature space the classifier is trained on.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=0)
    X_train = model.fit_transform(X_train_raw)
    X_test = model.transform(X_test_raw)
    print('# features: {}'.format(X_train.shape[1]))
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    clf = clf_model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    recall = recall_score(y_test,y_pred)
    print ('Model Recall: {}'.format(recall))
    print ('Model F1-Score: {}'.format(f1_score(y_test,y_pred)))
    if coef_show == 1:
        w = model.get_feature_names_out()
        # First row of coef_: the single coefficient vector of a binary linear model.
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        # Largest coefficients first; ties are broken alphabetically by word.
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[False, True])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')
        print(coeff_df.tail(20).to_string(index=False))

## Interpretation
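- A recall of approximately 0.99 indicates that the model is very effective at identifying positive sentiments in the reviews. Because the classes are largely imbalanced, a high recall for one class alone does not show that the model separates the two classes well; the negative class should be checked too.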
- The output lists the top 20 words (or n-grams) that are most strongly associated with positive sentiment, along with their coefficients.
- The words listed here are the strongest indicators of positive sentiment in the reviews. The coefficients represent the weight of each word in the logistic regression model.

In [None]:
tfidf_n = TfidfVectorizer(ngram_range=(1,2), stop_words = 'english')
text_fit(X, y, tfidf_n, LogisticRegression())

In [None]:
y.head(3)

In [None]:
# Features are the cleaned review strings.
reviews = X
# Labels as a plain integer array: 1 where the sentiment equals 1, otherwise 0.
# Vectorised replacement for the per-element lambda; the empty-list
# initialisations were overwritten immediately and have been removed.
sentiments = np.where(y == 1, 1, 0)

In [None]:
print(sentiments)

In [None]:
len(reviews), len(sentiments)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(reviews, sentiments, 
                                                test_size=0.20, random_state=42)

In [None]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [None]:
tokenizer_json = word_tokenizer.to_json()
with io.open('embedded.json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [None]:
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

In [None]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
# The rows fed to the sampler are padded sequences of vocabulary indices, i.e.
# categorical ids. SMOTE builds new rows by interpolating between neighbouring
# rows, which would produce arbitrary word ids. Random oversampling duplicates
# existing minority rows instead, so every resampled row is a real review.
# The variable keeps the name `smote` because later cells call smote.fit_resample.
from imblearn.over_sampling import RandomOverSampler

smote = RandomOverSampler(sampling_strategy='minority', random_state=42)

In [None]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_resampled, y_resampled = smote.fit_resample(X_train_flat, y_train)

In [None]:
# Evaluate on the untouched test split. Oversampling is a training-time remedy
# for class imbalance; resampling the test rows would make the reported metrics
# describe a class balance (and rows) that the real data does not have.
# The *_resampled names are kept so the evaluation cells that use them still run.
X_test_flat = X_test.reshape(X_test.shape[0], -1)
X_test_resampled, y_test_resampled = X_test_flat, y_test

In [None]:
from numpy import asarray
from numpy import zeros

# Maps each word in the GloVe file to its embedding vector (float32 array).
embeddings_dictionary = dict()

# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        # First field is the word; the remaining fields are the vector components.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# One row per tokenizer index, 100 columns. Rows stay all-zero for words that
# have no entry in the GloVe dictionary.
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]

In [None]:
embedding_matrix.shape

In [None]:
from keras.layers import LSTM

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], trainable=False)

# Embedding -> LSTM(128) -> single sigmoid unit for the binary sentiment output.
lstm_model = Sequential([
    embedding_layer,
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', 
                    metrics=['acc'])

In [None]:
lstm_model_history = lstm_model.fit(X_resampled, y_resampled, batch_size=128, epochs=8, 
                                        verbose=1,validation_split=0.2)

In [None]:
y_pred = lstm_model.predict(X_test_resampled)

In [None]:
# Round the predicted probabilities to the nearest integer label and flatten
# the result to a 1-D integer vector.
y_pred = np.rint(y_pred).astype(int).ravel()
y_pred

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_resampled, y_pred))

In [None]:
score = lstm_model.evaluate(X_test_resampled, y_test_resampled, verbose=1)

In [None]:
lstm_model.save('lstm_model.keras')

In [None]:
import matplotlib.pyplot as plt

# Plot training vs. validation curves for each recorded metric.
# Each pair is (key in the Keras history dict, label used for title and y-axis).
for metric, label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(lstm_model_history.history[metric])
    plt.plot(lstm_model_history.history['val_' + metric])

    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    # The second curve comes from the validation_split passed to fit(), which is
    # held out from the training data. It is not the test set.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()