In [None]:
# Anthony Scampini
# 10/27/24
# D213: Advanced Data Analytics Task 2
# Western Governors University

In [None]:
# Import libraries

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Load the three labelled review files into one list of [review, sentiment] records.
# Each line of a file holds the review text, a tab, and the sentiment label.

filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']
lists = []

for filename in filenames:
    # Pin the encoding: without it open() falls back to the platform default,
    # so the same file could decode differently on different machines.
    with open(filename, encoding='utf-8') as data:
        for line in data:
            # Split on the last tab only, so a tab inside the review text
            # cannot produce a third field and break the two-column layout.
            record = line.rsplit('\t', 1)
            lists.append(record)

print(lists[:5])

In [None]:
# Every sentiment label still carries the "\n" that ended its line in the source file.
# Remove it so the second field of each record is just the label.

for idx in range(len(lists)):
    lists[idx][1] = lists[idx][1].replace('\n', '')

print(lists[:5])

In [None]:
# Collect every distinct character found in the records, in order of first appearance,
# to see which special characters are present.

# dict.fromkeys keeps the first occurrence of each key and preserves insertion order.
char_list = list(dict.fromkeys(
    chars
    for record in lists
    for words in record
    for chars in words
))

print(char_list)

In [None]:
# The data contains upper- and lower-case letters, digits and special characters.
# Each review is lower-cased, and every character that is not a letter, digit or
# whitespace is replaced with a space.

# Raw string: "\s" inside a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. The compiled pattern is unchanged.
non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')

for record in lists:
    record[0] = non_alphanumeric.sub(' ', record[0].lower())

# Re-scan the distinct characters (first-appearance order) to confirm the clean-up.
char_list = list(dict.fromkeys(
    chars
    for record in lists
    for words in record
    for chars in words
))

print(char_list)

print(lists[:10])

In [None]:
# Replacing special characters with spaces left runs of two or more spaces inside
# reviews and stray spaces at their ends. Collapse each run to a single space and
# strip leading/trailing whitespace.

space_run = re.compile(' +')

for record in lists:
    record[0] = space_run.sub(' ', record[0]).strip()

print(lists[:10])

In [None]:
# Stopwords will be removed from the reviews to improve model efficiency.
# Download the NLTK stopword corpus and show the English list.

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
print(english_stopwords)

In [None]:
# The reviews no longer contain apostrophes, so a stopword that includes one could
# never match. Strip the single quotes from every stopword before comparing.

cleaned_stopwords = [word.replace('\'', '') for word in stopwords.words('english')]

print(cleaned_stopwords)

In [None]:
# Remove the stopwords from every review.

# Build the lookup set once: testing membership against the list would rescan the
# whole stopword list for every word of every review. The result is identical.
stopword_set = set(cleaned_stopwords)

for record in lists:
    record[0] = ' '.join(word for word in record[0].split() if word not in stopword_set)

print(lists[:10])

In [None]:
# Drop every word that is only one character long.

for record in lists:
    kept_words = []
    for word in record[0].split():
        if len(word) > 1:
            kept_words.append(word)
    record[0] = ' '.join(kept_words)

In [None]:
# Load the cleaned records into a two-column DataFrame and preview the first ten rows.

column_names = ['Review', 'Sentiment']
reviews = pd.DataFrame(data=lists, columns=column_names)

preview = reviews.head(10)
print(preview.to_string(index=False))

In [None]:
# Report the number of null values in each column of the DataFrame.

missing_counts = reviews.isnull().sum()
for column_name, missing in missing_counts.items():
    print('Total missing values in variable %s is: ' % column_name + str(missing))

In [None]:
# Count the occurrences of each sentiment value to confirm only 0 and 1 are present.

sentiment_counts = reviews['Sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Fit a tokenizer on the reviews to obtain the vocabulary size needed for the model.
# One is added to the vocabulary size to account for the padding value.

tokenized = Tokenizer()
tokenized.fit_on_texts(reviews['Review'])

word_to_index = tokenized.word_index
print('Vocabulary size: ', len(word_to_index) + 1)
for word, index in word_to_index.items():
    print(index, ':', word)

In [None]:
# Find the max, min, median and mean number of words per review. (Elleh, n.d.)

# split() with no argument returns [] for an empty review, so a review that lost all
# of its words during cleaning counts as 0 words. split(" ") would return [''] and
# report it as 1 word.
review_len = [len(review.split()) for review in reviews.Review]

print('Maximum length of sequences: ', np.max(review_len))
print('Minimum length of sequences: ', np.min(review_len))
print('Median length of sequences: ', round(np.median(review_len)))
print('Mean length of sequences: ', round(np.mean(review_len)))

In [None]:
# The maximum sequence length found above is used later for padding.
# The word-embedding length is taken as the fourth root of the vocabulary size
# (number of distinct words plus one for the padding value).

vocab_size = 1 + len(tokenized.word_index)
embed_size = vocab_size ** (1 / 4)
print('Embedding size: ', embed_size)

In [None]:
# With the vocabulary size, maximum length and embedding size known, the reviews are
# encoded so that each word is represented by an integer.

# num_words is tied to vocab_size rather than a hard-coded number: the model's Embedding
# layer is built with vocab_size, and the tokenizer only emits word indices below
# num_words, so the encoded indices cannot fall outside the embedding table if the
# data (and therefore the vocabulary) changes.
vocab_tokenized = Tokenizer(num_words=vocab_size, oov_token='OOV')
vocab_tokenized.fit_on_texts(reviews.Review)
encoded_reviews = vocab_tokenized.texts_to_sequences(reviews.Review)

# Show a few reviews before and after encoding, separated by blank lines.
sample_indices = [1, 20, 50, 100]
for position, idx in enumerate(sample_indices):
    if position:
        print('')
    print('Before: ', lists[idx][0])
    print('After: ', encoded_reviews[idx])

In [None]:
# Pad the encoded reviews so that every record has the same length.
# The target length is the longest encoded review, taken from the data instead of a
# hard-coded number, so no review is truncated if the data changes.
# NOTE(review): the earlier length check reported 44 for the current data - confirm
# the derived value matches.

max_sequence_length = max(len(sequence) for sequence in encoded_reviews)
padded_reviews = pd.DataFrame(pad_sequences(encoded_reviews, maxlen=max_sequence_length))

In [None]:
# Output the padded reviews: one row per review, one column per sequence position.
# As the last expression of the cell, the DataFrame is displayed by the notebook.

padded_reviews

In [None]:
# Split the padded reviews and their integer sentiment labels into training (80%)
# and testing (20%) sets.
# random_state is fixed so that the split, the exported CSV files and the reported
# accuracy are reproducible when the notebook is re-run.

sentiment_labels = reviews.Sentiment.astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    padded_reviews,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Write the training and testing splits to CSV files for the assessment.

exports = {
    'd213_task2_Scampini_Xtrain.csv': X_train,
    'd213_task2_Scampini_Xtest.csv': X_test,
    'd213_task2_Scampini_ytrain.csv': y_train,
    'd213_task2_Scampini_ytest.csv': y_test,
}

for csv_name, split in exports.items():
    split.to_csv(csv_name, index=False)

In [None]:
# Build a sequential TensorFlow model with five layers:
# Embedding, Pooling, Dense ReLU, Dense Softmax, and Dense Sigmoid.
# 24 nodes for the ReLU and Softmax layers were found to be optimal in experimentation.
# (Elleh, n.d.)

model = Sequential([
    tf.keras.layers.Embedding(vocab_size, round(embed_size)),  # embed_size is a float; round() yields an int
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(24, activation='softmax'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once the loss has not improved for two epochs and restore the best weights.
# NOTE(review): no validation data is passed to fit(), so 'loss' is the training loss;
# consider a validation split with monitor='val_loss' - confirm against the assessment.
early_stop = EarlyStopping(monitor='loss', patience=2, restore_best_weights=True)

# callbacks is documented as a list of callback instances, so the callback is wrapped in one.
history = model.fit(X_train, y_train, epochs=30, batch_size=48,
                    callbacks=[early_stop],
                    verbose=True)
model.summary()

In [None]:
# Draw a line graph to visualize the training accuracy and loss across the epochs.

fig, ax = plt.subplots(figsize=[10, 4])
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    ax.plot(history.history[metric_key], label=metric_label)
ax.set_xlabel('Epochs')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Evaluate the trained model on the held-out test split. (Elleh, n.d.)

result = model.evaluate(x=X_test, y=y_test)

In [None]:
# Save the trained model to disk.

model_path = "D213_Task2_Scampini_Model.keras"
model.save(model_path)