# Sentiment Analysis (SA) Project
## Depedri Kevin
### Mat: 229358

# 0.0 Import of the required libraries
Before starting the project, all the required libraries are imported here, giving also a short description of the task they will be used for

In [2]:
# Loading of the datasets, slicing of dictionaries, substitution inside strings
import nltk
from nltk.corpus import movie_reviews as mr
from nltk.corpus import subjectivity as subj
import itertools
import re

# Management of stopwords
from nltk.corpus import stopwords

# Tools for NLP
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer

# Management and visualization of data
import numpy as np
import pandas as pd
from IPython.display import display

# Plot of charts and results
import plotly.express as px
import cufflinks as cf
from sklearn import metrics
from sklearn.metrics import classification_report

# Split of data
from sklearn.model_selection import train_test_split

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Neural Network
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [3]:
# Download missing data
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('subjectivity')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Library settings
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is alrea

# 1.0 Movie-Review Dataset

## 1.1 Import and organization of data

In [4]:
# Print the total number of reviews present in the dataset
mr_dataset_length = len(mr.fileids())
print("Total number of reviews present in the dataset:", mr_dataset_length)

# Extract the ids of each review file present in the dataset, we will use this information in the following Dataframes
mr_dataset_file_ids = mr.fileids()

# Of these, we know that the first 1000 are negative, while the last 1000 reviews are positive
negative_mr_file_ids = mr_dataset_file_ids[:1000]
positive_mr_file_ids = mr_dataset_file_ids[1000:]

# Extract from the dataset the negative reviews and put them inside a pandas Dataframe and inside two lists where each review is divided in sentences or in words
negative_reviews_dataframe = pd.DataFrame(columns=["File Name", "Review Text", "Category"])
negative_reviews_sents = []
negative_reviews_words = []
negative_reviews_len = len(negative_mr_file_ids)

for i in range(negative_reviews_len):
    # Take a slice of the name as id
    negative_reviews_dataframe.loc[i, "File Name"] = negative_mr_file_ids[i][4:-4]
    # Insert raw text into the dataframe
    negative_reviews_dataframe.loc[i, "Review Text"] = mr.raw(negative_mr_file_ids[i])
    # Assign the correct category to the review
    negative_reviews_dataframe.loc[i, "Category"] = mr.categories(negative_mr_file_ids[i])[0]
    # Extract the already processed sents => General list, composed of reviews(second list) which are divided in sentences (third list)
    # Useful since it allows us to have each review divided in sentences. We will use it when it is time to remove the objective sentences
    negative_reviews_sents.append(mr.sents(negative_mr_file_ids[i]))
    # Extract the already processed words => General list, composed of reviews built as list of words (second list)
    negative_reviews_words.append(mr.words(negative_mr_file_ids[i]))
# Display the generated dataframe for negative reviews
print("Resulting dataframe containing {} negative reviews:".format(negative_reviews_len))
display(negative_reviews_dataframe)

# Extract from the dataset the positive reviews and put them inside a pandas Dataframe and inside two lists where each review is divided in sentences or in words, as seen above
positive_reviews_dataframe = pd.DataFrame(columns=["File Name", "Review Text", "Category"])
positive_reviews_sents = []
positive_reviews_words = []
positive_reviews_len = len(positive_mr_file_ids)

for i in range(positive_reviews_len):
    positive_reviews_dataframe.loc[i, "File Name"] = positive_mr_file_ids[i][4:-4]
    positive_reviews_dataframe.loc[i, "Review Text"] = mr.raw(positive_mr_file_ids[i])
    positive_reviews_dataframe.loc[i, "Category"] = mr.categories(positive_mr_file_ids[i])[0]
    positive_reviews_sents.append(mr.sents(positive_mr_file_ids[i]))
    positive_reviews_words.append(mr.words(positive_mr_file_ids[i]))
# Display the generated dataframe for positive reviews
print("Resulting dataframe containing {} positive reviews:".format(positive_reviews_len))
display(positive_reviews_dataframe)

Total number of reviews present in the dataset: 2000
Resulting dataframe containing 1000 negative reviews:


Unnamed: 0,File Name,Review Text,Category
0,cv000_29416,"plot : two teen couples go to a church party ,...",neg
1,cv001_19502,the happy bastard's quick movie review \ndamn ...,neg
2,cv002_17424,it is movies like these that make a jaded movi...,neg
3,cv003_12683,""" quest for camelot "" is warner bros . ' firs...",neg
4,cv004_12641,synopsis : a mentally unstable man undergoing ...,neg
...,...,...,...
995,cv995_23113,"if anything , "" stigmata "" should be taken as ...",neg
996,cv996_12447,"john boorman's "" zardoz "" is a goofy cinematic...",neg
997,cv997_5152,the kids in the hall are an acquired taste . \...,neg
998,cv998_15691,there was a time when john carpenter was a gre...,neg


Resulting dataframe containing 1000 positive reviews:


Unnamed: 0,File Name,Review Text,Category
0,cv000_29590,films adapted from comic books have had plenty...,pos
1,cv001_18431,every now and then a movie comes along from a ...,pos
2,cv002_15918,you've got mail works alot better than it dese...,pos
3,cv003_11664,""" jaws "" is a rare film that grabs your atten...",pos
4,cv004_11636,moviemaking is a lot like being the general ma...,pos
...,...,...,...
995,cv995_21821,wow ! what a movie . \nit's everything a movie...,pos
996,cv996_11592,"richard gere can be a commanding actor , but h...",pos
997,cv997_5046,"glory--starring matthew broderick , denzel was...",pos
998,cv998_14111,steven spielberg's second epic film on world w...,pos


In [5]:
# Merge the two previous dataframes into a single dataframe containing all the reviews
all_reviews_dataframe = pd.concat([negative_reviews_dataframe, positive_reviews_dataframe], ignore_index=True)
# Show the obtained dataframe, this represents all the movie_review dataset
print("Resulting dataframe containing all {} reviews:".format(len(all_reviews_dataframe)))
display(all_reviews_dataframe)

# Merge also the previous list into two complete list, one for sentences and one for word, this will be useful in the subjectivity classification step
all_reviews_sents = negative_reviews_sents + positive_reviews_sents
all_reviews_words = negative_reviews_words + positive_reviews_words

Resulting dataframe containing all 2000 reviews:


Unnamed: 0,File Name,Review Text,Category
0,cv000_29416,"plot : two teen couples go to a church party ,...",neg
1,cv001_19502,the happy bastard's quick movie review \ndamn ...,neg
2,cv002_17424,it is movies like these that make a jaded movi...,neg
3,cv003_12683,""" quest for camelot "" is warner bros . ' firs...",neg
4,cv004_12641,synopsis : a mentally unstable man undergoing ...,neg
...,...,...,...
1995,cv995_21821,wow ! what a movie . \nit's everything a movie...,pos
1996,cv996_11592,"richard gere can be a commanding actor , but h...",pos
1997,cv997_5046,"glory--starring matthew broderick , denzel was...",pos
1998,cv998_14111,steven spielberg's second epic film on world w...,pos


## 1.2 Descriptive statistics

### 1.2.1 Reviews general statistics

In [6]:
def general_dataframe_statistic(number_reviews, review_sents, review_words, category, output_data=False):
    """
    Compute some general statistics for the given dataframe data
    :param number_reviews: number of reviews in the dataframe
    :param review_sents: sentences present in the reviews
    :param review_words: words present in the reviews
    :param category: category of the reviews
    :param output_data: if true, return the computed statistics
    :return: list with the computed values if output_data=True
    """
    # Extract words and sents from the list created in the previous point, then count their total number
    print("Total number of {} reviews: {}".format(category, number_reviews))
    used_words = [word for i in range(number_reviews) for word in review_words[i]]
    total_number_words = len(used_words)
    used_sents = [sent for i in range(number_reviews) for sent in review_sents[i]]
    total_number_sentences = len(used_sents)

    # Compute the wanted statistics
    average_words_per_sentence = round(total_number_words / total_number_sentences)
    average_words_per_review = round(total_number_words / number_reviews)
    average_sentences_per_review = round(total_number_sentences / number_reviews)

    # Print the obtained results
    print("Total number of words: {}".format(total_number_words))
    print("Total number of sentences: {}".format(total_number_sentences))
    print("Average number of words per sentence: {}\n".format(average_words_per_sentence))
    print("Average number of words per review: {}".format(average_words_per_review))
    print("Average number of sentences per review: {}".format(average_sentences_per_review))

    if output_data:
        return [number_reviews, total_number_words, total_number_sentences, average_words_per_sentence, average_words_per_review, average_sentences_per_review]

In [7]:
general_negative_stats = general_dataframe_statistic(negative_reviews_len, negative_reviews_sents, negative_reviews_words, "negative", True)

Total number of negative reviews: 1000
Total number of words: 751256
Total number of sentences: 32025
Average number of words per sentence: 23

Average number of words per review: 751
Average number of sentences per review: 32


In [8]:
general_positive_stats = general_dataframe_statistic(positive_reviews_len, positive_reviews_sents, positive_reviews_words, "positive", True)

Total number of positive reviews: 1000
Total number of words: 832564
Total number of sentences: 33233
Average number of words per sentence: 25

Average number of words per review: 833
Average number of sentences per review: 33


In [9]:
# Create dataframe with statistics
general_stats = pd.DataFrame()
general_stats["Statistics"] = ["Total number of reviews", "Total number of words", "Total number of sentences", "Average number of words per sentence", "Average number of words per review", "Average number of sentences per review"]
general_stats["Negative"] = general_negative_stats
general_stats["Positive"] = general_positive_stats
general_stats = general_stats.set_index('Statistics')
general_stats

Unnamed: 0_level_0,Negative,Positive
Statistics,Unnamed: 1_level_1,Unnamed: 2_level_1
Total number of reviews,1000,1000
Total number of words,751256,832564
Total number of sentences,32025,33233
Average number of words per sentence,23,25
Average number of words per review,751,833
Average number of sentences per review,32,33


### 1.2.1 Reviews specific statistic

In [10]:
def specific_dataframe_statistic(dataframe, review_sents, review_words, category):
    """
    Compute some specific statistics for each review present in the given dataframe and update the dataframe with these new data
    :param dataframe: dataframe containing the reviews
    :param review_sents: sentences present in the reviews
    :param review_words: words present in the reviews
    :param category: category of the reviews
    :return:
    """
    # Add some new empty columns to the dataframe to save the data
    dataframe["# Chars"] = 0
    dataframe["# Sents"] = 0
    dataframe["# Words"] = 0
    dataframe["# Unique-Words"] = 0
    dataframe["# Stop-Words"] = 0
    dataframe["# Unique-Stop-Words"] = 0

    # Load the stop-words from NLTK
    stop_words = stopwords.words('english')

    # For each sentence in the dataframe compute number of: chars, sents, words, stop-words, unique-words and unique-stop-words
    for i in range(len(dataframe)):
        dataframe.loc[i, "# Chars"] = len(dataframe["Review Text"][i])
        dataframe.loc[i, "# Sents"] = len(review_sents[i])
        dataframe.loc[i, "# Words"] = len(review_words[i])
        dataframe.loc[i, "# Stop-Words"] = len([word for word in review_words[i] if word in stop_words])
        dataframe.loc[i, "# Unique-Words"] = len(set(review_words[i]))
        dataframe.loc[i, "# Unique-Stop-Words"] = len(set.intersection(set(review_words[i]), set(stop_words)))

    # Display the updated dataframe
    print("Resulting updated dataframe containing {} {} reviews:".format(len(dataframe), category))
    display(dataframe)

In [11]:
specific_dataframe_statistic(negative_reviews_dataframe, negative_reviews_sents, negative_reviews_words, "negative")

Resulting updated dataframe containing 1000 negative reviews:


Unnamed: 0,File Name,Review Text,Category,# Chars,# Sents,# Words,# Unique-Words,# Stop-Words,# Unique-Stop-Words
0,cv000_29416,"plot : two teen couples go to a church party ,...",neg,4043,35,879,354,383,86
1,cv001_19502,the happy bastard's quick movie review \ndamn ...,neg,1370,12,304,157,124,47
2,cv002_17424,it is movies like these that make a jaded movi...,neg,2848,24,581,280,234,68
3,cv003_12683,""" quest for camelot "" is warner bros . ' firs...",neg,2929,21,629,320,224,66
4,cv004_12641,synopsis : a mentally unstable man undergoing ...,neg,4418,38,901,385,353,76
...,...,...,...,...,...,...,...,...,...
995,cv995_23113,"if anything , "" stigmata "" should be taken as ...",neg,8356,70,1701,584,731,104
996,cv996_12447,"john boorman's "" zardoz "" is a goofy cinematic...",neg,6211,43,1251,527,481,83
997,cv997_5152,the kids in the hall are an acquired taste . \...,neg,2342,22,467,245,191,59
998,cv998_15691,there was a time when john carpenter was a gre...,neg,3183,26,683,297,264,71


In [12]:
specific_dataframe_statistic(positive_reviews_dataframe, positive_reviews_sents, positive_reviews_words, "positive")

Resulting updated dataframe containing 1000 positive reviews:


Unnamed: 0,File Name,Review Text,Category,# Chars,# Sents,# Words,# Unique-Words,# Stop-Words,# Unique-Stop-Words
0,cv000_29590,films adapted from comic books have had plenty...,pos,4227,27,862,428,313,76
1,cv001_18431,every now and then a movie comes along from a ...,pos,4096,38,814,320,321,78
2,cv002_15918,you've got mail works alot better than it dese...,pos,2421,19,495,253,222,70
3,cv003_11664,""" jaws "" is a rare film that grabs your atten...",pos,6092,41,1222,513,459,79
4,cv004_11636,moviemaking is a lot like being the general ma...,pos,3898,27,807,362,304,69
...,...,...,...,...,...,...,...,...,...
995,cv995_21821,wow ! what a movie . \nit's everything a movie...,pos,4241,47,889,332,392,76
996,cv996_11592,"richard gere can be a commanding actor , but h...",pos,1916,24,386,205,166,54
997,cv997_5046,"glory--starring matthew broderick , denzel was...",pos,6460,51,1232,515,476,72
998,cv998_14111,steven spielberg's second epic film on world w...,pos,3543,37,704,281,288,62


### 1.2.2 Reviews length analysis

In [109]:
def plot_histogram(neg_dataframe, pos_dataframe, number_bins):
    """
    Plot the distribution of the review length both for positive and negative reviews. This allows to compare them and to see if one polarity has a higher number of words per review than the other polarity
    :param neg_dataframe: dataframe containing the negative reviews
    :param pos_dataframe: dataframe containing the positive reviews
    :param number_bins: number of bins wanted in the final plot
    :return:
    """
    # Merge the required data from the two dataframe and plot a histogram for both the positive and negative reviews.
    # The single histograms can be visualized interacting with the chart under the legend 'variable'
    temp = pd.DataFrame()
    temp["neg"] = neg_dataframe['# Words']
    temp["pos"] = pos_dataframe['# Words']
    fig = px.histogram(temp, nbins=number_bins, barmode='overlay', labels={'value':'Number of words in each review'}, title='Number of words distribution for positive and negative reviews', marginal='box', template="plotly", width=800).update_layout(yaxis={"title": "Number of reviews"})
    fig.show()
# ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]
plot_histogram(negative_reviews_dataframe, positive_reviews_dataframe, 250)

### 1.2.3 Reviews word frequency analysis

In [110]:
def remove_punctuation(review_words):
    """
    Remove the punctuation from the given list of words
    :param review_words: list of words in input
    :return: list of words without the punctuation
    """
    punctuation_to_remove = {'.', ',', ';', ':', "'", '"', '*', '#', '!', '?', '-', '--', '_', '/', '(', ')', '[', ']', '{', '}'}
    return [word for word in review_words if word not in punctuation_to_remove]

def order_frequency(FreqDict, number_words=1):
    """
    Given a FrequencyDictionary sort its element in descending order and return a slice of a given dimension
    :param FreqDict: input FrequencyDictionary (not ordered)
    :param number_words: number of words that we want to be present in the ordered output FrequencyDictionary
    :return: FrequencyDictionary ordered in descending ordered (composed of number_words words)
    """
    return dict(sorted(FreqDict.items(), key=lambda item: item[1], reverse=True)[:number_words])

def most_frequent_terms(review_words, punctuation=False, remove_stopwords=False, n_word_plot=25):
    """
    Given a list of words, remove punctuation and stopwords(if required), then compute a FrequencyDictionary, order it and plot a slice of it as histogram
    :param review_words: list of words present in the review
    :param punctuation: if True the punctuation is removed (this allows us to have more clean and useful results)
    :param remove_stopwords: if True the english stop-words are removed (this allows us to have more meaningful results)
    :param n_word_plot: number of words that we want to show in the histogram plot
    :return: FrequencyDictionary without punctuation and stopwords, list of all words with their frequency
    """
    # Extract the words from the given list of words
    used_words = [word for i in range(len(review_words)) for word in review_words[i]]

    # If required remove punctuation
    if punctuation:
        used_words = remove_punctuation(used_words)

    # If required remove stop-words
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        used_words = [word for word in used_words if word not in stop_words]

    # Create the frequency list with all the terms present in the list of words
    reviews_freq = nltk.FreqDist(used_words)
    print("Found {} word occurrences for a total of {} unique words:".format(len(used_words), len(reviews_freq)))
    ordered_reviews_freq = order_frequency(reviews_freq, len(reviews_freq))
    display(ordered_reviews_freq)

    # Order and slice the dictionary taking only the number of words that we want to plot in the histogram
    data_for_hist = dict(itertools.islice(ordered_reviews_freq.items(), n_word_plot))
    print("Top {} most frequent words shown in the histogram: \n{}".format(n_word_plot, data_for_hist))
    fig = px.bar(x=data_for_hist.keys(), y=data_for_hist.values(), labels={'x':"Word", 'y':"Number of occurrences"}, template="plotly", width=800)
    fig.show()
    return ordered_reviews_freq

# Number of words shown in the next histograms
number_terms = 25

#### 1.2.3.a Reviews analysis with stop-words

In [111]:
neg_word_freq_w_stopwords = most_frequent_terms(negative_reviews_words, True, False, number_terms)

Found 632512 word occurrences for a total of 28462 unique words:


{'the': 35058,
 'a': 17910,
 'and': 15680,
 'of': 15487,
 'to': 15420,
 'is': 11136,
 'in': 10097,
 's': 8854,
 'that': 7803,
 'it': 7756,
 'with': 4941,
 'this': 4930,
 'as': 4900,
 'i': 4787,
 'for': 4701,
 'film': 4287,
 'but': 4142,
 'his': 3999,
 'he': 3928,
 'on': 3658,
 't': 3555,
 'movie': 3246,
 'are': 3236,
 'be': 3145,
 'one': 2800,
 'by': 2795,
 'you': 2722,
 'an': 2692,
 'have': 2661,
 'not': 2651,
 'who': 2635,
 'they': 2493,
 'at': 2491,
 'was': 2463,
 'from': 2268,
 'all': 2190,
 'has': 2155,
 'her': 2066,
 'there': 2044,
 'so': 1975,
 'like': 1888,
 'out': 1835,
 'about': 1802,
 'up': 1741,
 'or': 1655,
 'what': 1625,
 'when': 1584,
 'just': 1563,
 'if': 1550,
 'some': 1520,
 'more': 1506,
 'she': 1493,
 'can': 1457,
 'their': 1428,
 'which': 1418,
 'no': 1411,
 'even': 1386,
 'only': 1357,
 'we': 1324,
 'into': 1260,
 'time': 1168,
 'than': 1166,
 'good': 1163,
 'him': 1155,
 'would': 1090,
 'been': 1068,
 'get': 1052,
 'do': 1037,
 'bad': 1034,
 'much': 1011,
 'its':

Top 25 most frequent words shown in the histogram: 
{'the': 35058, 'a': 17910, 'and': 15680, 'of': 15487, 'to': 15420, 'is': 11136, 'in': 10097, 's': 8854, 'that': 7803, 'it': 7756, 'with': 4941, 'this': 4930, 'as': 4900, 'i': 4787, 'for': 4701, 'film': 4287, 'but': 4142, 'his': 3999, 'he': 3928, 'on': 3658, 't': 3555, 'movie': 3246, 'are': 3236, 'be': 3145, 'one': 2800}


In [112]:
pos_word_freq_w_stopwords = most_frequent_terms(positive_reviews_words, True, False, number_terms)

Found 706052 word occurrences for a total of 30397 unique words:


{'the': 41471,
 'a': 20196,
 'and': 19896,
 'of': 18636,
 'to': 16517,
 'is': 14059,
 'in': 11725,
 's': 9659,
 'it': 8351,
 'that': 8121,
 'as': 6478,
 'with': 5851,
 'his': 5588,
 'for': 5260,
 'film': 5230,
 'he': 4936,
 'this': 4648,
 'but': 4492,
 'i': 4102,
 'on': 3727,
 'are': 3713,
 'by': 3466,
 'who': 3057,
 'an': 3052,
 'one': 3052,
 'be': 3029,
 'not': 2926,
 't': 2855,
 'from': 2731,
 'you': 2594,
 'has': 2564,
 'movie': 2525,
 'at': 2495,
 'was': 2477,
 'her': 2456,
 'they': 2332,
 'have': 2240,
 'all': 2183,
 'more': 1841,
 'like': 1802,
 'out': 1802,
 'which': 1743,
 'there': 1726,
 'about': 1721,
 'so': 1708,
 'what': 1697,
 'their': 1694,
 'when': 1674,
 'up': 1664,
 'she': 1648,
 'or': 1493,
 'him': 1478,
 'some': 1465,
 'we': 1451,
 'can': 1425,
 'most': 1391,
 'into': 1363,
 'just': 1342,
 'than': 1308,
 'its': 1276,
 'will': 1272,
 'if': 1249,
 'good': 1248,
 'story': 1246,
 'time': 1243,
 'also': 1200,
 'even': 1179,
 'only': 1138,
 'very': 1130,
 'well': 1123,
 '

Top 25 most frequent words shown in the histogram: 
{'the': 41471, 'a': 20196, 'and': 19896, 'of': 18636, 'to': 16517, 'is': 14059, 'in': 11725, 's': 9659, 'it': 8351, 'that': 8121, 'as': 6478, 'with': 5851, 'his': 5588, 'for': 5260, 'film': 5230, 'he': 4936, 'this': 4648, 'but': 4492, 'i': 4102, 'on': 3727, 'are': 3713, 'by': 3466, 'who': 3057, 'an': 3052, 'one': 3052}


#### 1.2.3.b Reviews analysis without stop-words

In [113]:
neg_word_freq_wo_stopwords = most_frequent_terms(negative_reviews_words, True, True, number_terms)

Found 334299 word occurrences for a total of 28314 unique words:


{'film': 4287,
 'movie': 3246,
 'one': 2800,
 'like': 1888,
 'even': 1386,
 'time': 1168,
 'good': 1163,
 'would': 1090,
 'get': 1052,
 'bad': 1034,
 'much': 1011,
 'character': 942,
 'story': 923,
 'plot': 917,
 'two': 912,
 'characters': 873,
 'make': 851,
 'first': 832,
 'could': 791,
 'see': 784,
 'well': 783,
 'really': 781,
 'also': 767,
 'way': 764,
 'little': 726,
 'scene': 670,
 'people': 666,
 'never': 653,
 'films': 652,
 'action': 652,
 'director': 651,
 'know': 645,
 'scenes': 636,
 'man': 632,
 'big': 597,
 'movies': 571,
 'new': 569,
 'made': 567,
 'another': 555,
 'go': 552,
 'end': 531,
 'better': 531,
 'life': 529,
 'something': 529,
 'best': 504,
 'seems': 502,
 'work': 500,
 'every': 495,
 'nothing': 494,
 'back': 488,
 'many': 488,
 'us': 485,
 'enough': 484,
 'script': 478,
 'audience': 470,
 'going': 470,
 'around': 469,
 'still': 461,
 'think': 459,
 'love': 458,
 'funny': 451,
 'thing': 448,
 'gets': 442,
 'actually': 439,
 'look': 437,
 'makes': 431,
 'real': 

Top 25 most frequent words shown in the histogram: 
{'film': 4287, 'movie': 3246, 'one': 2800, 'like': 1888, 'even': 1386, 'time': 1168, 'good': 1163, 'would': 1090, 'get': 1052, 'bad': 1034, 'much': 1011, 'character': 942, 'story': 923, 'plot': 917, 'two': 912, 'characters': 873, 'make': 851, 'first': 832, 'could': 791, 'see': 784, 'well': 783, 'really': 781, 'also': 767, 'way': 764, 'little': 726}


In [114]:
pos_word_freq_wo_stopwords = most_frequent_terms(positive_reviews_words, True, True, number_terms)

Found 376055 word occurrences for a total of 30246 unique words:


{'film': 5230,
 'one': 3052,
 'movie': 2525,
 'like': 1802,
 'good': 1248,
 'story': 1246,
 'time': 1243,
 'also': 1200,
 'even': 1179,
 'well': 1123,
 'character': 1078,
 'life': 1057,
 'much': 1038,
 'would': 1019,
 'first': 1004,
 'two': 999,
 'characters': 986,
 'see': 965,
 'way': 929,
 'get': 897,
 'films': 884,
 'best': 829,
 'make': 791,
 'people': 789,
 'many': 780,
 'really': 777,
 'little': 775,
 'man': 764,
 'great': 751,
 'scene': 727,
 'new': 723,
 'never': 721,
 'world': 665,
 'love': 661,
 'scenes': 638,
 'could': 636,
 'movies': 635,
 'plot': 596,
 'us': 588,
 'director': 586,
 'still': 586,
 'back': 572,
 'know': 572,
 'however': 567,
 'another': 566,
 'go': 561,
 'makes': 561,
 'performance': 549,
 'seen': 541,
 'something': 532,
 'seems': 531,
 'end': 531,
 'work': 520,
 'action': 520,
 'made': 517,
 'though': 510,
 'may': 502,
 'role': 488,
 'years': 488,
 'although': 488,
 'year': 485,
 'real': 485,
 'take': 476,
 'big': 467,
 'old': 463,
 'almost': 455,
 'right':

Top 25 most frequent words shown in the histogram: 
{'film': 5230, 'one': 3052, 'movie': 2525, 'like': 1802, 'good': 1248, 'story': 1246, 'time': 1243, 'also': 1200, 'even': 1179, 'well': 1123, 'character': 1078, 'life': 1057, 'much': 1038, 'would': 1019, 'first': 1004, 'two': 999, 'characters': 986, 'see': 965, 'way': 929, 'get': 897, 'films': 884, 'best': 829, 'make': 791, 'people': 789, 'many': 780}


In [19]:
print("The stopwords that have been removed are:\n", (stopwords.words('english')))

The stopwords that have been removed are:
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'n

## 1.3 First classification phase (Baseline #1 without the objectivity-removal step and without pre-processing and fine-tuning)

In [20]:
# Definition of two lists, the former encompasses all the ground truths, while the latter encompasses all the reviews
ground_truth_vector = all_reviews_dataframe["Category"]
reviews_vector = all_reviews_dataframe["Review Text"]

# Convert to numpy and split of the dataset
reviews_vector = reviews_vector.to_numpy()
ground_truth_vector = ground_truth_vector.to_numpy()

# Define the number of iteration that will be used in all the next iterative procedure
K = 20

# Define functions to show in a more detailed fashion the results of the obtained prediction
def generate_confusion_matrix(references, predictions, square_shape=True, display_matrix=False):
    """
    Compute the confusion matrix, plot it and return it as a Pandas Dataframe
    :param references: list of ground truths that need to be compared with the predictions
    :param predictions: list of predictions that need to be compared with the ground truths
    :param square_shape: if True the confusion matrix is plotted in a squared fashion
    :param display_matrix: if True the matrix is displayed after the computation
    :return: confusion matrix (pandas Dataframe)
    """
    # Compute the confusion matrix
    matrix = metrics.confusion_matrix(references, predictions)

    # Decide if we want to represent it in a squared or linear shape
    if square_shape:
        confusion_matrix = pd.DataFrame(matrix, columns=['Positive', 'Negative'], index=['Positive', 'Negative'])
    else:
        confusion_matrix = pd.DataFrame(columns=['TN', 'FN', 'FP', 'TP'], index=[0]) # (matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1])
        confusion_matrix.iloc[0] = (matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1])

    # Eventually display it if required
    if display_matrix:
        display(confusion_matrix)
    return confusion_matrix

def plot_roc_compute_auc(classifier, references, test_elements, pos_label_roc):
    """
    Plot the ROC (receiver operating characteristic) and compute the AUC (area under the curve)
    :param references: list of ground truths
    :param test_elements: list of test elements (test inputs)
    :param pos_label_roc: label of the positive sample
    :return:
    """
    # Compute the class probability prediction
    classifier_y_pred_prob = classifier.predict_proba(test_elements)[::,1]

    # Extract False Positive Rate (fpr) and True Positive Rate (tpr) using the class probability prediction
    fpr, tpr, _ = metrics.roc_curve(references, classifier_y_pred_prob, pos_label=pos_label_roc)
    # Plot tpr over fpr to get the Receiver Operating Characteristic (ROC)
    fig = px.line(x=fpr, y=tpr, title='Receiver Operating Characteristic', labels={'x':'False Positive Rate', 'y':'True Positive Rate'}, template="plotly", width=800)
    fig.show()

    # Compute the Area Under the Curve (AUC) of the previous ROC curve
    auc = round(metrics.roc_auc_score(references, classifier_y_pred_prob), 5)
    display("AUC: {}".format(auc))

def plot_accuracy_distribution_histogram(accuracy_values, number_of_folds):
    """
    Plot a histogram to show how the different accuracy values obtained in the K-fold iterative procedure are distributed
    :param accuracy_values: list of accuracy values obtained in the iterative procedure
    :param number_of_folds: number of folds in the iterative procedure
    :return:
    """
    # Compute and plot the histogram
    fig = px.histogram(accuracy_values, nbins=20*number_of_folds, labels={'value':'Accuracy'}, title='Accuracy distribution over {} folds'.format(number_of_folds), marginal='violin', template="plotly", width=800).update_layout(yaxis={"title": "Number of folds"}, showlegend=False)
    fig.show()

def prediction_analysis(classifier, references, predictions, test_elements, pos_label_roc, accuracy_values, number_of_folds):
    """
    Perform a complete analysis over the obtained results using tools such as confusion matrix, ROC, AUC and accuracy distribution histogram.
    :param references: list of ground truths that need to be compared with the predictions
    :param predictions: list of predictions that need to be compared with the ground truths
    :param test_elements: list of test elements (test inputs)
    :param pos_label_roc: label of the positive sample
    :param accuracy_values: list of accuracy values obtained in the iterative procedure
    :param number_of_folds: number of folds in the iterative procedure
    :return:
    """
    # Compute confusion matrix and display it
    display("Confusion matrix for the latest prediction:")
    generate_confusion_matrix(references, predictions, True, True)

    # Compute ROC and AUC and plot them
    display("Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:")
    plot_roc_compute_auc(classifier, references, test_elements, pos_label_roc)

    # Plot the Accuracy Distribution over K-Folds
    display("Accuracy distribution over {} folds:".format(number_of_folds))
    plot_accuracy_distribution_histogram(accuracy_values, number_of_folds)

In [39]:
# Create DataFrame where the results of all the classification procedures will be saved
classification_results = pd.DataFrame()
classification_results["Type of Processing"] = ["Raw Text - FullDict", "Raw Text - 80% Dict", "Object Removal (OR)", "OR + PreProcessing1", "OR + PreProcessing2", "OR + PreProcessing3",
                                       "OR + PreProcessing4", "OR + PreProcessing5"]
classification_results = classification_results.set_index('Type of Processing')
classification_results["Multinomial Naive-Bayes Accuracy (%)"] = np.nan
classification_results["Bi-directional LSTM Accuracy (%)"] = np.nan
classification_results["Bi-directional LSTM-ATT Accuracy (%)"] = np.nan
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,,,
Raw Text - 80% Dict,,,
Object Removal (OR),,,
OR + PreProcessing1,,,
OR + PreProcessing2,,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


### 1.3.1 Classification using all the data to build the dictionary

In [40]:
# Perform for the first time a sentiment classification of all the reviews, but using 100% of the data to build the dictionary of the vectorizer.

# Initialization of vectorizer and classifier
vectorizer = CountVectorizer(lowercase=True)
classifier = MultinomialNB()

# Define variables to save the results
total_accuracy, average_accuracy, i = 0, 0, 0
test_reference, prediction, test_vector = 0, 0, 0
accuracy_vector = []

# Build the dictionary of the vectorizer using all the reviews (100% of data)
vectorizer.fit(reviews_vector)

# Run the iterative procedure
for x in range(K):
    i += 1
    # Split the dataset in train set (80%) and test set (20%) with shuffle=True
    # Here the dictionary used will encompass all the information (since it has been fitted on all the reviews), this should theoretically lead to higher but misleading results
    X_train, X_test, train_reference, test_reference = train_test_split(reviews_vector, ground_truth_vector, train_size=0.8, shuffle=True, stratify=ground_truth_vector)

    # Return vectorized reviews for the train-set based on the dictionary learnt over all the dataset
    train_vector = vectorizer.transform(X_train).toarray()
    # Return vectorized reviews for the test-set based on the dictionary learnt over all the dataset
    test_vector = vectorizer.transform(X_test).toarray()

    # Train (or fit) the MultinomialBN model over the train vector
    classifier.fit(train_vector, train_reference)

    # Predict labels for the test vector
    prediction = classifier.predict(test_vector)

    # Evaluate the model over the test vector
    accuracy = metrics.accuracy_score(test_reference, prediction)
    accuracy_vector.append(accuracy)

    # Print current iteration accuracy and overall average accuracy
    print("Accuracy for iteration number {}/{}: {:.3}".format(i, K, accuracy))
    total_accuracy += accuracy
    average_accuracy = total_accuracy / i
    print("Average accuracy over all iterations: {:.3}".format(average_accuracy))
    print("------------------------------------------------")

# Perform a detailed analysis for the latest prediction
prediction_analysis(classifier, test_reference, prediction, test_vector, 'pos', accuracy_vector, K)

Accuracy for iteration number 1/20: 0.82
Average accuracy over all iterations: 0.82
------------------------------------------------
Accuracy for iteration number 2/20: 0.83
Average accuracy over all iterations: 0.825
------------------------------------------------
Accuracy for iteration number 3/20: 0.802
Average accuracy over all iterations: 0.817
------------------------------------------------
Accuracy for iteration number 4/20: 0.82
Average accuracy over all iterations: 0.818
------------------------------------------------
Accuracy for iteration number 5/20: 0.818
Average accuracy over all iterations: 0.818
------------------------------------------------
Accuracy for iteration number 6/20: 0.828
Average accuracy over all iterations: 0.82
------------------------------------------------
Accuracy for iteration number 7/20: 0.807
Average accuracy over all iterations: 0.818
------------------------------------------------
Accuracy for iteration number 8/20: 0.802
Average accuracy o

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,165,33
Negative,28,174


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.89593'

'Accuracy distribution over 20 folds:'

In [42]:
# Add the result to the Dataframe
average_accuracy_rawtext_fulldict = average_accuracy
classification_results.loc["Raw Text - FullDict", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_rawtext_fulldict, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,,,
Object Removal (OR),,,
OR + PreProcessing1,,,
OR + PreProcessing2,,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


#### 1.3.2 Classification using only the test set (80% of the data) to build the dictionary

In [31]:
def split_vectorize_fit_predict(input_vector, reference_vector, iterations, pos_label_roc, return_avg_accuracy=False, return_classifier_vectorizer=False):
    """
    Initialize vectorizer and classifier (Multinomial Naive Bayes), then for the specified number of iterations:
    1) Run the procedure of splitting in train-set (80% of data) and test-set (20% of data).
    2) Fit the vectorizer on the train set
    3) Convert the train-set and the test-set in sequences using the previously fitted vectorizer
    4) Train and test the model
    5) Print accuracy of the current iteration and update the average accuracy
    :param input_vector: input data (list of reviews)
    :param reference_vector: ground truths for the input data
    :param iterations: number of iterations required
    :param pos_label_roc: label of the positive sample for the plot of the ROC curve
    :param return_avg_accuracy: if true the final average accuracy is returned
    :param return_classifier_vectorizer: if true the trained vectorizer and classifier of the latest iteration are returned
    :return: print of accuracy and average accuracy, Plot of ROC, AUC and confusion matrix. If required returns the average accuracy, the vectorizer and the classifier
    """
    # Initialization of vectorizer and classifier
    vectorizer = CountVectorizer(lowercase=True)
    classifier = MultinomialNB()

    # Define variables to save the results
    total_accuracy, average_accuracy, i = 0, 0, 0
    test_reference, prediction, test_vector = 0, 0, 0
    accuracy_vector = []

    # Run the iterative procedure
    for x in range(iterations):
        i += 1
        # Split the dataset in train set (80%) and test set (20%) with shuffle=True to randomize the results and with stratify option enabled to generate balanced splits.
        # Here the dictionary used will encompass only 80% of the information (since it has been fitted on the train set), leading to more accurate test performance measurements
        X_train, X_test, train_reference, test_reference = train_test_split(input_vector, reference_vector, train_size=0.8, shuffle=True, stratify=reference_vector)

        # Learn the words dictionary using all the reviews contained in the training-set
        train_vector = vectorizer.fit_transform(X_train).toarray()
        # Return vectorized reviews for the test-set based on the training-set learnt dictionary
        test_vector = vectorizer.transform(X_test).toarray()

        # Train (or fit) the MultinomialBN model over the train vector
        classifier.fit(train_vector, train_reference)

        # Predict labels for the test vector
        prediction = classifier.predict(test_vector)

        # Evaluate the model over the test vector
        accuracy = metrics.accuracy_score(test_reference, prediction)
        accuracy_vector.append(accuracy)

        # Print current iteration accuracy and overall average accuracy
        print("Accuracy for iteration number {}/{}: {:.3}".format(i, iterations, accuracy))
        total_accuracy += accuracy
        average_accuracy = total_accuracy / i
        print("Average accuracy over all iterations: {:.3}".format(average_accuracy))
        print("------------------------------------------------")

    # Perform a detailed analysis for the latest prediction
    prediction_analysis(classifier, test_reference, prediction, test_vector, pos_label_roc, accuracy_vector, iterations)
    if return_avg_accuracy and return_classifier_vectorizer:
        return average_accuracy, classifier, vectorizer,
    elif return_avg_accuracy:
        return average_accuracy
    elif return_classifier_vectorizer:
        return classifier, vectorizer

In [44]:
# Perform for the first time a sentiment classification of all the reviews as above, but using only 80% of the data to build the dictionary of the vectorizer.
average_accuracy_rawtext_80dict = split_vectorize_fit_predict(reviews_vector, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.823
Average accuracy over all iterations: 0.823
------------------------------------------------
Accuracy for iteration number 2/20: 0.83
Average accuracy over all iterations: 0.826
------------------------------------------------
Accuracy for iteration number 3/20: 0.807
Average accuracy over all iterations: 0.82
------------------------------------------------
Accuracy for iteration number 4/20: 0.81
Average accuracy over all iterations: 0.818
------------------------------------------------
Accuracy for iteration number 5/20: 0.805
Average accuracy over all iterations: 0.815
------------------------------------------------
Accuracy for iteration number 6/20: 0.825
Average accuracy over all iterations: 0.817
------------------------------------------------
Accuracy for iteration number 7/20: 0.848
Average accuracy over all iterations: 0.821
------------------------------------------------
Accuracy for iteration number 8/20: 0.825
Average accuracy

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,162,38
Negative,41,159


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.87158'

'Accuracy distribution over 20 folds:'

In [45]:
# Add the result to the Dataframe
classification_results.loc["Raw Text - 80% Dict", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_rawtext_80dict, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),,,
OR + PreProcessing1,,,
OR + PreProcessing2,,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


# 2.0 Subjectivity-Objectivity Dataset

## 2.1 Import and organization of data

In [46]:
# Print the total number of sentences present in the dataset
subj_dataset_length = len(subj.sents())
print("Total number of reviews present in the dataset:", subj_dataset_length)
n_instances_per_type = 5000

# Extract from the dataset the subjective and objective sentences (under Tuple format) and put them inside two lists
subj_docs = [(sent, 'subj') for sent in subj.sents(categories='subj')[:n_instances_per_type]]
obj_docs = [(sent, 'obj') for sent in subj.sents(categories='obj')[:n_instances_per_type]]

# From the previously defined list of tuples extract the lists of words and build the relative sentences (subjective sentences)
subjective_sentences_words = [subj_docs[i][0] for i in range(n_instances_per_type)]
subjective_sentences_sents = [[' '.join(word)] for word in subjective_sentences_words]

# Create a pandas dataframe and insert both the sentence text and the information about the category ("subj" in that case)
subjective_sentences_dataframe = pd.DataFrame(columns=["Sentence Text", "Category"])
for i in range(0, n_instances_per_type):
    subjective_sentences_dataframe.loc[i, "Sentence Text"] = subjective_sentences_sents[i][0]
    subjective_sentences_dataframe.loc[i, "Category"] = "subj"
# Display the generated dataframe for subjective sentences
print("Resulting dataframe containing {} subjective sentences:".format(len(subjective_sentences_dataframe)))
display(subjective_sentences_dataframe)

# From the previously defined list of tuples extract the lists of words and build the relative sentences (objective sentences)
objective_sentences_words = [obj_docs[i][0] for i in range(n_instances_per_type)]
objective_sentences_sents = [[' '.join(word)] for word in objective_sentences_words]

# Create a pandas dataframe and insert both the sentence text and the information about the category ("obj" in that case)
objective_sentences_dataframe = pd.DataFrame(columns=["Sentence Text", "Category"])
for i in range(0, n_instances_per_type):
    objective_sentences_dataframe.loc[i, "Sentence Text"] = objective_sentences_sents[i][0]
    objective_sentences_dataframe.loc[i, "Category"] = "obj"
# Display the generated dataframe for objective sentences
print("Resulting dataframe containing {} objective sentences:".format(len(objective_sentences_dataframe)))
display(objective_sentences_dataframe)

Total number of reviews present in the dataset: 10000
Resulting dataframe containing 5000 subjective sentences:


Unnamed: 0,Sentence Text,Category
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj
...,...,...
4995,"a haunted , bountiful film that demands patien...",subj
4996,"the movie's gloomy atmosphere is fascinating ,...",subj
4997,it aimlessly and unsuccessfully attempts to fu...,subj
4998,"an authentically vague , but ultimately purpos...",subj


Resulting dataframe containing 5000 objective sentences:


Unnamed: 0,Sentence Text,Category
0,the movie begins in the past where a young boy...,obj
1,emerging from the human psyche and showing cha...,obj
2,spurning her mother's insistence that she get ...,obj
3,amitabh can't believe the board of directors a...,obj
4,"she , among others excentricities , talks to a...",obj
...,...,...
4995,"in the end , they discover that balance in lif...",obj
4996,a counterfeit 1000 tomin bank note is passed i...,obj
4997,enter the beautiful and mysterious secret agen...,obj
4998,after listening to a missionary from china spe...,obj


In [47]:
# Merge the two previous dataframes into a single dataframe containing all the sentences
all_sentences_dataframe = pd.concat([subjective_sentences_dataframe, objective_sentences_dataframe], ignore_index=True)

# Show the obtained dataframe, this represents all the movie_review dataset
print("Resulting dataframe containing all {} sentences:".format(len(all_sentences_dataframe)))
display(all_sentences_dataframe)

Resulting dataframe containing all 10000 sentences:


Unnamed: 0,Sentence Text,Category
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj
...,...,...
9995,"in the end , they discover that balance in lif...",obj
9996,a counterfeit 1000 tomin bank note is passed i...,obj
9997,enter the beautiful and mysterious secret agen...,obj
9998,after listening to a missionary from china spe...,obj


## 2.2 Objectivity and Subjectivity classification

In [49]:
# Definition of two lists, the former encompasses all the ground truths, while the latter encompasses all the sentences
sentence_ground_truth_vector = all_sentences_dataframe["Category"]
sentence_vector = all_sentences_dataframe["Sentence Text"]

# Convert to numpy and split of the dataset
sentence_ground_truth_vector = sentence_ground_truth_vector.to_numpy()
sentence_vector = sentence_vector.to_numpy()

In [51]:
# Perform a classification of objective and subjective reviews with a Multinomial Naive Bayes trained over 80% of the data
subjectivity_classifier, subjectivity_vectorizer = split_vectorize_fit_predict(sentence_vector, sentence_ground_truth_vector, K, 'subj', False, True)

Accuracy for iteration number 1/20: 0.921
Average accuracy over all iterations: 0.921
------------------------------------------------
Accuracy for iteration number 2/20: 0.926
Average accuracy over all iterations: 0.924
------------------------------------------------
Accuracy for iteration number 3/20: 0.92
Average accuracy over all iterations: 0.923
------------------------------------------------
Accuracy for iteration number 4/20: 0.916
Average accuracy over all iterations: 0.921
------------------------------------------------
Accuracy for iteration number 5/20: 0.913
Average accuracy over all iterations: 0.919
------------------------------------------------
Accuracy for iteration number 6/20: 0.92
Average accuracy over all iterations: 0.92
------------------------------------------------
Accuracy for iteration number 7/20: 0.917
Average accuracy over all iterations: 0.919
------------------------------------------------
Accuracy for iteration number 8/20: 0.911
Average accuracy

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,907,93
Negative,74,926


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.97328'

'Accuracy distribution over 20 folds:'

# 3.0 Polarity classification

## 3.1 Remove subjective sentences from each review of the dataset

In [52]:
# Define a new list of reviews, where each review is composed of subjective only sentences
all_reviews_sents_subjective = []

# Analyse each review of the dataset
for review in all_reviews_sents:
    # The subjective review will be composed only of the sentences within itself that are classified as subjective
    subjective_review = []

    # For each sentence in the review
    for sent in review:
        # Join together the different words that compose the sentence
        joint_sent = [' '.join(sent)]

        # Vectorize the sentence to then feed it to the classifier
        vectorized_joint_sent = subjectivity_vectorizer.transform(joint_sent)

        # Compute a prediction over that sentence
        prediction_joint_sent = subjectivity_classifier.predict(vectorized_joint_sent)

        # If the sentence is classified as subjective keep it, otherwise move onto the next one
        if str(prediction_joint_sent[0]) == "subj":
            # Here we want an output without the [ ], so we index it to the element 0, which is the sentence
            subjective_review.append(joint_sent[0])

    # Once a review has been fully processed, add it into a new list, being sure that each sentence in the review ends with a new line '\n'
    all_reviews_sents_subjective.append('\n'.join(subjective_review))

## 3.2 Second classification phase (Baseline #2 with the objectivity-removal step, but without pre-processing and fine-tuning)

In [53]:
# Now that we have reviews composed only of subjective sentences (extracted by our previously trained NB classifier) we perform for the second time a sentiment classification for all the reviews, which should have a higher accuracy due to the performed objectivity-removal step.
average_accuracy_object_removal = split_vectorize_fit_predict(all_reviews_sents_subjective, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.85
Average accuracy over all iterations: 0.85
------------------------------------------------
Accuracy for iteration number 2/20: 0.823
Average accuracy over all iterations: 0.836
------------------------------------------------
Accuracy for iteration number 3/20: 0.868
Average accuracy over all iterations: 0.847
------------------------------------------------
Accuracy for iteration number 4/20: 0.83
Average accuracy over all iterations: 0.843
------------------------------------------------
Accuracy for iteration number 5/20: 0.88
Average accuracy over all iterations: 0.85
------------------------------------------------
Accuracy for iteration number 6/20: 0.868
Average accuracy over all iterations: 0.853
------------------------------------------------
Accuracy for iteration number 7/20: 0.84
Average accuracy over all iterations: 0.851
------------------------------------------------
Accuracy for iteration number 8/20: 0.815
Average accuracy ov

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,175,25
Negative,35,165


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.9201'

'Accuracy distribution over 20 folds:'

In [54]:
# Add the result to the Dataframe
classification_results.loc["Object Removal (OR)", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_object_removal, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,,,
OR + PreProcessing2,,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


# 4.0 Text Pre-Processing

In [55]:
# Print the len and the raw text of the review n.1, this review will be used in the next steps to show how the text changes applying gradually more pre-processing
print("Chars:", len(all_reviews_sents_subjective[1]))
all_reviews_sents_subjective[1]

Chars: 1077


'the happy bastard \' s quick movie review damn that y2k bug .\n. . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance .\nwe don \' t know why the crew was really out in the middle of nowhere , we don \' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don \' t know why donald sutherland is stumbling around drunkenly throughout .\nhere , it \' s just " hey , let \' s chase these people around with some robots " .\nthe acting is below average , even from the likes of curtis .\nyou \' re more likely to get a kick out of her work in halloween h20 .\nsutherland is wasted and baldwin , well , he \' s acting like a baldwin , of course .\nthe real star here are stan winston \' s robot design , some schnazzy cgi , and the occasional good gore shot , like picking into someone \' s brain .\nso , if robots and body parts real

## 4.1 Tokenize

In [56]:
def tokenize_reviews(review_sentences):
    """
    Tokenize the review raw text and return a list of tokenized reviews
    :param review_sentences: list of raw text reviews
    :return: list of tokenized reviews
    """
    tokenized_reviews = []
    state = 1
    # For each review in the input list
    for review in review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Perform tokenization and append the tokenized review
        tokenized_reviews.append(word_tokenize(review))
    print()
    return tokenized_reviews

In [57]:
# Apply tokenization to the raw text and print review n.1 and its len
all_reviews_sents_processed1 = tokenize_reviews(all_reviews_sents_subjective)
print("Tokens:", len(all_reviews_sents_processed1[1]))
print(all_reviews_sents_processed1[1])

Processing review: 2000
Tokens: 242
['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', 'don', "'", 't', 'know', 'the', 'origin', 'of', 'what', 'took', 'over', 'the', 'ship', '(', 'just', 'that', 'a', 'big', 'pink', 'flashy', 'thing', 'hit', 'the', 'mir', ')', ',', 'and', ',', 'of', 'course', ',', 'we', 'don', "'", 't', 'know', 'why', 'donald', 'sutherland', 'is', 'stumbling', 'around', 'drunkenly', 'throughout', '.', 'here', ',', 'it', "'", 's', 'just', '``', 'hey', ',', 'let', "'", 's', 'chase', 'these', 'people', 'around', 'with', 'some', 'robots', 

## 4.2 Correction of tokenization

In [58]:
def correct_tokenization(tokenized_review_sentences):
    """
    Correct the errors performed by the previous tokenization procedure
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized corrected reviews
    """
    reviews_corrected = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        temp_review = []
        # For each token in the tokenized review
        for token in review:
            # Perform the correction of the tokenization
            if token == 'don':
                temp_review.append('don\'t')
            elif token == 'doesn':
                temp_review.append('doesn\'t')

            elif token == 'didn':
                temp_review.append('didn\'t')
            elif token == 'couldn':
                temp_review.append('couldn\'t')

            elif token == 're':
                temp_review.append('are')
            elif token == 'aren':
                temp_review.append('aren\'t')

            elif token == 's':
                temp_review.append('is')
            elif token == 'isn':
                temp_review.append('isn\'t')

            elif token == 've':
                temp_review.append('have')
            elif token == 'haven':
                temp_review.append('haven\'t')
            elif token == 'hasn':
                temp_review.append('hasn\'t')

            # If we have just "'t" we remove it since, if needed, it is already fixed by the previous lines
            elif token == 't':
                continue
            else:
                temp_review.append(token)
        # Append the corrected tokenized review
        reviews_corrected.append(temp_review)
    print()
    return reviews_corrected

In [59]:
# Apply correction of the tokenization to the tokenized text and print review n.1 and its len
all_reviews_sents_processed2 = correct_tokenization(all_reviews_sents_processed1)
print("Tokens:", len(all_reviews_sents_processed2[1]))
print(all_reviews_sents_processed2[1])

Processing review: 2000
Tokens: 239
['the', 'happy', 'bastard', "'", 'is', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', "don't", "'", 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', "don't", "'", 'know', 'the', 'origin', 'of', 'what', 'took', 'over', 'the', 'ship', '(', 'just', 'that', 'a', 'big', 'pink', 'flashy', 'thing', 'hit', 'the', 'mir', ')', ',', 'and', ',', 'of', 'course', ',', 'we', "don't", "'", 'know', 'why', 'donald', 'sutherland', 'is', 'stumbling', 'around', 'drunkenly', 'throughout', '.', 'here', ',', 'it', "'", 'is', 'just', '``', 'hey', ',', 'let', "'", 'is', 'chase', 'these', 'people', 'around', 'with', 'some', 'robots', '``', 

## 4.3 Remove punctuation

In [60]:
def remove_punctuation(tokenized_review_sentences):
    """
    Remove the punctuation from the given list of tokenized reviews
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized reviews without the punctuation
    """
    # List of the punctuation to remove
    punctuation_to_remove = {'.', ',', ';', ':', '"', "'", "''", "``", '*', '#', '!', '?', '-', '--', '_', '/', '(', ')', '[', ']', '{', '}'}
    reviews_wo_punctuation = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Remove the punctuation if we find it
        review_mod = [word for word in review if word not in punctuation_to_remove]
        # Append the review without punctuation
        reviews_wo_punctuation.append(review_mod)
    print()
    return reviews_wo_punctuation

In [61]:
# Apply removal of punctuation to the tokenized corrected text and print review n.1 and its len
all_reviews_sents_processed3 = remove_punctuation(all_reviews_sents_processed2)
print("Tokens:", len(all_reviews_sents_processed3[1]))
print(all_reviews_sents_processed3[1])

Processing review: 2000
Tokens: 193
['the', 'happy', 'bastard', 'is', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', 'virus', 'still', 'feels', 'very', 'empty', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', 'we', "don't", 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', 'we', "don't", 'know', 'the', 'origin', 'of', 'what', 'took', 'over', 'the', 'ship', 'just', 'that', 'a', 'big', 'pink', 'flashy', 'thing', 'hit', 'the', 'mir', 'and', 'of', 'course', 'we', "don't", 'know', 'why', 'donald', 'sutherland', 'is', 'stumbling', 'around', 'drunkenly', 'throughout', 'here', 'it', 'is', 'just', 'hey', 'let', 'is', 'chase', 'these', 'people', 'around', 'with', 'some', 'robots', 'the', 'acting', 'is', 'below', 'average', 'even', 'from', 'the', 'likes', 'of', 'curtis', 'you', 'are', 'more', 'lik

## 4.4 Remove stop-words

In [62]:
def remove_stopwords(tokenized_review_sentences, stopwords_language):
    """
    Remove the punctuation from the given list of reviews
    :param tokenized_review_sentences: list of tokenized reviews
    :param stopwords_language: language of the stopwords to remove (only languages supported from NLTK)
    :return: list of tokenized reviews without stopwords
    """
    # Get the stopwords from NLTK for the specified language
    stop_words = set(stopwords.words(stopwords_language))
    reviews_wo_stopwords = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Remove the stopwords if we find it
        review_mod = [word for word in review if word not in stop_words]
        # Append the review without stopwords
        reviews_wo_stopwords.append(review_mod)
    print()
    return reviews_wo_stopwords

In [63]:
# Show stop words that will be removed
print("Removed stopwords:", stopwords.words('english'))

Removed stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'sam

In [64]:
# Apply removal of stopwords to the punctuation removed text and print review n.1 and its len
all_reviews_sents_processed4 = remove_stopwords(all_reviews_sents_processed3, 'english')
print("Tokens:", len(all_reviews_sents_processed4[1]))
print(all_reviews_sents_processed4[1])

Processing review: 2000
Tokens: 98
['happy', 'bastard', 'quick', 'movie', 'review', 'damn', 'y2k', 'bug', 'going', 'gore', 'bringing', 'action', 'sequences', 'virus', 'still', 'feels', 'empty', 'like', 'movie', 'going', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know', 'origin', 'took', 'ship', 'big', 'pink', 'flashy', 'thing', 'hit', 'mir', 'course', 'know', 'donald', 'sutherland', 'stumbling', 'around', 'drunkenly', 'throughout', 'hey', 'let', 'chase', 'people', 'around', 'robots', 'acting', 'average', 'even', 'likes', 'curtis', 'likely', 'get', 'kick', 'work', 'halloween', 'h20', 'sutherland', 'wasted', 'baldwin', 'well', 'acting', 'like', 'baldwin', 'course', 'real', 'star', 'stan', 'winston', 'robot', 'design', 'schnazzy', 'cgi', 'occasional', 'good', 'gore', 'shot', 'like', 'picking', 'someone', 'brain', 'robots', 'body', 'parts', 'really', 'turn', 'movie', 'otherwise', 'pretty', 'much', 'sunken', 'ship', 'movie']


## 4.5 Remove numbers

In [65]:
def remove_number(tokenized_review_sentence):
    """
    Remove the numbers from the given list of reviews
    :param tokenized_review_sentence: list of tokenized reviews
    :return: list of tokenized reviews without numbers
    """
    reviews_wo_numbers = []
    state = 1
    for review in tokenized_review_sentence:
        print("Processing review: {}".format(state), end='\r')
        state += 1
        review_wo_number = []
        for token in review:
            # Remove the number present in the token
            cleaned_token = (''.join([char for char in token if not char.isdigit()]))
            # If the tokens contain something then append it, otherwise move onto the next token
            if len(cleaned_token) > 0 :
                review_wo_number.append(cleaned_token)
        reviews_wo_numbers.append(review_wo_number)
    print()
    return reviews_wo_numbers

In [66]:
# Apply removal of numbers to the stop-word removed text and print review n.1 and its len
all_reviews_sents_processed5 = remove_number(all_reviews_sents_processed4)
print("Tokens:", len(all_reviews_sents_processed5[1]))
print(all_reviews_sents_processed5[1])

Processing review: 2000
Tokens: 98
['happy', 'bastard', 'quick', 'movie', 'review', 'damn', 'yk', 'bug', 'going', 'gore', 'bringing', 'action', 'sequences', 'virus', 'still', 'feels', 'empty', 'like', 'movie', 'going', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know', 'origin', 'took', 'ship', 'big', 'pink', 'flashy', 'thing', 'hit', 'mir', 'course', 'know', 'donald', 'sutherland', 'stumbling', 'around', 'drunkenly', 'throughout', 'hey', 'let', 'chase', 'people', 'around', 'robots', 'acting', 'average', 'even', 'likes', 'curtis', 'likely', 'get', 'kick', 'work', 'halloween', 'h', 'sutherland', 'wasted', 'baldwin', 'well', 'acting', 'like', 'baldwin', 'course', 'real', 'star', 'stan', 'winston', 'robot', 'design', 'schnazzy', 'cgi', 'occasional', 'good', 'gore', 'shot', 'like', 'picking', 'someone', 'brain', 'robots', 'body', 'parts', 'really', 'turn', 'movie', 'otherwise', 'pretty', 'much', 'sunken', 'ship', 'movie']


## 4.6 Correction of elongated words

In [67]:
def check_next(token, index, chars_to_rem):
    """
    Recursive function called by the below defined 'remove_elongations' function. Used to move iteratively inside a repetition of chars and to define how many of them need to be removed
    :param token: token under analysis
    :param index: index of the char in the token under analysis
    :param chars_to_rem: number of chars that will be removed at the end of the analysis
    :return: chars_to_remove(see above) and word_under_processing(used to define if the word need to be )
    """
    # Save locally the number of chars that need to be removed up to this point
    chars_to_remove = chars_to_rem

    # Extract the current char and the next one
    char1 = token[index]
    char2 = token[index+1]

    # If they are different put bool value to True (i.e., move on the next iteration of this cycle), it is now possible to check the next char
    if char1 != char2:
        word_under_processing = True

    # Otherwise, if they are the same check if there are other chars after this one, and eventually run a recursive algorithm to check the length of the repetition
    else:
        chars_to_remove += 1
        if index+2 == len(token):
            word_under_processing = False
        else:
            output = check_next(token, index+1, chars_to_remove)
            chars_to_remove = output[0]
            word_under_processing = output[1]

    return chars_to_remove, word_under_processing

def remove_elongations(tokenized_review_sentences):
    """
    Analyze each word in the tokenized reviews and look for sequences of identical chars that are longer than 2. If we have more than 2 identical chars, then remove the identical chars in excess above this threshold. To perform this task we rely on the previously defined function 'check_next'
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized reviews without elongations
    """
    corrected_reviews = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1

        # Create a list where the corrected review will be saved
        corrected_review = []

        # For each token in the review
        for token in review:
            current_token = token

            word_under_processing = True
            # Until the token in under processing (word_under_processing=True) the following procedure is iterated
            while word_under_processing:
                for i in range(len(current_token)):
                    # Initialize the removal index and the number of chars to remove
                    removal_starting_index = 0
                    chars_to_remove = 0

                    # If this is the last or the second to last char break (we can move on the next token), we will have 2 equal chars in the worst case
                    if i + 1 == len(current_token) or i + 2 == len(current_token):
                        word_under_processing = False
                        break
                    else:
                        # Otherwise extract the current char and the next one
                        char1 = current_token[i]
                        char2 = current_token[i+1]

                        # If they are different continue (i.e., move on the next iteration of this cycle), it is now possible to check the next char
                        if char2 != char1:
                            continue

                        # If they are the same save the index and run a recursive algorithm to check the length of the repetition
                        else:
                            removal_starting_index = i
                            output = check_next(current_token, i+1, chars_to_remove)

                            # Finally, extract the length of the repetition and the boolean value (to understand if the token has been terminated or not)
                            chars_to_remove = output[0]
                            word_under_processing = output[1]

                            # If at least one char has been removed, update the token, in this way the next iteration over that token will work on the corrected one
                            if chars_to_remove != 0:
                                current_token = current_token[:removal_starting_index] + current_token[removal_starting_index+chars_to_remove:]

            corrected_review.append(current_token)
        corrected_reviews.append(corrected_review)
    print()
    return corrected_reviews

def remove_elongations_v2(tokenized_review_sentences):
    """
    Smarter version of the previously introduced function, here the removal of elongations is performed using the library 're' and importing the module 'sub'
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized reviews without elongations
    """
    corrected_reviews = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        corrected_review = []
        # For each token in the review
        for token in review:
            # Append the token without elongations
            corrected_review.append(re.sub(r'(\w)\1{2,}', r'\1\1', token))
        corrected_reviews.append(corrected_review)
    print()
    return corrected_reviews

In [68]:
# Apply correction of elongated words to the number removed text and print review n.1 and its len
all_reviews_sents_processed6 = remove_elongations(all_reviews_sents_processed5)
print("Tokens:", len(all_reviews_sents_processed6[1]))
print(all_reviews_sents_processed6[1])

Processing review: 2000
Tokens: 98
['happy', 'bastard', 'quick', 'movie', 'review', 'damn', 'yk', 'bug', 'going', 'gore', 'bringing', 'action', 'sequences', 'virus', 'still', 'feels', 'empty', 'like', 'movie', 'going', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know', 'origin', 'took', 'ship', 'big', 'pink', 'flashy', 'thing', 'hit', 'mir', 'course', 'know', 'donald', 'sutherland', 'stumbling', 'around', 'drunkenly', 'throughout', 'hey', 'let', 'chase', 'people', 'around', 'robots', 'acting', 'average', 'even', 'likes', 'curtis', 'likely', 'get', 'kick', 'work', 'halloween', 'h', 'sutherland', 'wasted', 'baldwin', 'well', 'acting', 'like', 'baldwin', 'course', 'real', 'star', 'stan', 'winston', 'robot', 'design', 'schnazzy', 'cgi', 'occasional', 'good', 'gore', 'shot', 'like', 'picking', 'someone', 'brain', 'robots', 'body', 'parts', 'really', 'turn', 'movie', 'otherwise', 'pretty', 'much', 'sunken', 'ship', 'movie']


## 4.7 POS and Lemmatization

In [69]:
def pos_and_lemmatize(tokenized_review_sentences):
    """
    Perform POS(Part of Speech) analysis, then use the information extracted from that phase to perform lemmatizaion
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized and lemmatized reviews
    """
    reviews_pos_tag = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Perform POS and append the results in the previously defined list
        review_pos_tag = nltk.pos_tag(review)
        reviews_pos_tag.append(review_pos_tag)

    # Apply a change in the POS tags to match the one required as input by the WordNetLemmatizer
    # Here Valid options are `"n"` for nouns, `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` for satellite adjectives.
    corrected_reviews_pos_tag = []
    # For each review in the input list
    for review in reviews_pos_tag:
        corrected_review_pos_tag = []
        # Take the token and its previously computed POS tag
        for token, pos in review:
            # Convert the POS tag to match the required input of the WordNetLemmatizer
            if pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS' : pos = 'n'
            elif pos == 'VB' or pos == 'VBG' or pos == 'VBD' or pos == 'VBN' or pos == 'VBP' or pos == 'VBZ' : pos = 'v'
            elif pos == 'JJ' or pos == 'JJR' or pos == 'JJS' : pos = 'a'
            elif pos == 'RB' or pos == 'RBR' or pos == 'RBS' or pos == 'WRB' : pos = 'r'
            elif pos == '' : pos = 's'
            corrected_review_pos_tag.append((token, pos))
        corrected_reviews_pos_tag.append(corrected_review_pos_tag)

    # Initialize the WordNetLemmatizer
    wnl = WordNetLemmatizer()
    lemmatized_reviews_pos_tag = []
    # For each review in the input list
    for review in corrected_reviews_pos_tag:
        lemmatized_review_pos_tag = []
        # Take the token and its previously computed and converted POS tag
        for token, pos in review:
            # If the POS tag is available, use it jointly with the token to lemmatize it
            if pos == 'n' or pos == 'v' or pos == 'a' or pos == 'r' or pos == 's':
                lemma = wnl.lemmatize(word=token, pos=pos)
            # Otherwise, if POS tag is not available, lemmatize using just the token
            else:
                lemma = wnl.lemmatize(word=token)

            lemmatized_review_pos_tag.append(lemma)
        lemmatized_reviews_pos_tag.append(lemmatized_review_pos_tag)
    print()
    return lemmatized_reviews_pos_tag

In [70]:
# Apply POS and Lemmatization to the elongation corrected text and print review n.1 and its len
all_reviews_sents_processed7 = pos_and_lemmatize(all_reviews_sents_processed6)
print("Tokens:", len(all_reviews_sents_processed7[1]))
print(all_reviews_sents_processed7[1])

Processing review: 2000
Tokens: 98
['happy', 'bastard', 'quick', 'movie', 'review', 'damn', 'yk', 'bug', 'go', 'gore', 'bring', 'action', 'sequence', 'virus', 'still', 'feel', 'empty', 'like', 'movie', 'go', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know', 'origin', 'take', 'ship', 'big', 'pink', 'flashy', 'thing', 'hit', 'mir', 'course', 'know', 'donald', 'sutherland', 'stumble', 'around', 'drunkenly', 'throughout', 'hey', 'let', 'chase', 'people', 'around', 'robot', 'act', 'average', 'even', 'like', 'curtis', 'likely', 'get', 'kick', 'work', 'halloween', 'h', 'sutherland', 'waste', 'baldwin', 'well', 'act', 'like', 'baldwin', 'course', 'real', 'star', 'stan', 'winston', 'robot', 'design', 'schnazzy', 'cgi', 'occasional', 'good', 'gore', 'shot', 'like', 'pick', 'someone', 'brain', 'robot', 'body', 'part', 'really', 'turn', 'movie', 'otherwise', 'pretty', 'much', 'sunken', 'ship', 'movie']


## 4.8 Remove low-frequency and high-frequency words

In [71]:
def remove_frequency_word(tokenized_review_sentences, upper_bound, lower_bound):
    """
    Remove the words that appear in the reviews a number of time which is lower than the lower_bound(rare words) or higher than the upper_bound(too frequent words)
    :param tokenized_review_sentences: list of tokenized reviews
    :param upper_bound: upper-bound frequency, words with a higher frequency will be removed
    :param lower_bound: lower-bound frequency, words with a lower frequency will be removed
    :return: list of tokenized reviews without words below and over threshold
    """
    # Extract the list of all words present in the input list of reviews
    starting_word = [word for review in tokenized_review_sentences for word in review]
    # Compute the FrequencyDictionary of this list of words
    word_freq = nltk.FreqDist(starting_word)

    # Define the terms to remove based on the give threshold and on the FrequencyDictionary compute previously
    terms_to_remove = []
    for entry in word_freq:
        if word_freq[entry] < lower_bound or word_freq[entry] > upper_bound:
            terms_to_remove.append(entry)

    reviews_wo_low_high_freq = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Append the review without the terms to remove
        review_mod = [word for word in review if word not in terms_to_remove]
        reviews_wo_low_high_freq.append(review_mod)
    print()
    return reviews_wo_low_high_freq

In [72]:
# Apply removal of low and high frequency words from the POS and Lemmatized text and print review n.1 and its len
all_reviews_sents_processed8 = remove_frequency_word(all_reviews_sents_processed7, 3000, 30)
print("Tokens:", len(all_reviews_sents_processed8[1]))
print(all_reviews_sents_processed8[1])

Processing review: 2000
Tokens: 74
['happy', 'bastard', 'quick', 'review', 'damn', 'bug', 'go', 'gore', 'bring', 'action', 'sequence', 'still', 'feel', 'empty', 'go', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know', 'take', 'ship', 'big', 'flashy', 'thing', 'hit', 'course', 'know', 'around', 'throughout', 'hey', 'let', 'chase', 'people', 'around', 'robot', 'act', 'average', 'even', 'likely', 'get', 'kick', 'work', 'halloween', 'waste', 'baldwin', 'well', 'act', 'baldwin', 'course', 'real', 'star', 'robot', 'design', 'cgi', 'occasional', 'good', 'gore', 'shot', 'pick', 'someone', 'brain', 'robot', 'body', 'part', 'really', 'turn', 'otherwise', 'pretty', 'much', 'ship']


## 4.9 Negation Handling

In [73]:
def handle_negations(tokenized_review_sentences):
    """
    Handle the negative sentences appending a '_NEG' tag to the words that come after the token which negates that sentence
    :param tokenized_review_sentences: list of tokenized reviews
    :return: list of tokenized reviews with negations expressed by the '_NEG' tag
    """
    reviews_handled_negation = []
    state = 1
    # For each review in the input list
    for review in tokenized_review_sentences:
        # Update the user on the status of the process
        print("Processing review: {}".format(state), end='\r')
        state += 1
        # Handle negations
        reviews_handled_negation.append(mark_negation(review))
    print()
    return reviews_handled_negation

In [74]:
# Apply negation handling to the text where low and high frequency words have been removed. Then print review n.1 and its len
all_reviews_sents_processed9 = handle_negations(all_reviews_sents_processed8)
print("Tokens:", len(all_reviews_sents_processed9[1]))
print(all_reviews_sents_processed9[1])

Processing review: 2000
Tokens: 74
['happy', 'bastard', 'quick', 'review', 'damn', 'bug', 'go', 'gore', 'bring', 'action', 'sequence', 'still', 'feel', 'empty', 'go', 'flash', 'substance', 'know', 'crew', 'really', 'middle', 'nowhere', 'know_NEG', 'take_NEG', 'ship_NEG', 'big_NEG', 'flashy_NEG', 'thing_NEG', 'hit_NEG', 'course_NEG', 'know_NEG', 'around_NEG', 'throughout_NEG', 'hey_NEG', 'let_NEG', 'chase_NEG', 'people_NEG', 'around_NEG', 'robot_NEG', 'act_NEG', 'average_NEG', 'even_NEG', 'likely_NEG', 'get_NEG', 'kick_NEG', 'work_NEG', 'halloween_NEG', 'waste_NEG', 'baldwin_NEG', 'well_NEG', 'act_NEG', 'baldwin_NEG', 'course_NEG', 'real_NEG', 'star_NEG', 'robot_NEG', 'design_NEG', 'cgi_NEG', 'occasional_NEG', 'good_NEG', 'gore_NEG', 'shot_NEG', 'pick_NEG', 'someone_NEG', 'brain_NEG', 'robot_NEG', 'body_NEG', 'part_NEG', 'really_NEG', 'turn_NEG', 'otherwise_NEG', 'pretty_NEG', 'much_NEG', 'ship_NEG']


## 4.10 Putting all together

In [75]:
def full_pre_processing(review_sentences, tune_token=False, rem_punct=False, rem_stopwords=(False, 'english'), rem_numbers=False, corr_elong_words=(False, 'v2'), app_pos_lemmatize=False, rem_freq_words=(False, 3000, 30), handle_neg=False):
    """
    Put together all the previously define pre-processing function in a single function. Each pre-processing step (except for Tokenization) can be enabled or disabled using its own boolean variable. Furthermore, tuple has been defined for the pre-processing functions that have options to configure, and status-information are returned from each step to understand at which point of the processing we are. Finally, perform joining of the reviews to move from a list of tokens back to raw (processed) text.
    :param review_sentences: list of raw text reviews
    :param tune_token: enable correction of tokenization
    :param rem_punct: enable removal of punctuation
    :param rem_stopwords: (tuple)enable removal of stop-words, specify the language of the stop-words (between the ones supported by NLTK)
    :param rem_numbers: enable removal of numbers
    :param corr_elong_words: (tuple)enable correction of elongated words, specify the version of the algorithm to use (v2 is more efficient)
    :param app_pos_lemmatize: enable POS and lemmatization
    :param rem_freq_words: (tuple)enable words frequency cut-offs, specify high-frequency and low-frequency cut off values (default ones are 3000 and 30)
    :param handle_neg: enable handling of negations
    :return: list of reviews in raw (processed) text
    """
    # Perform Tokenization
    i = 1
    print("STEP {}: TOKENIZATION .....".format(i))
    processed_review_sentences = tokenize_reviews(review_sentences)
    print("TOKENIZATION PERFORMED\n")

    # Perform Correction of Tokenization
    if tune_token:
        i+=1
        print("STEP {}: FINE-TUNING TOKENIZATION .....".format(i))
        processed_review_sentences = correct_tokenization(processed_review_sentences)
        print("FINE-TUNING PERFORMED\n")

    # Perform removal of punctuation
    if rem_punct:
        i+=1
        print("STEP {}: PUNCTUATION REMOVAL .....".format(i))
        processed_review_sentences = remove_punctuation(processed_review_sentences)
        print("PUNCTUATION REMOVED\n")

    # Perform removal of stop-words
    if rem_stopwords[0]:
        i+=1
        print("STEP {}: STOPWORDS REMOVAL .....".format(i))
        processed_review_sentences = remove_stopwords(processed_review_sentences, rem_stopwords[1])
        print("STOPWORDS REMOVED\n")

    # Perform removal of numbers
    if rem_numbers:
        i+=1
        print("STEP {}: NUMBERS REMOVAL .....".format(i))
        processed_review_sentences = remove_number(processed_review_sentences)
        print("NUMBERS REMOVED\n")

    # Perform correction of elongated words
    if corr_elong_words[0]:
        i+=1
        print("STEP {}: ELONGATED WORDS CORRECTION .....".format(i))
        if corr_elong_words[1] == 'v2':
            processed_review_sentences = remove_elongations_v2(processed_review_sentences)
        else:
            processed_review_sentences = remove_elongations(processed_review_sentences)
        print("ELONGATED WORDS CORRECTED\n")

    # Perform POS and lemmatize
    if app_pos_lemmatize:
        i+=1
        print("STEP {}: POS-LEMMATIZATION .....".format(i))
        processed_review_sentences = pos_and_lemmatize(processed_review_sentences)
        print("POS-LEMMATIZATION COMPLETED\n")

    # Perform removal of frequency-words
    if rem_freq_words[0]:
        i+=1
        print("STEP {}: LOW&HIGH FREQUENCY TOKEN REMOVAL .....".format(i))
        processed_review_sentences = remove_frequency_word(processed_review_sentences, rem_freq_words[1], rem_freq_words[0])
        print("LOW&HIGH FREQUENCY TOKEN REMOVED\n")

    # Perform handling of negations
    if handle_neg:
        i+=1
        print("STEP {}: HANDLING NEGATIONS .....".format(i))
        processed_review_sentences = handle_negations(processed_review_sentences)
        print("NEGATION HANDLED\n")

    # Join together sentences
    i+=1
    print("STEP {}: JOINTING TOKENS .....".format(i))
    processed_review_sentences_jointed = []
    state = 1
    for review in processed_review_sentences:
        print("Processing review: {}".format(state), end='\r')
        state += 1
        processed_review_sentences_jointed.append(' '.join(review))
    print("\nTOKENS JOINTED\n")

    print("PROCEDURE COMPLETED")
    return processed_review_sentences_jointed

In [76]:
# Example of a full-pre-processing with a chosen selection of steps to perform
final_processed_reviews = full_pre_processing(all_reviews_sents_subjective, tune_token=False, rem_punct=True, rem_stopwords=(False, 'english'), rem_numbers=True, corr_elong_words=(False, 'v2'), app_pos_lemmatize=False, rem_freq_words=(True, 2000, 30), handle_neg=False)
# Print review n.1 and its len after the full-pre-processing
print("\nTokens:", len(final_processed_reviews[1]))
print(final_processed_reviews[1])

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 3: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 4: LOW&HIGH FREQUENCY TOKEN REMOVAL .....
Processing review: 2000
LOW&HIGH FREQUENCY TOKEN REMOVED

STEP 5: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED

Tokens: 673
happy bastard quick review damn yk bug going gore bringing few action sequences here virus still feels very empty going flash no substance we don know why crew really middle nowhere we don know origin took over ship big pink flashy thing hit mir course we don know why donald sutherland stumbling around drunkenly throughout here hey let chase these people around robots acting below average likes curtis re likely get kick her work halloween h sutherland wasted baldwin well acting baldwin course real star here stan winston robot design schnazzy cgi occasional

## 4.11 Different pre-processing configurations, these will be used in the training of both: Multinomial NB and Bi-Directional LSTM

### 4.11.1 Tokenization + correction of tokenization

In [77]:
data1 = full_pre_processing(all_reviews_sents_subjective, tune_token=True)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


### 4.11.2 Tokenization + correction of tokenization + removal of punctuation, stopwords and numbers

In [78]:
data2 = full_pre_processing(all_reviews_sents_subjective, tune_token=True, rem_punct=True, rem_stopwords=(True, 'english'), rem_numbers=True)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 4: STOPWORDS REMOVAL .....
Processing review: 2000
STOPWORDS REMOVED

STEP 5: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 6: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


### 4.11.3 Tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization

In [79]:
data3 = full_pre_processing(all_reviews_sents_subjective, tune_token=True, rem_punct=True, rem_stopwords=(True, 'english'), rem_numbers=True, corr_elong_words=(True, 'v2'), app_pos_lemmatize=True)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 4: STOPWORDS REMOVAL .....
Processing review: 2000
STOPWORDS REMOVED

STEP 5: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 6: ELONGATED WORDS CORRECTION .....
Processing review: 2000
ELONGATED WORDS CORRECTED

STEP 7: POS-LEMMATIZATION .....
Processing review: 2000
POS-LEMMATIZATION COMPLETED

STEP 8: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


### 4.11.4  Tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words

In [80]:
data4 = full_pre_processing(all_reviews_sents_subjective, tune_token=True, rem_punct=True, rem_stopwords=(True, 'english'), rem_numbers=True, corr_elong_words=(True, 'v2'), app_pos_lemmatize=True, rem_freq_words=(True, 2000, 30), handle_neg=False)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 4: STOPWORDS REMOVAL .....
Processing review: 2000
STOPWORDS REMOVED

STEP 5: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 6: ELONGATED WORDS CORRECTION .....
Processing review: 2000
ELONGATED WORDS CORRECTED

STEP 7: POS-LEMMATIZATION .....
Processing review: 2000
POS-LEMMATIZATION COMPLETED

STEP 8: LOW&HIGH FREQUENCY TOKEN REMOVAL .....
Processing review: 2000
LOW&HIGH FREQUENCY TOKEN REMOVED

STEP 9: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


### 4.11.5 Tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words + handle negations (full pre-processing)

In [81]:
data5 = full_pre_processing(all_reviews_sents_subjective, tune_token=True, rem_punct=True, rem_stopwords=(True, 'english'), rem_numbers=True, corr_elong_words=(True, 'v2'), app_pos_lemmatize=True, rem_freq_words=(True, 2000, 30), handle_neg=True)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 4: STOPWORDS REMOVAL .....
Processing review: 2000
STOPWORDS REMOVED

STEP 5: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 6: ELONGATED WORDS CORRECTION .....
Processing review: 2000
ELONGATED WORDS CORRECTED

STEP 7: POS-LEMMATIZATION .....
Processing review: 2000
POS-LEMMATIZATION COMPLETED

STEP 8: LOW&HIGH FREQUENCY TOKEN REMOVAL .....
Processing review: 2000
LOW&HIGH FREQUENCY TOKEN REMOVED

STEP 9: HANDLING NEGATIONS .....
Processing review: 2000
NEGATION HANDLED

STEP 10: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


# 5.0 Third classification phase (#3 with objectivity-removal step, pre-processing and fine-tuning)

## 5.1 Classification with the precedent pre-processing configurations

In [82]:
def split_vectorize_fit_predict_enhanced(input_vector, reference_vector, iterations, pos_label_roc, return_avg_accuracy=False, return_classifier_vectorizer=False):
    """
    Initialize vectorizer and classifier (Multinomial Naive Bayes), then for the specified number of iterations:
    1) Run the procedure of splitting in train-set (80% of data) and test-set (20% of data).
    2) Fit the vectorizer on the train set
    3) Convert the train-set and the test-set in sequences using the previously fitted vectorizer
    4) Train and test the model
    5) Print accuracy of the current iteration and update the average accuracy
    :param input_vector: input data (list of reviews)
    :param reference_vector: ground truths for the input data
    :param iterations: number of iterations required
    :param pos_label_roc: label of the positive sample for the plot of the ROC curve
    :param return_avg_accuracy: if true the final average accuracy is returned
    :param return_classifier_vectorizer: if true the trained vectorizer and classifier of the latest iteration are returned
    :return: print of accuracy and average accuracy, Plot of ROC, AUC and confusion matrix. If required returns the average accuracy, the vectorizer and the classifier
    """
    # Initialization of vectorizer(tuned to achieve better performances) and classifier
    vectorizer = CountVectorizer(lowercase=True, max_features=20000, ngram_range=(1,3))
    classifier = MultinomialNB()

    # Define variables to save the results
    total_accuracy, average_accuracy, i = 0, 0, 0
    test_reference, prediction, test_vector = 0, 0, 0
    accuracy_vector = []

    # Run the iterative procedure
    for x in range(iterations):
        i += 1
        # Split the dataset in train set (80%) and test set (20%) with shuffle=True to randomize the results and with stratify option enabled to generate balanced splits.
        # Here the dictionary used will encompass only 80% of the information (since it has been fitted on the train set), leading to more accurate test performance measurements
        X_train, X_test, train_reference, test_reference = train_test_split(input_vector, reference_vector, train_size=0.8, shuffle=True, stratify=reference_vector)

        # Learn the words dictionary of all the reviews in the training-set
        train_vector = vectorizer.fit_transform(X_train).toarray()
        # Return vectorized reviews for the test-set based on the training-set learnt dictionary
        test_vector = vectorizer.transform(X_test).toarray()

        # Train (or fit) the MultinomialBN model over the train vector
        classifier.fit(train_vector, train_reference)

        # Predict labels for the test vector
        prediction = classifier.predict(test_vector)

        # Evaluate the model over the test vector
        accuracy = metrics.accuracy_score(test_reference, prediction)
        accuracy_vector.append(accuracy)

        # Print current iteration accuracy and overall average accuracy
        print("Accuracy for iteration number {}/{}: {:.3}".format(i, iterations, accuracy))
        total_accuracy += accuracy
        average_accuracy = total_accuracy / i
        print("Average accuracy over all iterations: {:.3}".format(average_accuracy))
        print("------------------------------------------------")

    # Perform a detailed analysis for the latest prediction
    prediction_analysis(classifier, test_reference, prediction, test_vector, pos_label_roc, accuracy_vector, iterations)
    if return_avg_accuracy and return_classifier_vectorizer:
        return average_accuracy, classifier, vectorizer,
    elif return_avg_accuracy:
        return average_accuracy
    elif return_classifier_vectorizer:
        return classifier, vectorizer

### 5.1.1 Training with tokenization + correction of tokenization

In [84]:
average_accuracy_preprocessing1 = split_vectorize_fit_predict_enhanced(data1, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.84
Average accuracy over all iterations: 0.84
------------------------------------------------
Accuracy for iteration number 2/20: 0.882
Average accuracy over all iterations: 0.861
------------------------------------------------
Accuracy for iteration number 3/20: 0.892
Average accuracy over all iterations: 0.872
------------------------------------------------
Accuracy for iteration number 4/20: 0.873
Average accuracy over all iterations: 0.872
------------------------------------------------
Accuracy for iteration number 5/20: 0.863
Average accuracy over all iterations: 0.87
------------------------------------------------
Accuracy for iteration number 6/20: 0.855
Average accuracy over all iterations: 0.868
------------------------------------------------
Accuracy for iteration number 7/20: 0.873
Average accuracy over all iterations: 0.868
------------------------------------------------
Accuracy for iteration number 8/20: 0.868
Average accuracy

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,179,21
Negative,29,171


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.93518'

'Accuracy distribution over 20 folds:'

In [85]:
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing1", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_preprocessing1, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,,
OR + PreProcessing2,,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


### 5.1.2 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers

In [86]:
average_accuracy_preprocessing2 = split_vectorize_fit_predict_enhanced(data2, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.845
Average accuracy over all iterations: 0.845
------------------------------------------------
Accuracy for iteration number 2/20: 0.84
Average accuracy over all iterations: 0.843
------------------------------------------------
Accuracy for iteration number 3/20: 0.838
Average accuracy over all iterations: 0.841
------------------------------------------------
Accuracy for iteration number 4/20: 0.848
Average accuracy over all iterations: 0.843
------------------------------------------------
Accuracy for iteration number 5/20: 0.818
Average accuracy over all iterations: 0.838
------------------------------------------------
Accuracy for iteration number 6/20: 0.865
Average accuracy over all iterations: 0.842
------------------------------------------------
Accuracy for iteration number 7/20: 0.833
Average accuracy over all iterations: 0.841
------------------------------------------------
Accuracy for iteration number 8/20: 0.828
Average accura

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,171,29
Negative,33,167


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.92422'

'Accuracy distribution over 20 folds:'

In [87]:
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing2", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_preprocessing2, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,,
OR + PreProcessing2,0.845,,
OR + PreProcessing3,,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


### 5.1.3 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization

In [88]:
average_accuracy_preprocessing3 = split_vectorize_fit_predict_enhanced(data3, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.812
Average accuracy over all iterations: 0.812
------------------------------------------------
Accuracy for iteration number 2/20: 0.848
Average accuracy over all iterations: 0.83
------------------------------------------------
Accuracy for iteration number 3/20: 0.85
Average accuracy over all iterations: 0.837
------------------------------------------------
Accuracy for iteration number 4/20: 0.833
Average accuracy over all iterations: 0.836
------------------------------------------------
Accuracy for iteration number 5/20: 0.855
Average accuracy over all iterations: 0.839
------------------------------------------------
Accuracy for iteration number 6/20: 0.82
Average accuracy over all iterations: 0.836
------------------------------------------------
Accuracy for iteration number 7/20: 0.873
Average accuracy over all iterations: 0.841
------------------------------------------------
Accuracy for iteration number 8/20: 0.812
Average accuracy

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,163,37
Negative,30,170


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.91096'

'Accuracy distribution over 20 folds:'

In [89]:
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing3", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_preprocessing3, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,,
OR + PreProcessing2,0.845,,
OR + PreProcessing3,0.841,,
OR + PreProcessing4,,,
OR + PreProcessing5,,,


### 5.1.4  Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words

In [90]:
average_accuracy_preprocessing4 = split_vectorize_fit_predict_enhanced(data4, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.858
Average accuracy over all iterations: 0.858
------------------------------------------------
Accuracy for iteration number 2/20: 0.863
Average accuracy over all iterations: 0.86
------------------------------------------------
Accuracy for iteration number 3/20: 0.855
Average accuracy over all iterations: 0.858
------------------------------------------------
Accuracy for iteration number 4/20: 0.855
Average accuracy over all iterations: 0.858
------------------------------------------------
Accuracy for iteration number 5/20: 0.848
Average accuracy over all iterations: 0.855
------------------------------------------------
Accuracy for iteration number 6/20: 0.85
Average accuracy over all iterations: 0.855
------------------------------------------------
Accuracy for iteration number 7/20: 0.897
Average accuracy over all iterations: 0.861
------------------------------------------------
Accuracy for iteration number 8/20: 0.818
Average accurac

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,168,32
Negative,36,164


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.90225'

'Accuracy distribution over 20 folds:'

In [91]:
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing4", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_preprocessing4, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,,
OR + PreProcessing2,0.845,,
OR + PreProcessing3,0.841,,
OR + PreProcessing4,0.852,,
OR + PreProcessing5,,,


### 5.1.5 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words + handle negations (full pre-processing)

In [92]:
average_accuracy_preprocessing5 = split_vectorize_fit_predict_enhanced(data5, ground_truth_vector, K, 'pos', True)

Accuracy for iteration number 1/20: 0.743
Average accuracy over all iterations: 0.743
------------------------------------------------
Accuracy for iteration number 2/20: 0.79
Average accuracy over all iterations: 0.766
------------------------------------------------
Accuracy for iteration number 3/20: 0.782
Average accuracy over all iterations: 0.772
------------------------------------------------
Accuracy for iteration number 4/20: 0.777
Average accuracy over all iterations: 0.773
------------------------------------------------
Accuracy for iteration number 5/20: 0.805
Average accuracy over all iterations: 0.78
------------------------------------------------
Accuracy for iteration number 6/20: 0.8
Average accuracy over all iterations: 0.783
------------------------------------------------
Accuracy for iteration number 7/20: 0.772
Average accuracy over all iterations: 0.781
------------------------------------------------
Accuracy for iteration number 8/20: 0.782
Average accuracy 

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,157,43
Negative,40,160


'Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC) for the latest prediction:'

'AUC: 0.87376'

'Accuracy distribution over 20 folds:'

In [93]:
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing5", "Multinomial Naive-Bayes Accuracy (%)"] = round(average_accuracy_preprocessing5, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,,
OR + PreProcessing2,0.845,,
OR + PreProcessing3,0.841,,
OR + PreProcessing4,0.852,,
OR + PreProcessing5,0.775,,


# 6.0 Sentiment Analysis with Bi-Directional LSTM

## 6.1 Re-organization of data for NN

In [95]:
# Define a numerical ground truth vector that will be used to train the network
numerical_ground_truth_vector = []
# For each truth string
for truth in ground_truth_vector:
    # Assign value 0 if it is negative
    if truth == 'neg':
        numerical_ground_truth_vector.append(0)
    # Assign value 1 if it is positive
    else: ## truth == 'pos'
        numerical_ground_truth_vector.append(1)
# Return the numerical ground truth, suitable for NN
numerical_ground_truth_vector = np.array(numerical_ground_truth_vector)

In [96]:
# Build a dataframe with the subjective reviews and the respective numerical ground truth
data = pd.DataFrame(all_reviews_sents_subjective, columns=["Original Review Text"])
data["Category"] = numerical_ground_truth_vector
data

Unnamed: 0,Original Review Text,Category
0,"watch the movie and "" sorta "" find out .\n. . ...",0
1,the happy bastard ' s quick movie review damn ...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' first...",0
4,comments : stalked is yet another in a seeming...,0
...,...,...
1995,wow !\nwhat a movie .\nit ' s everything a mov...,1
1996,"richard gere can be a commanding actor , but h...",1
1997,the film periodically jumps between shaw ' s p...,1
1998,steven spielberg ' s second epic film on world...,1


## 6.2 Example of pre-processing and conversion to sequence

### 6.2.1 Pre-processing

In [97]:
nn_data = full_pre_processing(all_reviews_sents_subjective, tune_token=True, rem_punct=True, rem_stopwords=(True, 'english'), rem_numbers=True, corr_elong_words=(False, 'v2'), app_pos_lemmatize=False, rem_freq_words=(True, 2000, 30), handle_neg=False)

STEP 1: TOKENIZATION .....
Processing review: 2000
TOKENIZATION PERFORMED

STEP 2: FINE-TUNING TOKENIZATION .....
Processing review: 2000
FINE-TUNING PERFORMED

STEP 3: PUNCTUATION REMOVAL .....
Processing review: 2000
PUNCTUATION REMOVED

STEP 4: STOPWORDS REMOVAL .....
Processing review: 2000
STOPWORDS REMOVED

STEP 5: NUMBERS REMOVAL .....
Processing review: 2000
NUMBERS REMOVED

STEP 6: LOW&HIGH FREQUENCY TOKEN REMOVAL .....
Processing review: 2000
LOW&HIGH FREQUENCY TOKEN REMOVED

STEP 7: JOINTING TOKENS .....
Processing review: 2000
TOKENS JOINTED

PROCEDURE COMPLETED


In [98]:
# Insert in the previously defined dataframe the pre-processed reviews
data["Processed Review"] = nn_data

diff = []
# Compute the len difference between original and pre-processed review
for i in range(len(data)):
    dif = len(data.loc[i, "Original Review Text"]) - len(data.loc[i, "Processed Review"])
    diff.append(dif)

# Display the dataframe
data["Len Difference"] = diff
data

Unnamed: 0,Original Review Text,Category,Processed Review,Len Difference
0,"watch the movie and "" sorta "" find out .\n. . ...",0,watch sorta find critique mind fuck teen gener...,1693
1,the happy bastard ' s quick movie review damn ...,0,happy bastard quick review damn yk bug going g...,499
2,it is movies like these that make a jaded movi...,0,movies make jaded viewer thankful invention ti...,1076
3,""" quest for camelot "" is warner bros . ' first...",0,quest camelot warner bros first feature length...,805
4,comments : stalked is yet another in a seeming...,0,comments stalked yet another seemingly endless...,1014
...,...,...,...,...
1995,wow !\nwhat a movie .\nit ' s everything a mov...,1,wow everything funny dramatic interesting weir...,1436
1996,"richard gere can be a commanding actor , but h...",1,richard gere commanding actor always great fil...,203
1997,the film periodically jumps between shaw ' s p...,1,periodically jumps shaw point view perspective...,969
1998,steven spielberg ' s second epic film on world...,1,steven spielberg second epic world war ii unqu...,1160


### 6.2.2 Conversion to sequence and padding, using all data to create tokenizer dictionary

In [99]:
# Initialize the tokenizer
tokenizer = Tokenizer(lower=True)
# Fit the tokenizer with the text
tokenizer.fit_on_texts(nn_data)

# Extract the vocab size (number of terms present) and add one (since when we will use this value in the Embedding layer of LSTM to define
# the size of the vocab we need to take into account that the LSTM Embedding layer need also an additional space for one OOV '[UNK]' token)
vocab_size = len(tokenizer.word_index) + 1

# Define a list for the reviews converted to sequence
reviews_sequences = []
# Save the length of the longest review sequence
max_sequence_len = 0

# For each review
for review in nn_data:
    # Convert the review to a sequence of numbers and save it
    review_to_seq = tokenizer.texts_to_sequences([review])[0]
    reviews_sequences.append(review_to_seq)

    # Update the length of the longest sequence for padding
    review_to_seq_len = len(review_to_seq)
    if review_to_seq_len > max_sequence_len:
        max_sequence_len = review_to_seq_len

# Give some information about the size of the vocab (number of different tokens) and the length of the longest sequence (max num of tokens in a review)
print("Vocabulary size fitted on all the dataset: {}. Max sequence length: {}".format(vocab_size, max_sequence_len))

# Pad the sequences to make them of the same length, apply the pad to the end of the sentence (padding='post')
padded_sequences = pad_sequences(reviews_sequences, maxlen=max_sequence_len, padding='post')
# Plot the padded sequences, we have one row for each review, and one column for each token
padded_sequences

Vocabulary size fitted on all the dataset: 29442. Max sequence length: 1012


array([[  101,  7729,   151, ...,     0,     0,     0],
       [  538,  2264,   860, ...,     0,     0,     0],
       [   20,    14,  5943, ...,     0,     0,     0],
       ...,
       [10364,  3394,  5443, ...,     0,     0,     0],
       [  970,   734,   227, ...,     0,     0,     0],
       [    2,  2797,  5753, ...,     0,     0,     0]])

### 6.2.3 Conversion to sequence and padding, using only test-set data(80%) to create tokenizer dictionary

In [100]:
def split_tokenize_pad_data(input_data, numerical_ground_truth, sequence_length_limit=0):
    """
    Split the input data, initialize the tokenizer, then for the specified number of iterations:
    1) Run the procedure of splitting in train-set (80% of data) and test-set (20% of data).
    2) Fit the tokenizer on the train set, then extract the size of its vocabulary (number of unique tokens)
    3) Convert the train-set and the test-set in sequences using the previously fitted tokenizer
    4) Keep the length of the longest sequence (of the train-set) saved in a variable
    :param input_data: input data (list of reviews)
    :param numerical_ground_truth: numerical ground truths for the input data
    :param sequence_length_limit: Limit the length of the sequence. In some instances we will impose a length limit of 700 since it allows to train with 16GB or RAM without crashes
    :return: padded train set, padded test set, reference of the padded train set, reference of the padded test set, size of the tokenizer vocabulary, length of the longest sequence
    """
    # Split the input data
    X_train, X_test, train_reference, test_reference = train_test_split(input_data, numerical_ground_truth, train_size=0.8, shuffle=True, stratify=numerical_ground_truth)

    # Initialize the tokenizer
    tokenizer = Tokenizer(lower=True)
    # Fit the tokenizer with the text
    tokenizer.fit_on_texts(X_train)

    # Extract the vocabulary size (number of terms present) and add one (since when we will use this value in the Embedding layer of LSTM to define
    # the size of the vocab we need to take into account that the LSTM Embedding layer need also an additional space for one OOV '[UNK]' token)
    vocab_size = len(tokenizer.word_index) + 1

    # Define a list for the reviews converted to sequence
    X_train_to_seq = []
    X_test_to_seq = []
    # Save the length of the longest review sequence
    max_sequence_len = 0

    # For each review
    for review in X_train:
        # Convert the review of the train-set to a sequence of numbers and save it
        review_to_seq = tokenizer.texts_to_sequences([review])[0]
        X_train_to_seq.append(review_to_seq)

        # Update the length of the longest sequence for padding
        review_to_seq_len = len(review_to_seq)
        if review_to_seq_len > max_sequence_len:
            max_sequence_len = review_to_seq_len

    for review in X_test:
        # Convert the review of the test-set to a sequence of numbers and save it
        review_to_seq = tokenizer.texts_to_sequences([review])[0]
        X_test_to_seq.append(review_to_seq)

    # Give some information about the size of the vocab (number of different tokens) and the length of the longest sequence (max num of tokens in a review)
    print("Vocabulary size fitted on training set: {}. Max sequence length: {}".format(vocab_size, max_sequence_len))

    # If a sequence length limit has been imposed and if it is lower that the current max_sequence_length then return this value as length of the longest sequence, this allows to shorten the sequence and reduce the required quantity of RAM in training phase.
    if sequence_length_limit != 0 and sequence_length_limit < max_sequence_len:
        max_sequence_len = sequence_length_limit
        print("Sequence length limited to: {}".format(max_sequence_len))

    # Pad the sequences to make them of the same length, apply the pad to the end of the sentence (padding='post')
    X_train_padded_sequences = pad_sequences(X_train_to_seq, maxlen=max_sequence_len, padding='post')
    X_test_padded_sequences = pad_sequences(X_test_to_seq, maxlen=max_sequence_len, padding='post')
    # Return: padded train set, padded test set, reference of the padded train set, reference of the padded test set, size of the tokenizer vocabulary, length of the longest sequence
    return X_train_padded_sequences, X_test_padded_sequences, train_reference, test_reference, vocab_size, max_sequence_len

In [101]:
X_train_padded_sequences, X_test_padded_sequences, train_reference, test_reference, vocab_size, max_sequence_len = split_tokenize_pad_data(nn_data, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 26762. Max sequence length: 1012


In [102]:
# Plot the training padded sequences, we have one row for each review, and one column for each token
X_train_padded_sequences

array([[  53,   92,   53, ...,    0,    0,    0],
       [ 934, 7501, 1424, ...,    0,    0,    0],
       [1229,   77, 7508, ...,    0,    0,    0],
       ...,
       [   8,  106,  156, ...,    0,    0,    0],
       [5380, 7865,  607, ...,    0,    0,    0],
       [2506,  199,  199, ...,    0,    0,    0]])

In [103]:
# Plot the testing padded sequences, we have one row for each review, and one column for each token
X_test_padded_sequences

array([[ 231,  350,  709, ...,    0,    0,    0],
       [4825, 9461, 1964, ...,    0,    0,    0],
       [  35,  263,  188, ...,    0,    0,    0],
       ...,
       [4118,  216,   88, ...,    0,    0,    0],
       [ 182,  861, 5446, ...,    0,    0,    0],
       [ 186,  811, 1114, ...,    0,    0,    0]])

## 6.3 Building the Bi-Directional LSTM models

In [104]:
# Import the modules required to build and train the NN
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Flatten
from keras.optimizers import Adam
from keras.models import Sequential
from keras_self_attention import SeqSelfAttention

def build_keras_bidirectional_lstm_model(vocabulary_size, max_sequence_length):
    """
    Build a Bi-directional LSTM model without attention and print its summary
    :param vocabulary_size: size of the vocabulary of the input sequences
    :param max_sequence_length: max length of the input sequences
    :return: keras model ready for the training
    """
    # Build a sequential model
    model = Sequential()

    # Add an Embedding Layer with ....
    model.add(Embedding(vocabulary_size, 64, input_length=max_sequence_length))  ## Output=(batch_size[None], input_length, output_dim)  ##input_shape=(None, max_sequence_len)

    # Add a BiDirectional Layer with ....
    # The output is (#Samples, #Time steps, #LSTM units), with return_sequences=False we remove the TimeSteps, which are the different hidden states outputs. In that case we return just the output of the latest hidden state, this make us move from 3D to 2D, it is useful when we need to classify, since we are not interested in the previous values.
    # In this case we will have return_sequences=True in the first LSTM, since we will then feed another LSTM. The second LSTM will then have return_sequences=False.
    # Another case in which return_sequences=True is needed is when we need to feed an Attention Layer, since it wants all the sequence of inputs to extract the context.
    model.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=(None, 1))))

    # Add a Dropout Layer with ....
    model.add(Dropout(0.2))

    # Add a BiDirectional Layer with ....
    model.add(Bidirectional(LSTM(32))) # Here from default return_sequences=False, since then we need to classify

    # Add a Dropout Layer with ....
    model.add(Dropout(0.2))

    # Add a Dense Layer with ....
    model.add(Dense(64, activation='relu'))

    # Add a Dropout Layer with ....
    model.add(Dropout(0.1))

    # Add a Dense Layer with ....
    model.add(Dense(1, activation='sigmoid'))

    # Define optimizer
    adam = Adam(learning_rate=0.01)
    # Define loss function for training, we use 'from_logits'=True to have more precise computations, also if the 'sigmoid' function is not a logit function
    train_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Compile the model
    model.compile(loss=train_loss, optimizer=adam, metrics=['accuracy'])

    # Show summary if the model
    model.summary() # For each layer the output is (None, x, y), here None is batch size (can accept any size), x is the length of the sequence, y is the number of units
    return model

def build_keras_bidirectional_lstm_model_with_self_attention(vocabulary_size, max_sequence_length):
    """
    Build a Bi-directional LSTM model with attention and print its summary
    :param vocabulary_size: size of the vocabulary of the input sequences
    :param max_sequence_length: max length of the input sequences
    :return: keras model ready for the training
    """
    # Build a Sequential model
    model = Sequential()

    # Add an Embedding Layer with ....
    model.add(Embedding(vocabulary_size, 64, input_length=max_sequence_length))  ## Output=(batch_size[None], input_length, output_dim)  ##input_shape=(None, max_sequence_len)

    # Add a BiDirectional Layer with ....
    model.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=(None, 1)))) # return_sequences=True since we need to feed the time-steps into the self-attention layer.

    # Add an Attention Layer with ....
    model.add(SeqSelfAttention(attention_activation='sigmoid'))

    # add a Flatten layer with ....
    model.add(Flatten())

    # Add a Dense Layer with ....
    model.add(Dense(1, activation='sigmoid'))

    # Define optimizer
    adam = Adam(learning_rate=0.01)
    # Define loss function for training, we use 'from_logits'=True to have more precise computations, also if the 'sigmoid' function is not a logit function
    train_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Compile the model and return its summary
    model.compile(loss=train_loss, optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model

## 6.4 Training different models (without and with attention) over different setups

In [105]:
# Define the main training parameters
epochs = 5
epochs_att = 3
batch_size = 64
workers = 12

### 6.4.1 Training with tokenization + correction of tokenization

In [106]:
X_train_padded_sequences_1, X_test_padded_sequences_1, train_reference_1, test_reference_1, vocab_size_1, max_sequence_len_1 = split_tokenize_pad_data(data1, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 27301. Max sequence length: 1915


In [107]:
print("\nBUILDING THE MODEL:")
model_1 = build_keras_bidirectional_lstm_model(vocab_size_1, max_sequence_len_1)

print("\nTRAINING THE MODEL:")
history_1 = model_1.fit(x=X_train_padded_sequences_1, y=train_reference_1, epochs=epochs, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_1, test_reference_1), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1915, 64)          1747264   
                                                                 
 bidirectional (Bidirectiona  (None, 1915, 128)        66048     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 1915, 128)         0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None,


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [120]:
X_train_padded_sequences_1_att, X_test_padded_sequences_1_att, train_reference_1_att, test_reference_1_att, vocab_size_1_att, max_sequence_len_1_att = split_tokenize_pad_data(data1, numerical_ground_truth_vector, 700)

Vocabulary size fitted on training set: 27249. Max sequence length: 1915
Sequence length limited to: 700


In [121]:
print("\nBUILDING THE MODEL:")
model_1_attention = build_keras_bidirectional_lstm_model_with_self_attention(vocab_size_1_att, max_sequence_len_1_att)

print("\nTRAINING THE MODEL:")
history_1_attention = model_1_attention.fit(x=X_train_padded_sequences_1_att, y=train_reference_1_att, epochs=epochs_att, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_1_att, test_reference_1_att), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:



The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 700, 64)           1743936   
                                                                 
 bidirectional_2 (Bidirectio  (None, 700, 128)         66048     
 nal)                                                            
                                                                 
 seq_self_attention (SeqSelf  (None, 700, 128)         8257      
 Attention)                                                      
                                                                 
 flatten (Flatten)           (None, 89600)             0         
                                                                 
 dense_2 (Dense)             (None, 1)                 89601     
                                                                 
Total params: 1,907,842
Trainable params: 1,907,842
No


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/3
Epoch 3/3


In [123]:
# Extract accuracies from previous histories
lstm_accuracy_preprocessing1 = history_1.history['val_accuracy'][-1]
lstm_attention_accuracy_preprocessing1 = history_1_attention.history['val_accuracy'][-1]
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing1", "Bi-directional LSTM Accuracy (%)"] = round(lstm_accuracy_preprocessing1, 3)
classification_results.loc["OR + PreProcessing1", "Bi-directional LSTM-ATT Accuracy (%)"] = round(lstm_attention_accuracy_preprocessing1, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,0.565,0.855
OR + PreProcessing2,0.845,,
OR + PreProcessing3,0.841,,
OR + PreProcessing4,0.852,,
OR + PreProcessing5,0.775,,


### 6.4.2 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers

In [124]:
X_train_padded_sequences_2, X_test_padded_sequences_2, train_reference_2, test_reference_2, vocab_size_2, max_sequence_len_2 = split_tokenize_pad_data(data2, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 26878. Max sequence length: 831


In [125]:
print("\nBUILDING THE MODEL:")
model_2 = build_keras_bidirectional_lstm_model(vocab_size_2, max_sequence_len_2)

print("\nTRAINING THE MODEL:")
history_2 = model_2.fit(x=X_train_padded_sequences_2, y=train_reference_2, epochs=epochs, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_2, test_reference_2), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 831, 64)           1720192   
                                                                 
 bidirectional_3 (Bidirectio  (None, 831, 128)         66048     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 831, 128)          0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (Non


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [126]:
X_train_padded_sequences_2_att, X_test_padded_sequences_2_att, train_reference_2_att, test_reference_2_att, vocab_size_2_att, max_sequence_len_2_att = split_tokenize_pad_data(data2, numerical_ground_truth_vector, 700)

Vocabulary size fitted on training set: 26949. Max sequence length: 1052
Sequence length limited to: 700


In [127]:
print("\nBUILDING THE MODEL:")
model_2_attention = build_keras_bidirectional_lstm_model_with_self_attention(vocab_size_2_att, max_sequence_len_2_att)

print("\nTRAINING THE MODEL:")
history_2_attention = model_2_attention.fit(x=X_train_padded_sequences_2_att, y=train_reference_2_att, epochs=epochs_att, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_2_att, test_reference_2_att), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 700, 64)           1724736   
                                                                 
 bidirectional_5 (Bidirectio  (None, 700, 128)         66048     
 nal)                                                            
                                                                 
 seq_self_attention_1 (SeqSe  (None, 700, 128)         8257      
 lfAttention)                                                    
                                                                 
 flatten_1 (Flatten)         (None, 89600)             0         
                                                                 
 dense_5 (Dense)             (None, 1)                 89601     
                                                                 
Total params: 1,888,642
Trainable


The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/3
Epoch 3/3


In [132]:
# Extract accuracies from previous histories
lstm_accuracy_preprocessing2 = history_2.history['val_accuracy'][-1]
lstm_attention_accuracy_preprocessing2 = history_2_attention.history['val_accuracy'][-1]
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing2", "Bi-directional LSTM Accuracy (%)"] = round(lstm_accuracy_preprocessing2, 3)
classification_results.loc["OR + PreProcessing2", "Bi-directional LSTM-ATT Accuracy (%)"] = round(lstm_attention_accuracy_preprocessing2, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,0.565,0.855
OR + PreProcessing2,0.845,0.627,0.81
OR + PreProcessing3,0.841,,
OR + PreProcessing4,0.852,,
OR + PreProcessing5,0.775,,


### 6.4.3 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization

In [128]:
X_train_padded_sequences_3, X_test_padded_sequences_3, train_reference_3, test_reference_3, vocab_size_3, max_sequence_len_3 = split_tokenize_pad_data(data3, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 22258. Max sequence length: 1052


In [129]:
print("\nBUILDING THE MODEL:")
model_3 = build_keras_bidirectional_lstm_model(vocab_size_3, max_sequence_len_3)

print("\nTRAINING THE MODEL:")
history_3 = model_3.fit(x=X_train_padded_sequences_3, y=train_reference_3, epochs=epochs, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_3, test_reference_3), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1052, 64)          1424512   
                                                                 
 bidirectional_6 (Bidirectio  (None, 1052, 128)        66048     
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 1052, 128)         0         
                                                                 
 bidirectional_7 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (Non


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [130]:
X_train_padded_sequences_3_att, X_test_padded_sequences_3_att, train_reference_3_att, test_reference_3_att, vocab_size_3_att, max_sequence_len_3_att = split_tokenize_pad_data(data3, numerical_ground_truth_vector, 700)

Vocabulary size fitted on training set: 22333. Max sequence length: 1052
Sequence length limited to: 700


In [133]:
print("\nBUILDING THE MODEL:")
model_3_attention = build_keras_bidirectional_lstm_model_with_self_attention(vocab_size_3_att, max_sequence_len_3_att)

print("\nTRAINING THE MODEL:")
history_3_attention = model_3_attention.fit(x=X_train_padded_sequences_3_att, y=train_reference_3_att, epochs=epochs_att, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_3_att, test_reference_3_att), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 700, 64)           1429312   
                                                                 
 bidirectional_9 (Bidirectio  (None, 700, 128)         66048     
 nal)                                                            
                                                                 
 seq_self_attention_3 (SeqSe  (None, 700, 128)         8257      
 lfAttention)                                                    
                                                                 
 flatten_3 (Flatten)         (None, 89600)             0         
                                                                 
 dense_9 (Dense)             (None, 1)                 89601     
                                                                 
Total params: 1,593,218
Trainable


The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/3
Epoch 3/3


In [134]:
# Extract accuracies from previous histories
lstm_accuracy_preprocessing3 = history_3.history['val_accuracy'][-1]
lstm_attention_accuracy_preprocessing3 = history_3_attention.history['val_accuracy'][-1]
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing3", "Bi-directional LSTM Accuracy (%)"] = round(lstm_accuracy_preprocessing3, 3)
classification_results.loc["OR + PreProcessing3", "Bi-directional LSTM-ATT Accuracy (%)"] = round(lstm_attention_accuracy_preprocessing3, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,0.565,0.855
OR + PreProcessing2,0.845,0.627,0.81
OR + PreProcessing3,0.841,0.605,0.882
OR + PreProcessing4,0.852,,
OR + PreProcessing5,0.775,,


### 6.4.4  Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words

In [135]:
X_train_padded_sequences_4, X_test_padded_sequences_4, train_reference_4, test_reference_4, vocab_size_4, max_sequence_len_4 = split_tokenize_pad_data(data4, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 22266. Max sequence length: 751


In [136]:
## BEST ONE, WITH ORIGINAL NN MODEL ACHIEVES 70% accuracy at epoch 10 and 73% at epoch 5
print("\nBUILDING THE MODEL:")
model_4 = build_keras_bidirectional_lstm_model(vocab_size_4, max_sequence_len_4)

print("\nTRAINING THE MODEL:")
history_4 = model_4.fit(x=X_train_padded_sequences_4, y=train_reference_4, epochs=epochs, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_4, test_reference_4), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 751, 64)           1425024   
                                                                 
 bidirectional_10 (Bidirecti  (None, 751, 128)         66048     
 onal)                                                           
                                                                 
 dropout_9 (Dropout)         (None, 751, 128)          0         
                                                                 
 bidirectional_11 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_10 (Dense)            (Non


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [137]:
X_train_padded_sequences_4_att, X_test_padded_sequences_4_att, train_reference_4_att, test_reference_4_att, vocab_size_4_att, max_sequence_len_4_att = split_tokenize_pad_data(data4, numerical_ground_truth_vector, 700)

Vocabulary size fitted on training set: 22312. Max sequence length: 980
Sequence length limited to: 700


In [138]:
print("\nBUILDING THE MODEL:")
model_4_attention = build_keras_bidirectional_lstm_model_with_self_attention(vocab_size_4_att, max_sequence_len_4_att)

print("\nTRAINING THE MODEL:")
history_4_attention = model_4_attention.fit(x=X_train_padded_sequences_4_att, y=train_reference_4_att, epochs=epochs_att, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_4_att, test_reference_4_att), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 700, 64)           1427968   
                                                                 
 bidirectional_12 (Bidirecti  (None, 700, 128)         66048     
 onal)                                                           
                                                                 
 seq_self_attention_4 (SeqSe  (None, 700, 128)         8257      
 lfAttention)                                                    
                                                                 
 flatten_4 (Flatten)         (None, 89600)             0         
                                                                 
 dense_12 (Dense)            (None, 1)                 89601     
                                                                 
Total params: 1,591,874
Trainable


The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/3
Epoch 3/3


In [139]:
# Extract accuracies from previous histories
lstm_accuracy_preprocessing4 = history_4.history['val_accuracy'][-1]
lstm_attention_accuracy_preprocessing4 = history_4_attention.history['val_accuracy'][-1]
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing4", "Bi-directional LSTM Accuracy (%)"] = round(lstm_accuracy_preprocessing4, 3)
classification_results.loc["OR + PreProcessing4", "Bi-directional LSTM-ATT Accuracy (%)"] = round(lstm_attention_accuracy_preprocessing4, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,0.565,0.855
OR + PreProcessing2,0.845,0.627,0.81
OR + PreProcessing3,0.841,0.605,0.882
OR + PreProcessing4,0.852,0.598,0.815
OR + PreProcessing5,0.775,,


### 6.4.5 Training with tokenization + correction of tokenization + removal of punctuation, stopwords and numbers + correction of elongated words + lemmatization + removal of frequency words + handle negations (full pre-processing)

In [140]:
X_train_padded_sequences_5, X_test_padded_sequences_5, train_reference_5, test_reference_5, vocab_size_5, max_sequence_len_5 = split_tokenize_pad_data(data5, numerical_ground_truth_vector)

Vocabulary size fitted on training set: 22252. Max sequence length: 1464


In [141]:
print("\nBUILDING THE MODEL:")
model_5 = build_keras_bidirectional_lstm_model(vocab_size_5, max_sequence_len_5)

print("\nTRAINING THE MODEL:")
history_5 = model_5.fit(x=X_train_padded_sequences_5, y=train_reference_5, epochs=epochs, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_5, test_reference_5), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 1464, 64)          1424128   
                                                                 
 bidirectional_13 (Bidirecti  (None, 1464, 128)        66048     
 onal)                                                           
                                                                 
 dropout_12 (Dropout)        (None, 1464, 128)         0         
                                                                 
 bidirectional_14 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dropout_13 (Dropout)        (None, 64)                0         
                                                                 
 dense_13 (Dense)            (Non


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [142]:
X_train_padded_sequences_5_att, X_test_padded_sequences_5_att, train_reference_5_att, test_reference_5_att, vocab_size_5_att, max_sequence_len_5_att = split_tokenize_pad_data(data5, numerical_ground_truth_vector, 700)

Vocabulary size fitted on training set: 22257. Max sequence length: 1556
Sequence length limited to: 700


In [143]:
print("\nBUILDING THE MODEL:")
model_5_attention = build_keras_bidirectional_lstm_model_with_self_attention(vocab_size_5_att, max_sequence_len_5_att)

print("\nTRAINING THE MODEL:")
history_5_attention = model_5_attention.fit(x=X_train_padded_sequences_5_att, y=train_reference_5_att, epochs=epochs_att, verbose=1, batch_size=batch_size,
                        validation_data=(X_test_padded_sequences_5_att, test_reference_5_att), workers=workers, use_multiprocessing=True)


BUILDING THE MODEL:
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 700, 64)           1424448   
                                                                 
 bidirectional_15 (Bidirecti  (None, 700, 128)         66048     
 onal)                                                           
                                                                 
 seq_self_attention_5 (SeqSe  (None, 700, 128)         8257      
 lfAttention)                                                    
                                                                 
 flatten_5 (Flatten)         (None, 89600)             0         
                                                                 
 dense_15 (Dense)            (None, 1)                 89601     
                                                                 
Total params: 1,588,354
Trainabl


The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.


"`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a Sigmoid activation and thus does not represent logits. Was this intended?



Epoch 2/3
Epoch 3/3


In [144]:
# Extract accuracies from previous histories
lstm_accuracy_preprocessing5 = history_5.history['val_accuracy'][-1]
lstm_attention_accuracy_preprocessing5 = history_5_attention.history['val_accuracy'][-1]
# Add the result to the Dataframe
classification_results.loc["OR + PreProcessing5", "Bi-directional LSTM Accuracy (%)"] = round(lstm_accuracy_preprocessing5, 3)
classification_results.loc["OR + PreProcessing5", "Bi-directional LSTM-ATT Accuracy (%)"] = round(lstm_attention_accuracy_preprocessing5, 3)
classification_results

Unnamed: 0_level_0,Multinomial Naive-Bayes Accuracy (%),Bi-directional LSTM Accuracy (%),Bi-directional LSTM-ATT Accuracy (%)
Type of Processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Raw Text - FullDict,0.822,,
Raw Text - 80% Dict,0.814,,
Object Removal (OR),0.849,,
OR + PreProcessing1,0.865,0.565,0.855
OR + PreProcessing2,0.845,0.627,0.81
OR + PreProcessing3,0.841,0.605,0.882
OR + PreProcessing4,0.852,0.598,0.815
OR + PreProcessing5,0.775,0.615,0.842


## 6.5 Plot of the best results

In [145]:
# Save the history of the best model to then plot it
plot_history = history_3_attention

# Extract the training accuracy and put it into a pandas dataframe
accuracy = pd.DataFrame(plot_history.history['accuracy'], columns=["Training Accuracy"])
# Extract the testing accuracy and add it to the previous pandas dataframe
accuracy["Testing Accuracy"] = plot_history.history['val_accuracy']
# Plot the train and test accuracies over a single chart to compare them
fig = px.line(accuracy, title="Training&Testing accuracy over epochs", labels={'index':'Epochs', 'value':'Accuracy'}, template='plotly', width=800)
fig.show()

# Extract the training loss and put it into a pandas dataframe
loss = pd.DataFrame(plot_history.history['loss'], columns=["Training Loss"])
# Extract the testing loss and add it to the previous pandas dataframe
loss["Testing Loss"] = plot_history.history['val_loss']
# Plot the train and test losses over a single chart to compare them
fig = px.line(loss, title="Training&Testing loss over epochs", labels={'index':'Epochs', 'value':'Loss'}, template='plotly', width=800)
fig.show()

## 6.6 Evaluation of the best results

In [147]:
# Compute the prediction for the test-set with the best performing model
predictions = model_3_attention.predict(X_test_padded_sequences_3_att)

# Convert the probability predictions into binary predictions
binary_prediction = []
for pred in predictions:
    # If the probability of the review to be positive is greater than 50% then classify it as positive (1)
    if pred >= 0.5:
        binary_prediction.append(1)
    # Otherwise classify it as negative
    else:
        binary_prediction.append(0)



In [148]:
# Plot some metrics regarding the previous predictions
report = classification_report(test_reference_3_att, binary_prediction, target_names=["negative", "positive"])
print(report)

              precision    recall  f1-score   support

    negative       0.90      0.85      0.88       200
    positive       0.86      0.91      0.89       200

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



In [149]:
# Compute and show the confusion matrix for the previous predictions
display("Confusion matrix for the latest prediction:")
generate_confusion_matrix(test_reference_3_att, binary_prediction, True, False)

'Confusion matrix for the latest prediction:'

Unnamed: 0,Positive,Negative
Positive,171,29
Negative,18,182
