# CS-344 Artificial Intelligence - Final Project
## Project: SLO TBL Topic Classification
## Author: Joseph Jinn
## Date: 5-3-19

# Final Project Proposal - Draft 2 - Vision Statement

A short summary and background of the project.

# Final Project Proposal - Draft 2 - Code-Base

## SLO_TBL_Tweet_Preprocessor_Specialized.py

This Python program pre-processes several datasets in preparation for TBL topic classification using Scikit-Learn Classifiers.

In [1]:
"""
Course: CS 344 - Artificial Intelligence
Instructor: Professor VanderLinden
Name: Joseph Jinn
Date: 4-23-19

Final Project - SLO TBL Topic Classification

###########################################################
Notes:

Each of these functions performs Tweet pre-processing SPECIFIC TO A SINGLE DATASET ONLY.
This is NOT a generalized Tweet dataset preprocessor!!!

###########################################################
Resources Used:

Refer to slo_topic_classification_original.py for a full list of URLs to the online resources referenced.

"""

################################################################################################################

import re
import string
import time
import warnings
import pandas as pd
import logging as log

# Debug output is only visible when the logging level is DEBUG *and* the relevant debug flag is switched on.
log.basicConfig(level=log.DEBUG)

# Display and warning tweaks for pandas and python.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', DeprecationWarning)

# Flip to enable or disable the debug output of the pre-processor functions.
debug = True


################################################################################################################
def tweet_dataset_preprocessor_1():
    """
    Function pre-processes tbl_training_set.csv in preparation for machine learning input feature creation.

    Every example that carries several SLO TBL topic classification labels is duplicated into one example per
    label, so each row of the saved CSV file holds a single Tweet and a single label.

    :return: Nothing. Saves to CSV file.
    """
    label_columns = ['SLO1', 'SLO2', 'SLO3']

    # Built once and shared by every call of preprocess_tweet_text().
    punctuation_table = str.maketrans('', '', string.punctuation)

    def log_labeled_rows(dataframe, columns):
        """
        Debug helper logs one line per row: the Tweet followed by "<column>: <value>" for each given column.

        :param dataframe: dataframe with a 'Tweet' column and the given label columns.
        :param columns: names of the label columns to log.
        :return: Nothing.
        """
        for values in zip(*(dataframe[column] for column in ['Tweet'] + list(columns))):
            # str() on the Tweet keeps the debug output from crashing on a missing (NaN) Tweet.
            message = str(values[0])
            for column, label in zip(columns, values[1:]):
                message += '\t' + column + ': ' + str(label)
            log.debug(message)

    def preprocess_tweet_text(tweet_text):
        """
        Helper function performs text pre-processing using regular expressions and other Python functions.

        Notes:

        Stop words are retained.

        TODO - shrink character elongations
        TODO - remove non-english tweets
        TODO - remove non-company associated tweets
        TODO - remove year and time.
        TODO - remove cash items?

        :param tweet_text: the already down-cased text of a single Tweet.
        :return: the pre-processed Tweet text.
        """
        # Remove stand-alone "rt" tags only.  Without the word boundaries the pattern also strips the letters
        # "rt" out of ordinary words such as "start" or "support".
        preprocessed_tweet_text = re.sub(r"\brt\b", "", tweet_text)

        # Replace URL's with a placeholder token.
        preprocessed_tweet_text = re.sub(r"http[s]?://\S+", "slo_url", preprocessed_tweet_text)

        # Replace Tweet mentions with a placeholder token.
        preprocessed_tweet_text = re.sub(r"@\S+", "slo_mention", preprocessed_tweet_text)

        # Replace Tweet hashtags with a placeholder token.
        preprocessed_tweet_text = re.sub(r"#\S+", "slo_hashtag", preprocessed_tweet_text)

        # Remove all punctuation.  Note: string.punctuation contains "_", so the placeholder tokens above lose
        # their underscore here as well.
        return preprocessed_tweet_text.translate(punctuation_table)

    ################################################################################################################

    # Import the dataset.
    slo_dataframe = pd.read_csv("tbl-datasets/tbl_training_set.csv", sep=",")

    # Shuffle the data randomly.  DataFrame.sample() is used because the pd.np alias no longer exists in
    # current pandas releases.
    slo_dataframe = slo_dataframe.sample(frac=1)

    if debug:
        # Print shape and the first rows (still under the column names found in the CSV file).
        log.debug("The shape of our SLO dataframe:")
        log.debug(slo_dataframe.shape)
        log.debug("\n")
        log.debug("The columns of our SLO dataframe:")
        log.debug(slo_dataframe.head())
        log.debug("\n")

    # Rename columns to something that makes sense.
    slo_dataframe.columns = ['Tweet'] + label_columns

    ################################################################################################################

    # Drop all rows without at least 2 non NaN values - indicating no existing SLO TBL topic classification labels.
    # (This also removes the rows that are NaN in every column.)
    slo_dataframe = slo_dataframe.dropna(thresh=2)

    if debug:
        # Iterate through each row and check we dropped properly.
        log.debug("Dataframe with only examples that have SLO TBL topic classification labels:")
        log_labeled_rows(slo_dataframe, label_columns)
        log.debug("\n")
        log.debug(
            "Shape of dataframe with only examples that have SLO TBL topic classifications: " + str(
                slo_dataframe.shape))
        log.debug("\n")

    #######################################################

    # Boolean indexing to select examples with only a single SLO TBL topic classification.
    mask = slo_dataframe['SLO1'].notna() & (slo_dataframe['SLO2'].isna() & slo_dataframe['SLO3'].isna())

    if debug:
        # Check that boolean indexing is working.
        log.debug("Check that our boolean indexing mask gives only examples with a single SLO TBL topic "
                  "classification:")
        log.debug(mask.tail())
        log.debug("\n")
        log.debug("The shape of our boolean indexing mask:")
        log.debug(mask.shape)
        log.debug("\n")

    # Create new dataframe with examples that have only a single SLO TBL topic classification.
    slo_dataframe_single_classification = slo_dataframe[mask]

    if debug:
        # Check that only examples with a single SLO TBL topic classification are left.
        log.debug("Dataframe with only examples that have a single SLO TBL topic classification labels:")
        log_labeled_rows(slo_dataframe_single_classification, label_columns)
        log.debug("\n")
        log.debug("Shape of dataframe with only examples that have a single SLO TBL topic classification: "
                  + str(slo_dataframe_single_classification.shape))
        log.debug("\n")

    #######################################################

    # Drop SLO2 and SLO3 columns as they are just NaN values.
    slo_dataframe_single_classification = slo_dataframe_single_classification.drop(columns=['SLO2', 'SLO3'])

    if debug:
        # Check that we have dropped SLO2 and SLO3 columns properly.
        log.debug("Dataframe with SLO2 and SLO3 columns dropped as they are just NaN values:")
        log_labeled_rows(slo_dataframe_single_classification, ['SLO1'])
        log.debug("\n")
        log.debug(
            "Shape of dataframe with SLO2 and SLO3 columns dropped: " + str(slo_dataframe_single_classification.shape))
        log.debug("\n")

    # Re-name columns for clarity of purpose.
    slo_dataframe_single_classification = slo_dataframe_single_classification.rename(columns={'SLO1': 'SLO'})

    #######################################################

    # Boolean indexing to select examples with multiple SLO TBL topic classifications.
    mask = slo_dataframe['SLO1'].notna() & (slo_dataframe['SLO2'].notna() | slo_dataframe['SLO3'].notna())

    if debug:
        # Check that boolean indexing is working.
        log.debug(
            "Check that our boolean indexing mask gives only examples with multiple SLO TBL topic classifications:")
        log.debug(mask.tail())
        log.debug("\n")
        log.debug("The shape of our boolean indexing mask:")
        log.debug(mask.shape)
        log.debug("\n")

    # Create new dataframe with only those examples with multiple SLO TBL topic classifications.
    slo_dataframe_multiple_classifications = slo_dataframe[mask]

    if debug:
        # Check that only examples with multiple SLO TBL topic classifications are left.
        log.debug("Dataframe with only examples that have multiple SLO TBL topic classification labels:")
        log_labeled_rows(slo_dataframe_multiple_classifications, label_columns)
        log.debug("\n")
        log.debug("Shape of dataframe with only examples that have multiple SLO TBL topic classifications: "
                  + str(slo_dataframe_multiple_classifications.shape))
        log.debug("\n")

    #######################################################

    # Duplicate examples with multiple SLO TBL classifications into examples with only 1 SLO TBL topic classification
    # each: one dataframe per label column, every one of them holding the Tweet plus that single label.
    multiple_label_frames = [slo_dataframe_multiple_classifications[['Tweet', column]] for column in label_columns]

    if debug:
        # Check that we have created the new dataframes properly.
        log.debug(
            "Separated dataframes with only a single label for examples with multiple SLO TBL topic classification "
            "labels:")
        for column, frame in zip(label_columns, multiple_label_frames):
            log_labeled_rows(frame, [column])
        log.debug("\n")
        for column, frame in zip(label_columns, multiple_label_frames):
            log.debug("Shape of " + column.lower() + "_dataframe: " + str(frame.shape))
        log.debug("\n")

    # Re-name columns for clarity of purpose.
    frames = [frame.rename(columns={column: 'SLO'}) for column, frame in zip(label_columns, multiple_label_frames)]

    #######################################################

    # Concatenate the individual dataframes back together.
    frames.append(slo_dataframe_single_classification)
    slo_dataframe_combined = pd.concat(frames, ignore_index=True)

    if debug:
        # Check that we have recombined the dataframes properly.
        log.debug("Recombined individual dataframes for the dataframe representing Tweets with only a single SLO TBL "
                  "topic classification\nand for the dataframes representing Tweets with multiple SLO TBL topic "
                  "classifications:")
        log_labeled_rows(slo_dataframe_combined, ['SLO'])
        log.debug('\n')
        log.debug('Shape of recombined dataframes: ' + str(slo_dataframe_combined.shape))
        log.debug('\n')

    #######################################################

    # Drop all rows that contain a NaN in any column (e.g. the missing 2nd or 3rd label of a split example).
    slo_dataframe_combined = slo_dataframe_combined.dropna()

    if debug:
        # Check that we have dropped all NaN's properly.
        log.debug("Recombined dataframes - NaN examples removed:")
        log_labeled_rows(slo_dataframe_combined, ['SLO'])
        log.debug('\n')
        log.debug('Shape of recombined dataframes without NaN examples: ' + str(slo_dataframe_combined.shape))
        log.debug('\n')

    #######################################################

    # Drop duplicate examples with the same SLO TBL topic classification class.
    # NOTE(review): keep=False removes EVERY copy of a duplicated (Tweet, SLO) pair, not just the extra ones -
    # confirm this is intended rather than keep='first'.
    slo_dataframe_tbl_duplicates_dropped = slo_dataframe_combined.drop_duplicates(subset=['Tweet', 'SLO'], keep=False)

    if debug:
        # Check that we have dropped all duplicate labels properly.
        log.debug("Duplicate examples with duplicate SLO TBL topic classifications removed:")
        log_labeled_rows(slo_dataframe_tbl_duplicates_dropped, ['SLO'])
        log.debug('\n')
        log.debug(
            'Shape of dataframes without duplicate TBL labels: ' + str(slo_dataframe_tbl_duplicates_dropped.shape))
        log.debug('\n')

    #######################################################

    # Down-case all text and pre-process each tweet.  The whole column is replaced in one step on a new dataframe;
    # element-wise chained assignment (df['Tweet'][index] = ...) may end up writing to a temporary copy.
    slo_df_tokenized = slo_dataframe_tbl_duplicates_dropped.assign(
        Tweet=slo_dataframe_tbl_duplicates_dropped['Tweet'].str.lower().apply(preprocess_tweet_text))

    # Reindex everything.
    slo_df_tokenized = slo_df_tokenized.reset_index(drop=True)

    # Save to CSV file.
    slo_df_tokenized.to_csv("preprocessed-datasets/tbl_training_set_PROCESSED.csv", sep=',',
                            encoding='utf-8', index=False, header=['Tweet', 'SLO'])


################################################################################################################

def tweet_dataset_preprocessor_2():
    """
    Function pre-processes tbl_kvlinden.csv in preparation for machine learning input feature creation.

    Every example contributes one row for its first label; examples that also have a second label contribute an
    additional row for that second label.

    :return: Nothing. Saves to CSV file.
    """
    # Built once and shared by every call of preprocess_tweet_text().
    punctuation_table = str.maketrans('', '', string.punctuation)

    def log_labeled_rows(dataframe, columns):
        """
        Debug helper logs one line per row: the Tweet followed by "<column>: <value>" for each given column.

        :param dataframe: dataframe with a 'Tweet' column and the given label columns.
        :param columns: names of the label columns to log.
        :return: Nothing.
        """
        for values in zip(*(dataframe[column] for column in ['Tweet'] + list(columns))):
            message = str(values[0])
            for column, label in zip(columns, values[1:]):
                message += '\t' + column + ': ' + str(label)
            log.debug(message)

    def preprocess_tweet_text(tweet_text):
        """
        Helper function performs text pre-processing using regular expressions and other Python functions.

        Notes:

        Stop words are retained.

        TODO - shrink character elongations
        TODO - remove non-english tweets
        TODO - remove non-company associated tweets
        TODO - remove year and time.
        TODO - remove cash items?

        :param tweet_text: the already down-cased text of a single Tweet.
        :return: the pre-processed Tweet text.
        """
        # Remove stand-alone "rt" tags only.  Without the word boundaries the pattern also strips the letters
        # "rt" out of ordinary words such as "start" or "support".
        preprocessed_tweet_text = re.sub(r"\brt\b", "", tweet_text)

        # Replace URL's with a placeholder token.
        preprocessed_tweet_text = re.sub(r"http[s]?://\S+", "slo_url", preprocessed_tweet_text)

        # Replace Tweet mentions with a placeholder token.
        preprocessed_tweet_text = re.sub(r"@\S+", "slo_mention", preprocessed_tweet_text)

        # Replace Tweet hashtags with a placeholder token.
        preprocessed_tweet_text = re.sub(r"#\S+", "slo_hashtag", preprocessed_tweet_text)

        # Remove all punctuation.  Note: string.punctuation contains "_", so the placeholder tokens above lose
        # their underscore here as well.
        return preprocessed_tweet_text.translate(punctuation_table)

    #######################################################

    # Import the dataset.
    slo_dataframe = pd.read_csv("tbl-datasets/tbl_kvlinden.csv", sep=",")

    # Shuffle the data randomly.  DataFrame.sample() is used because the pd.np alias no longer exists in
    # current pandas releases.
    slo_dataframe = slo_dataframe.sample(frac=1)

    if debug:
        # Print shape and the first rows (still under the column names found in the CSV file).
        log.debug("The shape of our SLO dataframe:")
        log.debug(slo_dataframe.shape)
        log.debug("\n")
        log.debug("The columns of our SLO dataframe:")
        log.debug(slo_dataframe.head())
        log.debug("\n")

    # Rename columns to something that makes sense.
    slo_dataframe.columns = ['Tweet', 'SLO1', 'SLO2']

    if debug:
        log.debug("The Tweets column:")
        log.debug(slo_dataframe['Tweet'])
        log.debug("\n")
        log.debug("The SLO column:")
        log.debug(slo_dataframe['SLO1'])
        log.debug("\n")
        log.debug("The 2nd SLO column:")
        log.debug(slo_dataframe['SLO2'])
        log.debug("\n")

    #######################################################

    # Restrict to just SLO1 column by dropping SLO2 column.
    slo_dataframe_column1 = slo_dataframe.drop('SLO2', axis=1)

    if debug:
        log.debug("The shape of dataframe with only slo column1:")
        log.debug(slo_dataframe_column1.shape)
        log.debug("\n")
        log.debug("The contents of the dataframe with only slo column1:")
        log.debug(slo_dataframe_column1.sample())
        log.debug("\n")

    #######################################################

    # Drop any row with "NaN" columns. (isolates examples with multiple TBL classification labels)
    slo_dataframe_column2 = slo_dataframe.dropna()

    if debug:
        log.debug("The contents of the dataframe with only examples containing multiple classifications")
        log_labeled_rows(slo_dataframe_column2, ['SLO1', 'SLO2'])
        log.debug("\n")

    # Drop SLO1 column to restrict to just 2nd classification label in SLO2 column.
    slo_dataframe_column2 = slo_dataframe_column2.drop('SLO1', axis=1)

    #######################################################

    # Rename columns for concatenation back into a single dataframe.
    slo_dataframe_column1 = slo_dataframe_column1.rename(columns={'SLO1': 'SLO'})
    slo_dataframe_column2 = slo_dataframe_column2.rename(columns={'SLO2': 'SLO'})

    if debug:
        log.debug("Check that we have renamed columns properly:")
        log.debug(slo_dataframe_column1.head())
        log.debug(slo_dataframe_column2.head())
        log.debug("\n")

    # Concatenate the individual dataframes back together.  ignore_index=True already leaves the result with a
    # fresh 0..n-1 index, so no separate re-indexing step is needed afterwards.
    slo_dataframe_combined = pd.concat([slo_dataframe_column1, slo_dataframe_column2], ignore_index=True)

    if debug:
        log.debug("Check that we have concatenated properly:")
        log.debug(slo_dataframe_combined.shape)
        log.debug("\n")
        log.debug(slo_dataframe_combined.tail())
        log.debug("\n")

    #######################################################

    # Down-case all text and pre-process each tweet.  The whole column is replaced in one step; element-wise
    # chained assignment (df['Tweet'][index] = ...) may end up writing to a temporary copy.
    slo_dataframe_combined['Tweet'] = slo_dataframe_combined['Tweet'].str.lower().apply(preprocess_tweet_text)

    if debug:
        log.debug("Check that we have pre-processed properly:")
        log_labeled_rows(slo_dataframe_combined, ['SLO'])
        log.debug("\n")

    # Save to CSV file.
    slo_dataframe_combined.to_csv("preprocessed-datasets/tbl_kvlinden_PROCESSED.csv", sep=',',
                                  encoding='utf-8', index=False, header=['Tweet', 'SLO'])


################################################################################################################

def tweet_dataset_preprocessor_3():
    """
    Function pre-processes dataset_20100101-20180510_tok.csv in preparation for machine learning input feature creation.

    Note: We are doing this as our pre-processing for the other datasets we are using is different from the
    pre-processing done on this already tokenized dataset.  Hence, we wish to normalize the difference between them
    as much as possible before using this dataset as our prediction set.

    Only the 'tweet_t' column is modified; all other columns are written back to the CSV file unchanged.

    :return: Nothing. Saves to CSV file.
    """
    # Built once and shared by every call of preprocess_tweet_text().
    punctuation_table = str.maketrans('', '', string.punctuation)

    def preprocess_tweet_text(tweet_text):
        """
        Helper function performs text pre-processing using regular expressions and other Python functions.

        Notes:

        Stop words are retained.

        TODO - shrink character elongations
        TODO - remove non-english tweets
        TODO - remove non-company associated tweets
        TODO - remove year and time.
        TODO - remove cash items?

        :param tweet_text: the already down-cased text of a single Tweet.
        :return: the pre-processed Tweet text.
        """
        # Remove stand-alone "rt" tags only.  Without the word boundaries the pattern also strips the letters
        # "rt" out of ordinary words such as "start" or "support".
        preprocessed_tweet_text = re.sub(r"\brt\b", "", tweet_text)

        # Replace URL's with a placeholder token.
        preprocessed_tweet_text = re.sub(r"http[s]?://\S+", "slo_url", preprocessed_tweet_text)

        # Replace Tweet mentions with a placeholder token.
        preprocessed_tweet_text = re.sub(r"@\S+", "slo_mention", preprocessed_tweet_text)

        # Replace Tweet hashtags with a placeholder token.
        preprocessed_tweet_text = re.sub(r"#\S+", "slo_hashtag", preprocessed_tweet_text)

        # Remove all punctuation.  Note: string.punctuation contains "_", so the placeholder tokens above lose
        # their underscore here as well.
        return preprocessed_tweet_text.translate(punctuation_table)

    #######################################################

    # Import the dataset.
    slo_dataframe_cmu = pd.read_csv("borg-SLO-classifiers/dataset_20100101-20180510_tok.csv", sep=",")

    # Shuffle the data randomly.  DataFrame.sample() is used because the pd.np alias no longer exists in
    # current pandas releases.
    slo_dataframe_cmu = slo_dataframe_cmu.sample(frac=1)

    # Debug output is guarded by the debug flag, like in the other pre-processor functions.
    if debug:
        # Print shape and the first rows.
        log.debug("\n")
        log.debug("The shape of our SLO CMU dataframe:")
        log.debug(slo_dataframe_cmu.shape)
        log.debug("\n")
        log.debug("The columns of our SLO CMU dataframe:")
        log.debug(slo_dataframe_cmu.head())
        log.debug("\n")

    #######################################################

    # Down-case all text and pre-process each tweet.  The whole column is replaced in one step; element-wise
    # chained assignment (df['tweet_t'][index] = ...) is slow and may end up writing to a temporary copy.
    slo_dataframe_cmu['tweet_t'] = slo_dataframe_cmu['tweet_t'].str.lower().apply(preprocess_tweet_text)

    # Reindex everything.
    slo_dataframe_cmu = slo_dataframe_cmu.reset_index(drop=True)

    # Check what we are using for predictions.
    if debug:
        # Create input features.
        processed_features_cmu = slo_dataframe_cmu['tweet_t'].copy()

        log.debug("The shape of our SLO CMU feature dataframe:")
        log.debug(processed_features_cmu.shape)
        log.debug("\n")
        log.debug("The columns of our SLO CMU feature dataframe:")
        log.debug(processed_features_cmu.head())
        log.debug("\n")

    # Save to CSV file.
    slo_dataframe_cmu.to_csv("preprocessed-datasets/dataset_20100101-20180510_tok_PROCESSED.csv", sep=',',
                             encoding='utf-8', index=False)


################################################################################################################

############################################################################################
# Main function.  Execute the program.
#
# Note: Used to individually test that the preprocessors function as intended.

# Debug variable.
debug_main = 0

if __name__ == '__main__':

    start_time = time.time()

    # Comment or uncomment in order to run the associated tweet preprocessor module.
    # tweet_dataset_preprocessor_1()
    # tweet_dataset_preprocessor_2()
    # tweet_dataset_preprocessor_3()

    end_time = time.time()

    if debug:
        # Report the wall-clock time spent between the two timestamps above.
        time_taken = end_time - start_time
        for message in ("\n", "Time taken to run pre-processor function:", time_taken, "\n"):
            log.debug(message)

############################################################################################


DEBUG:root:

DEBUG:root:Time taken to run pre-processor function:
DEBUG:root:0.0
DEBUG:root:



## slo_topic_classification_clean.py

This Python program compares the accuracy metrics obtained by training various Scikit-Learn non-NN Classifiers and one NN Classifier on a small labeled TBL topic classification dataset.  It also makes TBL topic predictions on a large 600k+ preprocessed Twitter dataset for each trained Classifier.

In [1]:
"""
Course: CS 344 - Artificial Intelligence
Instructor: Professor VanderLinden
Name: Joseph Jinn
Date: 4-23-19

Final Project - SLO TBL Topic Classification

###########################################################
Notes:

Utilizes Scikit-Learn machine learning algorithms for fast prototyping and topic classification using a variety
of Classifiers.

TODO - resolve SettingWithCopyWarning.

TODO - implement data visualizations via matplotlib and Seaborn.

TODO - attempt to acquire additional labeled Tweets for topic classification using pattern matching and pandas queries.
TODO - reference settings.py and autocoding.py for template of how to do this.

TODO - revise report.ipynb and paper as updates are made to implementation and code-base.

###########################################################
Resources Used:

Refer to original un-cleaned version.

https://scikit-plot.readthedocs.io/en/stable/index.html
(visualizations simplified)

"""

################################################################################################################
################################################################################################################

import logging as log
import warnings
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import nltk as nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn import metrics

#############################################################

# Note: FIXME - indicates unresolved import error, but still runs fine.
# noinspection PyUnresolvedReferences
from SLO_TBL_Tweet_Preprocessor_Specialized import tweet_dataset_preprocessor_1, tweet_dataset_preprocessor_2, \
    tweet_dataset_preprocessor_3

#############################################################

# Debug output is only visible when the logging level is DEBUG *and* the matching debug flag below is switched on.
log.basicConfig(level=log.DEBUG)
tf.logging.set_verbosity(tf.logging.ERROR)

# Display and warning tweaks for pandas and python.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', DeprecationWarning)

# One switch per sub-section; flip a flag to True to see that sub-section's debug output.
debug = False
debug_pipeline = False
debug_preprocess_tweets = False
debug_train_test_set_creation = False
debug_classifier_iterations = False
debug_create_prediction_set = False
debug_make_predictions = False

################################################################################################################
################################################################################################################

# Import the datasets written by the specialized Tweet pre-processors.
tweet_dataset_processed1 = \
    pd.read_csv("preprocessed-datasets/tbl_kvlinden_PROCESSED.csv", sep=",")

tweet_dataset_processed2 = \
    pd.read_csv("preprocessed-datasets/tbl_training_set_PROCESSED.csv", sep=",")

# Shuffle the data randomly.  DataFrame.sample() is used because the pd.np alias no longer exists in current
# pandas releases.
tweet_dataset_processed1 = tweet_dataset_processed1.sample(frac=1)
tweet_dataset_processed2 = tweet_dataset_processed2.sample(frac=1)

# Generate a Pandas dataframe.
tweet_dataframe_processed1 = pd.DataFrame(tweet_dataset_processed1)
tweet_dataframe_processed2 = pd.DataFrame(tweet_dataset_processed2)

if debug_preprocess_tweets:
    # Print shape and first rows of both dataframes.
    log.debug("\n")
    log.debug("The shape of our SLO dataframe 1:")
    log.debug(tweet_dataframe_processed1.shape)
    log.debug("\n")
    log.debug("The columns of our SLO dataframe 1:")
    log.debug(tweet_dataframe_processed1.head())
    log.debug("\n")
    log.debug("\n")
    log.debug("The shape of our SLO dataframe 2:")
    log.debug(tweet_dataframe_processed2.shape)
    log.debug("\n")
    log.debug("The columns of our SLO dataframe 2:")
    log.debug(tweet_dataframe_processed2.head())
    log.debug("\n")

# Concatenate the individual datasets together.
frames = [tweet_dataframe_processed1, tweet_dataframe_processed2]
slo_dataframe_combined = pd.concat(frames, ignore_index=True)

# Reindex everything.
slo_dataframe_combined.index = pd.RangeIndex(len(slo_dataframe_combined.index))

# Names of the columns used from the combined dataframe.
tweet_dataframe_processed_column_names = ['Tweet', 'SLO']

# Create input features.  The copy keeps later modifications from touching the combined dataframe.
selected_features = slo_dataframe_combined[tweet_dataframe_processed_column_names]
processed_features = selected_features.copy()

if debug_preprocess_tweets:
    # Check what we are using as inputs.
    log.debug("\n")
    log.debug("The Tweets in our input feature:")
    log.debug(processed_features['Tweet'])
    log.debug("\n")
    log.debug("SLO TBL topic classification label for each Tweet:")
    log.debug(processed_features['SLO'])
    log.debug("\n")

# Create feature set and target sets.
slo_feature_set = processed_features['Tweet']
slo_target_set = processed_features['SLO']


#######################################################

def create_training_and_test_set():
    """
    Function splits the feature and target sets into randomized training and test sets.

    Note: A new random seed is drawn on every call, so repeated calls produce different splits.  We use this to
    generate a randomized training and target set in order to average our results over n iterations.

    The target labels are also encoded into integers.  The encoder is fitted once on the full target set so that
    the training and test encodings always share the same label-to-integer mapping, even if one split happens to
    be missing a class.

    :return: Nothing.  Global variables are established (tweet_train, tweet_test, target_train, target_test,
        target_train_encoded, target_test_encoded).
    """
    global tweet_train, tweet_test, target_train, target_test, target_train_encoded, target_test_encoded

    import random

    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split

    # random_state = rng (where rng = random number seed generator)
    rng = random.randint(1, 1000000)
    # Split feature and target set into training and test sets for each set.
    tweet_train, tweet_test, target_train, target_test = train_test_split(slo_feature_set, slo_target_set,
                                                                          test_size=0.33,
                                                                          random_state=rng)

    if debug_train_test_set_creation:
        # Note: ".shape" is read directly; the older "Series.data" attribute is not available in newer pandas.
        log.debug("Shape of tweet training set:")
        log.debug(tweet_train.shape)
        log.debug("Shape of tweet test set:")
        log.debug(tweet_test.shape)
        log.debug("Shape of target training set:")
        log.debug(target_train.shape)
        log.debug("Shape of target test set:")
        log.debug(target_test.shape)
        log.debug("\n")

    #######################################################

    # Use Sci-kit learn to encode labels into integer values - one assigned integer value per class.
    # Fit once on every known label, then only transform each split: re-fitting on the test labels would build a
    # second, possibly different, mapping.
    target_label_encoder = preprocessing.LabelEncoder()
    target_label_encoder.fit(slo_target_set)
    target_train_encoded = target_label_encoder.transform(target_train)
    target_test_encoded = target_label_encoder.transform(target_test)

    if debug_train_test_set_creation:
        # Decoding is only needed to eyeball the round trip, so it is done inside the debug branch.
        target_train_decoded = target_label_encoder.inverse_transform(target_train_encoded)
        target_test_decoded = target_label_encoder.inverse_transform(target_test_encoded)

        log.debug("Encoded target training labels:")
        log.debug(target_train_encoded)
        log.debug("Decoded target training labels:")
        log.debug(target_train_decoded)
        log.debug("\n")
        log.debug("Encoded target test labels:")
        log.debug(target_test_encoded)
        log.debug("Decoded target test labels:")
        log.debug(target_test_decoded)
        log.debug("\n")


#######################################################

def scikit_learn_multinomialnb_classifier_non_pipeline():
    """
    Function trains a Multinomial Naive Bayes Classifier without using a Pipeline.

    Note: Implemented for educational purposes - so I can see the manual workflow, otherwise the Pipeline Class hides
    these details and we only have to tune parameters.

    The manual workflow is: create a random train/test split => vectorize the Tweets into bag-of-words counts =>
    re-weight the counts with TfidfTransformer => fit the classifier on the encoded training labels => predict on
    the test set and log the accuracy.

    :return: none.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.naive_bayes import MultinomialNB

    # Create the training and test sets from the feature and target sets.
    create_training_and_test_set()

    # Use Sci-kit learn to tokenize each Tweet and convert into a bag-of-words sparse feature vector.
    # The vocabulary is learned from the training Tweets only; the test Tweets are just transformed with it.
    vectorizer = CountVectorizer(min_df=0, lowercase=False)
    tweet_train_encoded = vectorizer.fit_transform(tweet_train)
    tweet_test_encoded = vectorizer.transform(tweet_test)

    if debug:
        log.debug("Vectorized tweet training set:")
        log.debug(tweet_train_encoded)
        log.debug("Vectorized tweet testing set:")
        log.debug(tweet_test_encoded)
        log.debug("Shape of the tweet training set:")
        log.debug(tweet_train_encoded.shape)
        log.debug("Shape of the tweet testing set:")
        log.debug(tweet_test_encoded.shape)

    #######################################################

    # Use Sci-kit learn to convert each tokenized Tweet into term frequencies.
    # As above: fit on the training data only, then apply the fitted transformer to the test data.
    tfidf_transformer = TfidfTransformer()

    tweet_train_encoded_tfidf = tfidf_transformer.fit_transform(tweet_train_encoded)
    tweet_test_encoded_tfidf = tfidf_transformer.transform(tweet_test_encoded)

    if debug:
        log.debug("vectorized tweet training set term frequencies down-sampled:")
        log.debug(tweet_train_encoded_tfidf)
        log.debug("Shape of the tweet training set term frequencies down-sampled: ")
        log.debug(tweet_train_encoded_tfidf.shape)
        log.debug("\n")
        log.debug("vectorized tweet test set term frequencies down-sampled:")
        log.debug(tweet_test_encoded_tfidf)
        log.debug("Shape of the tweet test set term frequencies down-sampled: ")
        log.debug(tweet_test_encoded_tfidf.shape)
        log.debug("\n")

    #######################################################

    # Train the Multinomial Naive Bayes Classifier.
    clf_multinomial_nb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf_multinomial_nb.fit(tweet_train_encoded_tfidf, target_train_encoded)

    # Predict using the Multinomial Naive Bayes Classifier.
    clf_multinomial_nb_predict = clf_multinomial_nb.predict(tweet_test_encoded_tfidf)

    # Note: the logging module formats "msg % args", so the value needs a "%s" placeholder in the message;
    # passing it as a bare extra argument produces a logging error instead of the accuracy.
    log.debug("MultinomialNB Classifier accuracy using accuracy_score() function : %s",
              accuracy_score(target_test_encoded, clf_multinomial_nb_predict, normalize=True))
    log.debug("\n")

    # Another method of obtaining accuracy metric.
    log.debug("Accuracy for test set predictions using multinomialNB:")
    log.debug(str(np.mean(clf_multinomial_nb_predict == target_test_encoded)))
    log.debug("\n")

    # View the results as Tweet => predicted topic classification label.
    for doc, category in zip(tweet_test, clf_multinomial_nb_predict):
        log.debug('%r => %s', doc, category)


################################################################################################################
def create_prediction_set():
    """
    Function prepares the borg-classifier dataset to be used for predictions in trained models.

    The pre-processed CSV file is read, its rows are shuffled, and only the 'tweet_t' column is kept (renamed to
    'Tweets').

    :return: the prepared dataset - a Pandas dataframe with the single column 'Tweets'.
    """
    # Imported locally so the shuffle does not depend on the deprecated "pd.np" alias.
    import numpy as np

    # Import the dataset (read_csv already returns a Pandas dataframe).
    slo_dataframe_cmu = \
        pd.read_csv("preprocessed-datasets/dataset_20100101-20180510_tok_PROCESSED.csv", sep=",")

    # Shuffle the data randomly.
    slo_dataframe_cmu = slo_dataframe_cmu.reindex(
        np.random.permutation(slo_dataframe_cmu.index))

    if debug_create_prediction_set:
        # Print shape and column names.
        # Note: head() has to be called - logging the bare attribute only logs the bound method object.
        log.debug("\n")
        log.debug("The shape of our SLO CMU dataframe:")
        log.debug(slo_dataframe_cmu.shape)
        log.debug("\n")
        log.debug("The columns of our SLO CMU dataframe:")
        log.debug(slo_dataframe_cmu.head())
        log.debug("\n")

    # Reindex everything so the shuffled rows are labelled 0..n-1 again.
    slo_dataframe_cmu.index = pd.RangeIndex(len(slo_dataframe_cmu.index))

    # Create input features.
    # Note: using "filter()" - other methods seems to result in shape of (658982, ) instead of (658982, 1)
    selected_features_cmu = slo_dataframe_cmu.filter(['tweet_t'])
    processed_features_cmu = selected_features_cmu.copy()

    # Rename column.
    processed_features_cmu.columns = ['Tweets']

    if debug_create_prediction_set:
        # Print shape and column names.
        log.debug("\n")
        log.debug("The shape of our processed features:")
        log.debug(processed_features_cmu.shape)
        log.debug("\n")
        log.debug("The columns of our processed features:")
        log.debug(processed_features_cmu.head())
        log.debug("\n")

        # Check what we are using as inputs.
        log.debug("\n")
        log.debug("The Tweets in our input feature:")
        log.debug(processed_features_cmu['Tweets'])
        log.debug("\n")

    return processed_features_cmu


################################################################################################################

def make_predictions(trained_model):
    """
    Function makes predictions using the trained model passed as an argument.

    The prediction dataset is built with create_prediction_set(), the model predicts a label for every Tweet, and
    the number and percentage of 'social', 'economic' and 'environmental' predictions are logged.

    :param trained_model: a fitted model whose predict() method accepts the 'Tweets' column of the prediction set.
    :return: Nothing.
    """

    # Generate the dataset to be used for predictions.
    prediction_set = create_prediction_set()

    # Make predictions of the borg-slo-classifiers dataset.
    # Note to self: don't be an idiot and try to make predictions on the entire dataframe object instead of a column.
    predictions = trained_model.predict(prediction_set['Tweets'])

    # Store predictions in Pandas dataframe.
    results_df = pd.DataFrame(predictions)

    # Assign column names.
    results_df.columns = ['TBL_classification']

    if debug_make_predictions:
        log.debug("The shape of our prediction results dataframe:")
        log.debug(results_df.shape)
        log.debug("\n")
        log.debug("The contents of our prediction results dataframe:")
        log.debug(results_df.head())
        log.debug("\n")

    # Count # of each classifications made.
    # value_counts() tallies every label in one pass; a label that was never predicted simply counts as 0.
    label_counts = results_df['TBL_classification'].value_counts()
    social_counter = int(label_counts.get('social', 0))
    economic_counter = int(label_counts.get('economic', 0))
    environmental_counter = int(label_counts.get('environmental', 0))

    # Calculate percentages for each classification.
    total_predictions = results_df.shape[0]
    if total_predictions:
        social_percentage = (social_counter / total_predictions) * 100.0
        economic_percentage = (economic_counter / total_predictions) * 100.0
        environmental_percentage = (environmental_counter / total_predictions) * 100.0
    else:
        # An empty prediction set has no meaningful percentages; report 0 instead of dividing by zero.
        social_percentage = economic_percentage = environmental_percentage = 0.0

    # Display our statistics.
    # Note: the messages are built by concatenation on purpose - they contain a literal "%" that must not be
    # interpreted by the logging module's "msg % args" formatting.
    log.debug("The number of Tweets identified as social is :" + str(social_counter))
    log.debug("The % of Tweets identified as social in the entire dataset is: " + str(social_percentage))
    log.debug("The number of Tweets identified as economic is :" + str(economic_counter))
    log.debug("The % of Tweets identified as economic in the entire dataset is: " + str(economic_percentage))
    log.debug("The number of Tweets identified as environmental is :" + str(environmental_counter))
    log.debug("The % of Tweets identified as environmental in the entire dataset is: " + str(environmental_percentage))
    log.debug("\n")


################################################################################################################
def multinomial_naive_bayes_classifier_grid_search():
    """
    Function performs a exhaustive grid search to find the best hyper-parameters for use training the model.

    A randomized training/test split is created, every parameter combination is cross-validated (cv=5) on the
    training set, and the optimal parameters plus the resulting test set accuracy are logged.

    :return: Nothing.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.naive_bayes import MultinomialNB

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    multinomial_nb_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
    ])

    # What parameters do we search for?
    # Keys are "<pipeline step>__<parameter>".
    # Note: the alpha values run from 0.1 to 1.0 - the last entry used to be 0.10, which only repeated 0.1.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    }

    # Perform the grid search using all cores.
    # NOTE(review): "iid" is kept as originally written; newer scikit-learn releases may not accept it - confirm
    # against the installed version.
    multinomial_nb_clf = GridSearchCV(multinomial_nb_clf, parameters, cv=5, iid=False, n_jobs=-1)

    # Train and predict on optimal parameters found by Grid Search.
    multinomial_nb_clf.fit(tweet_train, target_train)
    multinomial_nb_predictions = multinomial_nb_clf.predict(tweet_test)

    if debug_pipeline:
        # View all the information stored in the model after training it.
        classifier_results = pd.DataFrame(multinomial_nb_clf.cv_results_)
        log.debug("The shape of the Multinomial Naive Bayes Classifier model's result data structure is:")
        log.debug(classifier_results.shape)
        log.debug("The contents of the Multinomial Naive Bayes Classifier model's result data structure is:")
        log.debug(classifier_results.head())

    # Display the optimal parameters.
    log.debug("The optimal parameters found for the Multinomial Naive Bayes Classifier is:")
    for param_name in sorted(parameters):
        log.debug("%s: %r", param_name, multinomial_nb_clf.best_params_[param_name])
    log.debug("\n")

    # Display the accuracy we obtained using the optimal parameters.
    log.debug("Accuracy using Multinomial Naive Bayes Classifier Grid Search is: ")
    log.debug(np.mean(multinomial_nb_predictions == target_test))
    log.debug("\n")


################################################################################################################
def multinomial_naive_bayes_classifier(iterations=1000):
    """
    Function trains a Multinomial Naive Bayes Classifier.

    The pipeline is re-fitted on a freshly randomized training/test split on every iteration and the test set
    accuracy is averaged over all iterations.  The model fitted in the final iteration is then passed to
    make_predictions().

    :param iterations: number of randomized training/test splits to average the accuracy over (default 1000).
    :return: none.
    :raises ValueError: if iterations is less than 1.
    """
    from sklearn.naive_bayes import MultinomialNB

    # At least one iteration is needed: the mean divides by it and make_predictions() needs a fitted model.
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    multinomial_nb_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
    ])

    # Predict n iterations and calculate mean accuracy.
    accuracy_total = 0.0
    for _ in range(iterations):

        # Create randomized training and test set using our dataset.
        create_training_and_test_set()

        multinomial_nb_clf.fit(tweet_train, target_train)
        multinomial_nb_predictions = multinomial_nb_clf.predict(tweet_test)

        # Calculate the accuracy of our predictions.
        accuracy = np.mean(multinomial_nb_predictions == target_test)

        if debug_classifier_iterations:
            # Measure accuracy.
            log.debug("\n")
            log.debug("Accuracy for test set predictions using Multinomial Naive Bayes Classifier:")
            log.debug(str(accuracy))
            log.debug("\n")

            log.debug("Multinomial Naive Bayes Classifier Metrics")
            log.debug(metrics.classification_report(target_test, multinomial_nb_predictions,
                                                    target_names=['economic', 'environmental', 'social']))

            log.debug("Multinomial Naive Bayes Classifier confusion matrix:")
            log.debug(metrics.confusion_matrix(target_test, multinomial_nb_predictions))

        accuracy_total += accuracy

    mean_accuracy = accuracy_total / iterations
    log.debug("Multinomial Naive Bayes Classifier:")
    log.debug("Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy))
    log.debug("\n")

    # Make predictions using trained model.
    log.debug("Prediction statistics using Multinomial Naive Bayes Classifier:")
    make_predictions(multinomial_nb_clf)


################################################################################################################

def sgd_classifier_grid_search():
    """
    Function performs a exhaustive grid search to find the best hyper-parameters for use training the model.

    A randomized training/test split is created, every parameter combination is cross-validated (cv=5) on the
    training set, and the optimal parameters plus the resulting test set accuracy are logged.

    :return: Nothing.
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    # Note: "n_iter" is no longer passed.  It was only ever set to None here, and scikit-learn releases that
    # dropped the argument reject it outright.
    sgd_classifier_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
                              n_iter_no_change=5, n_jobs=None, penalty='l2',
                              power_t=0.5, random_state=None, shuffle=True, tol=None,
                              validation_fraction=0.1, verbose=0, warm_start=False)),
    ])

    # What parameters do we search for?
    # Keys are "<pipeline step>__<parameter>".
    # NOTE(review): the alpha values skip 1e-4 (the value used in the pipeline above) - confirm this is intended.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-1, 1e-2, 1e-3, 0.00001, 0.000001),
    }

    # Perform the grid search using all cores.
    # NOTE(review): "iid" is kept as originally written; newer scikit-learn releases may not accept it - confirm
    # against the installed version.
    sgd_classifier_clf = GridSearchCV(sgd_classifier_clf, parameters, cv=5, iid=False, n_jobs=-1)

    # Train and predict on optimal parameters found by Grid Search.
    sgd_classifier_clf.fit(tweet_train, target_train)
    sgd_classifier_predictions = sgd_classifier_clf.predict(tweet_test)

    if debug_pipeline:
        # View all the information stored in the model after training it.
        classifier_results = pd.DataFrame(sgd_classifier_clf.cv_results_)
        log.debug("The shape of the SGD Classifier model's result data structure is:")
        log.debug(classifier_results.shape)
        log.debug("The contents of the SGD Classifier model's result data structure is:")
        log.debug(classifier_results.head())

    # Display the optimal parameters.
    log.debug("The optimal parameters found for the SGD Classifier is:")
    for param_name in sorted(parameters):
        log.debug("%s: %r", param_name, sgd_classifier_clf.best_params_[param_name])
    log.debug("\n")

    # Display the accuracy we obtained using the optimal parameters.
    log.debug("Accuracy using Stochastic Gradient Descent Classifier Grid Search is: ")
    log.debug(np.mean(sgd_classifier_predictions == target_test))
    log.debug("\n")


################################################################################################################
def sgd_classifier(iterations=1000):
    """
    Function trains a Stochastic Gradient Descent Classifier.

    The pipeline is re-fitted on a freshly randomized training/test split on every iteration and the test set
    accuracy is averaged over all iterations.  The model fitted in the final iteration is then passed to
    make_predictions().

    :param iterations: number of randomized training/test splits to average the accuracy over (default 1000).
    :return: none.
    :raises ValueError: if iterations is less than 1.
    """
    from sklearn.linear_model import SGDClassifier

    # At least one iteration is needed: the mean divides by it and make_predictions() needs a fitted model.
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Note: "n_iter" is no longer passed.  It was only ever set to None here, and scikit-learn releases that
    # dropped the argument reject it outright.
    sgd_classifier_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', SGDClassifier(alpha=0.1, average=False, class_weight=None,
                              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
                              n_iter_no_change=5, n_jobs=None, penalty='l2',
                              power_t=0.5, random_state=None, shuffle=True, tol=None,
                              validation_fraction=0.1, verbose=0, warm_start=False)),
    ])

    # Predict n iterations and calculate mean accuracy.
    accuracy_total = 0.0
    for _ in range(iterations):

        # Create randomized training and test set using our dataset.
        create_training_and_test_set()

        sgd_classifier_clf.fit(tweet_train, target_train)
        sgd_classifier_predictions = sgd_classifier_clf.predict(tweet_test)

        # Calculate the accuracy of our predictions.
        accuracy = np.mean(sgd_classifier_predictions == target_test)

        if debug_classifier_iterations:
            # Measure accuracy.
            log.debug("\n")
            log.debug("Accuracy for test set predictions using Stochastic Gradient Descent Classifier:")
            log.debug(str(accuracy))
            log.debug("\n")

            log.debug("SGD_classifier Classifier Metrics")
            log.debug(metrics.classification_report(target_test, sgd_classifier_predictions,
                                                    target_names=['economic', 'environmental', 'social']))

            log.debug("SGD_classifier confusion matrix:")
            log.debug(metrics.confusion_matrix(target_test, sgd_classifier_predictions))

        accuracy_total += accuracy

    mean_accuracy = accuracy_total / iterations
    log.debug("Stochastic Gradient Descent Classifier:")
    log.debug("Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy))
    log.debug("\n")

    # Make predictions using trained model.
    log.debug("Prediction statistics using Stochastic Gradient Descent Classifier:")
    make_predictions(sgd_classifier_clf)


################################################################################################################
def svm_support_vector_classification_grid_search():
    """
    Function runs an exhaustive grid search over the hyper-parameters of the Support Vector Classification pipeline.

    A randomized training/test split is created, every parameter combination is cross-validated (cv=5) on the
    training set, and the optimal parameters plus the resulting test set accuracy are logged.

    :return: Nothing.
    """
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    # Populate the global training and test sets with a fresh random split.
    create_training_and_test_set()

    # Candidate values per pipeline step, keyed as "<pipeline step>__<parameter>".
    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': ['scale', 'auto'],
        'clf__shrinking': (True, False),
        'clf__probability': (True, False),
        'clf__tol': [0.1, 0.01, 0.001, 0.0001, 0.00001],
        'clf__decision_function_shape': ['ovo', 'ovr'],
    }

    # Bag-of-words => tf-idf => SVC.  Parameters listed in the search space are overridden during the search.
    support_vector_classifier = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                                        decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
                                        max_iter=-1, probability=False, random_state=None, shrinking=True,
                                        tol=0.001, verbose=False)
    base_pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', support_vector_classifier),
    ])

    # Search the grid on all cores, then predict the test set with the fitted search object.
    grid_search = GridSearchCV(base_pipeline, search_space, cv=5, iid=False, n_jobs=-1)
    grid_search.fit(tweet_train, target_train)
    test_predictions = grid_search.predict(tweet_test)

    if debug_pipeline:
        # Dump everything the search recorded about each parameter combination.
        search_results = pd.DataFrame(grid_search.cv_results_)
        log.debug("The shape of the Support Vector Classification Classifier model's result data structure is:")
        log.debug(search_results.shape)
        log.debug("The contents of the Support Vector Classification Classifier model's result data structure is:")
        log.debug(search_results.head())

    # Report the winning value of every searched parameter, in alphabetical order.
    log.debug("The optimal parameters found for the Support Vector Classification Classifier is:")
    best_parameters = grid_search.best_params_
    for name in sorted(search_space):
        log.debug("%s: %r", name, best_parameters[name])
    log.debug("\n")

    # Report the fraction of test Tweets whose predicted label matches the true label.
    log.debug("Accuracy using Support Vector Classification Classifier Grid Search is: ")
    log.debug(np.mean(test_predictions == target_test))
    log.debug("\n")


################################################################################################################
def svm_support_vector_classification(iterations=1000):
    """
    Function trains a Support Vector Machine - Support Vector Classification Classifier.

    The pipeline is re-fitted on a freshly randomized training/test split on every iteration and the test set
    accuracy is averaged over all iterations.  The model fitted in the final iteration is then passed to
    make_predictions().

    :param iterations: number of randomized training/test splits to average the accuracy over (default 1000).
    :return: none.
    :raises ValueError: if iterations is less than 1.
    """
    from sklearn import svm

    # At least one iteration is needed: the mean divides by it and make_predictions() needs a fitted model.
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    svc_classifier_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', svm.SVC(C=0.9, cache_size=200, class_weight=None, coef0=0.0,
                        decision_function_shape='ovo', degree=3, gamma='scale', kernel='sigmoid',
                        max_iter=-1, probability=True, random_state=None, shrinking=True,
                        tol=0.01, verbose=False)),
    ])

    # Predict n iterations and calculate mean accuracy.
    accuracy_total = 0.0
    for _ in range(iterations):
        # Create randomized training and test set using our dataset.
        create_training_and_test_set()

        svc_classifier_clf.fit(tweet_train, target_train)
        svc_classifier_predictions = svc_classifier_clf.predict(tweet_test)

        # Calculate the accuracy of our predictions.
        accuracy = np.mean(svc_classifier_predictions == target_test)

        if debug_classifier_iterations:
            # Measure accuracy.
            log.debug("\n")
            log.debug("Accuracy for test set predictions using Support Vector Classification Classifier:")
            log.debug(str(accuracy))
            log.debug("\n")

            log.debug("SVC_classifier Metrics")
            log.debug(metrics.classification_report(target_test, svc_classifier_predictions,
                                                    target_names=['economic', 'environmental', 'social']))

            log.debug("SVC_classifier confusion matrix:")
            log.debug(metrics.confusion_matrix(target_test, svc_classifier_predictions))

        accuracy_total += accuracy

    mean_accuracy = accuracy_total / iterations
    log.debug("Support Vector Classification Classifier:")
    log.debug("Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy))
    log.debug("\n")

    # Make predictions using trained model.
    log.debug("Prediction statistics using Support Vector Classification Classifier:")
    make_predictions(svc_classifier_clf)


################################################################################################################
def svm_linear_support_vector_classification_grid_search():
    """
    Function runs an exhaustive grid search over the hyper-parameters of the Linear Support Vector Classification
    pipeline.

    A randomized training/test split is created, every parameter combination is cross-validated (cv=5) on the
    training set, and the optimal parameters plus the resulting test set accuracy are logged.

    :return: Nothing.
    """
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    # Populate the global training and test sets with a fresh random split.
    create_training_and_test_set()

    # Candidate values per pipeline step, keyed as "<pipeline step>__<parameter>".
    # Note: 'clf__dual' is not part of the search.
    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__penalty': ['l2'],
        'clf__loss': ['squared_hinge'],
        'clf__multi_class': ['ovr', 'crammer_singer'],
        'clf__tol': [0.1, 0.01, 0.001, 0.0001, 0.00001],
        'clf__fit_intercept': (True, False),
        'clf__max_iter': [500, 1000, 1500, 2000],
    }

    # Bag-of-words => tf-idf => LinearSVC.  Parameters listed in the search space are overridden during the search.
    linear_support_vector_classifier = svm.LinearSVC(C=0.7, class_weight=None, dual=True, fit_intercept=True,
                                                     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                                                     multi_class='ovr', penalty='l2', random_state=None,
                                                     tol=0.0001, verbose=0)
    base_pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', linear_support_vector_classifier),
    ])

    # Search the grid on all cores, then predict the test set with the fitted search object.
    grid_search = GridSearchCV(base_pipeline, search_space, cv=5, iid=False, n_jobs=-1)
    grid_search.fit(tweet_train, target_train)
    test_predictions = grid_search.predict(tweet_test)

    if debug_pipeline:
        # Dump everything the search recorded about each parameter combination.
        search_results = pd.DataFrame(grid_search.cv_results_)
        log.debug("The shape of the Linear Support Vector Classification Classifier model's result data structure is:")
        log.debug(search_results.shape)
        log.debug(
            "The contents of the Linear Support Vector Classification Classifier model's result data structure is:")
        log.debug(search_results.head())

    # Report the winning value of every searched parameter, in alphabetical order.
    log.debug("The optimal parameters found for the Linear Support Vector Classification Classifier is:")
    best_parameters = grid_search.best_params_
    for name in sorted(search_space):
        log.debug("%s: %r", name, best_parameters[name])
    log.debug("\n")

    # Report the fraction of test Tweets whose predicted label matches the true label.
    log.debug("Accuracy using Linear Support Vector Classification Classifier Grid Search is: ")
    log.debug(np.mean(test_predictions == target_test))
    log.debug("\n")


################################################################################################################
def svm_linear_support_vector_classification(iterations=1000):
    """
    Function trains a Support Vector Machine - Linear Support Vector Classification Classifier.

    The pipeline is re-fitted on a freshly randomized training/test split on every iteration and the test set
    accuracy is averaged over all iterations.  The model fitted in the final iteration is then passed to
    make_predictions().

    :param iterations: number of randomized training/test splits to average the accuracy over (default 1000).
    :return: none.
    :raises ValueError: if iterations is less than 1.
    """
    from sklearn import svm

    # At least one iteration is needed: the mean divides by it and make_predictions() needs a fitted model.
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    linear_svc_classifier_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', svm.LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
                              intercept_scaling=1, loss='squared_hinge', max_iter=2000,
                              multi_class='ovr', penalty='l2', random_state=None, tol=0.1,
                              verbose=0)),
    ])

    # Predict n iterations and calculate mean accuracy.
    accuracy_total = 0.0
    for _ in range(iterations):
        # Create randomized training and test set using our dataset.
        create_training_and_test_set()

        linear_svc_classifier_clf.fit(tweet_train, target_train)
        linear_svc_classifier_predictions = linear_svc_classifier_clf.predict(tweet_test)

        # Calculate the accuracy of our predictions.
        accuracy = np.mean(linear_svc_classifier_predictions == target_test)

        if debug_classifier_iterations:
            # Measure accuracy.
            log.debug("\n")
            log.debug("Accuracy for test set predictions using Linear Support Vector Classification Classifier:")
            log.debug(str(accuracy))
            log.debug("\n")

            log.debug("LinearSVC_classifier Metrics")
            log.debug(metrics.classification_report(target_test, linear_svc_classifier_predictions,
                                                    target_names=['economic', 'environmental', 'social']))

            log.debug("LinearSVC_classifier confusion matrix:")
            log.debug(metrics.confusion_matrix(target_test, linear_svc_classifier_predictions))

        accuracy_total += accuracy

    mean_accuracy = accuracy_total / iterations
    log.debug("Linear Support Vector Classification Classifier:")
    log.debug("Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy))
    log.debug("\n")

    # Make predictions using trained model.
    log.debug("Prediction statistics using Linear Support Vector Classification Classifier:")
    make_predictions(linear_svc_classifier_clf)


################################################################################################################
def nearest_kneighbor_classifier_grid_search():
    """
    Function performs an exhaustive grid search to find the best hyper-parameters for the KNeighbor Classifier.

    A new randomized training/test split is created, every hyper-parameter combination is cross-validated
    (5 folds) on the training set, and the best combination plus its accuracy on the test set are logged.

    The grid deliberately omits 'clf__p' and 'clf__leaf_size':
        - 'p' is the power parameter of the Minkowski metric and has no effect when the metric is given
          explicitly as 'euclidean' or 'manhattan'.
        - 'leaf_size' only applies to the BallTree/KDTree algorithms, and scikit-learn falls back to brute
          force when fitted on sparse input such as the matrix produced by the vectorizer steps.
    Searching over them multiplied the number of fits by 16 without being able to change any score.

    :return: Nothing.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    k_neighbor_classifier_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier(n_neighbors=3, n_jobs=-1)),
    ])

    # What parameters do we search for?  Keys follow the "<pipeline step>__<parameter>" convention.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__n_neighbors': [10, 15, 20, 25, 30],
        'clf__weights': ['uniform', 'distance'],
        'clf__algorithm': ['auto'],
        'clf__metric': ['euclidean', 'manhattan'],
    }

    # Perform the grid search using all cores.
    # NOTE(review): `iid` is only accepted by older scikit-learn releases - confirm against the installed version.
    k_neighbor_classifier_clf = GridSearchCV(k_neighbor_classifier_clf, parameters, cv=5, iid=False, n_jobs=-1)

    # Train and predict on optimal parameters found by Grid Search.
    k_neighbor_classifier_clf.fit(tweet_train, target_train)
    k_neighbor_classifier_predictions = k_neighbor_classifier_clf.predict(tweet_test)

    if debug_pipeline:
        # View all the information stored in the model after training it.
        classifier_results = pd.DataFrame(k_neighbor_classifier_clf.cv_results_)
        log.debug("The shape of the KNeighbor Classifier model's result data structure is:")
        log.debug(classifier_results.shape)
        log.debug(
            "The contents of the  KNeighbor Classifier model's result data structure is:")
        log.debug(classifier_results.head())

    # Display the optimal parameters.
    log.debug("The optimal parameters found for the KNeighbor Classifier is:")
    best_parameters = k_neighbor_classifier_clf.best_params_
    for param_name in sorted(parameters):
        log.debug("%s: %r", param_name, best_parameters[param_name])
    log.debug("\n")

    # Display the accuracy we obtained using the optimal parameters.
    log.debug("Accuracy using KNeighbor Classifier Grid Search is: ")
    log.debug(np.mean(k_neighbor_classifier_predictions == target_test))
    log.debug("\n")


################################################################################################################
def nearest_kneighbor_classifier():
    """
    Function trains and evaluates a Nearest Neighbor - KNeighbor Classifier pipeline.

    The pipeline is re-fitted on a newly randomized training/test split for a fixed number of iterations and
    the mean test-set accuracy over all iterations is logged.  The pipeline as fitted in the final iteration
    is then passed to make_predictions().

    :return: none.
    """
    from sklearn import neighbors

    knn_estimator = neighbors.KNeighborsClassifier(n_neighbors=30, weights='uniform', algorithm='auto',
                                                   leaf_size=10, p=1, metric='euclidean', n_jobs=-1)
    k_neighbor_classifier_clf = Pipeline(steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', knn_estimator),
    ])

    # Sum up the accuracy of every iteration so the mean can be reported afterwards.
    iterations = 1000
    accuracy_total = 0.0
    for _ in range(iterations):
        # Each pass works on a new random split of the dataset.
        create_training_and_test_set()

        k_neighbor_classifier_clf.fit(tweet_train, target_train)
        predicted_topics = k_neighbor_classifier_clf.predict(tweet_test)

        # Fraction of test-set predictions that equal the expected target.
        accuracy = np.mean(predicted_topics == target_test)
        accuracy_total += accuracy

        if not debug_classifier_iterations:
            continue

        # Per-iteration diagnostics: accuracy, classification report and confusion matrix.
        log.debug("\n")
        log.debug("Accuracy for test set predictions using KNeighbor Classifier:")
        log.debug(str(accuracy))
        log.debug("\n")

        log.debug("KNeighbor_classifier Metrics")
        report = metrics.classification_report(target_test, predicted_topics,
                                               target_names=['economic', 'environmental', 'social'])
        log.debug(report)

        log.debug("KNeighbor_classifier confusion matrix:")
        log.debug(metrics.confusion_matrix(target_test, predicted_topics))

    mean_accuracy = accuracy_total / iterations
    summary = "Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy)
    log.debug("KNeighbor Classifier:")
    log.debug(summary)
    log.debug("\n")

    # Hand the pipeline (as fitted on the last split) over for prediction statistics.
    log.debug("Prediction statistics using KNeighbor Classifier:")
    make_predictions(k_neighbor_classifier_clf)


################################################################################################################
def decision_tree_classifier_grid_search():
    """
    Function runs an exhaustive grid search over the hyper-parameters of the Decision Tree Classifier pipeline.

    Steps: build a new randomized training/test split, cross-validate (5 folds) every combination of the grid
    on the training data, then log the winning combination and its accuracy on the test data.

    :return: Nothing.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    base_pipeline = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier()),
    ])

    # Candidate values per parameter; keys follow the "<pipeline step>__<parameter>" convention.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': [None],
        'clf__min_samples_split': [2, 3, 4],
        'clf__min_samples_leaf': [1, 2, 3, 4],
        'clf__min_weight_fraction_leaf': [0],
        'clf__max_features': [None, 'sqrt', 'log2'],
        'clf__max_leaf_nodes': [None, 2, 3, 4],
        'clf__min_impurity_decrease': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    }

    # Search the grid on all cores (n_jobs=-1).
    # NOTE(review): `iid` is only accepted by older scikit-learn releases - confirm against the installed version.
    grid_search = GridSearchCV(base_pipeline, parameters, cv=5, iid=False, n_jobs=-1)

    # Fit on the training split, then predict the test split with the search object.
    grid_search.fit(tweet_train, target_train)
    test_predictions = grid_search.predict(tweet_test)

    if debug_pipeline:
        # Dump the cross-validation bookkeeping collected during the search.
        cv_table = pd.DataFrame(grid_search.cv_results_)
        log.debug("The shape of the Decision Tree Classifier model's result data structure is:")
        log.debug(cv_table.shape)
        log.debug("The contents of the Decision Tree Classifier model's result data structure is:")
        log.debug(cv_table.head())

    # Report the best value found for every searched parameter, in alphabetical order.
    log.debug("The optimal parameters found for the Decision Tree Classifier is:")
    winning_values = grid_search.best_params_
    for name in sorted(parameters):
        log.debug("%s: %r", name, winning_values[name])
    log.debug("\n")

    # Report the test-set accuracy obtained with those parameters.
    log.debug("Accuracy using Decision Tree Classifier Grid Search is: ")
    log.debug(np.mean(test_predictions == target_test))
    log.debug("\n")


################################################################################################################
def decision_tree_classifier():
    """
    Function trains and evaluates a Decision Tree Classifier pipeline.

    The pipeline is re-fitted on a newly randomized training/test split for a fixed number of iterations and
    the mean test-set accuracy over all iterations is logged.  The pipeline as fitted in the final iteration
    is then passed to make_predictions().

    :return: none.
    """
    from sklearn.tree import DecisionTreeClassifier

    tree_estimator = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2,
                                            min_samples_leaf=1, min_weight_fraction_leaf=0,
                                            max_features=None, max_leaf_nodes=3, min_impurity_decrease=1e-5)
    decision_tree_classifier_clf = Pipeline(steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', tree_estimator),
    ])

    # Sum up the accuracy of every iteration so the mean can be reported afterwards.
    iterations = 1000
    accuracy_total = 0.0
    for _ in range(iterations):
        # Each pass works on a new random split of the dataset.
        create_training_and_test_set()

        decision_tree_classifier_clf.fit(tweet_train, target_train)
        predicted_topics = decision_tree_classifier_clf.predict(tweet_test)

        # Fraction of test-set predictions that equal the expected target.
        accuracy = np.mean(predicted_topics == target_test)
        accuracy_total += accuracy

        if not debug_classifier_iterations:
            continue

        # Per-iteration diagnostics: accuracy, classification report and confusion matrix.
        log.debug("\n")
        log.debug("Accuracy for test set predictions using Decision Tree Classifier:")
        log.debug(str(accuracy))
        log.debug("\n")

        log.debug("DecisionTree_classifier Metrics")
        report = metrics.classification_report(target_test, predicted_topics,
                                               target_names=['economic', 'environmental', 'social'])
        log.debug(report)

        log.debug("DecisionTree_classifier confusion matrix:")
        log.debug(metrics.confusion_matrix(target_test, predicted_topics))

    mean_accuracy = accuracy_total / iterations
    summary = "Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy)
    log.debug("Decision Tree Classifier:")
    log.debug(summary)
    log.debug("\n")

    # Hand the pipeline (as fitted on the last split) over for prediction statistics.
    log.debug("Prediction statistics using Decision Tree Classifier:")
    make_predictions(decision_tree_classifier_clf)


################################################################################################################
def multi_layer_perceptron_classifier_grid_search():
    """
    Function runs an exhaustive grid search over the hyper-parameters of the Multi Layer Perceptron pipeline.

    Steps: build a new randomized training/test split, cross-validate (5 folds) every combination of the grid
    on the training data, then log the winning combination and its accuracy on the test data.

    :return: Nothing.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.neural_network import MLPClassifier

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    # Starting configuration of the network; the grid below overrides some of these values per candidate.
    mlp_settings = {
        'activation': 'logistic',
        'alpha': 1e-1,
        'batch_size': 'auto',
        'beta_1': 0.9,
        'beta_2': 0.999,
        'early_stopping': True,
        'epsilon': 1e-08,
        'hidden_layer_sizes': (5, 2),
        'learning_rate': 'constant',
        'learning_rate_init': 1e-1,
        'max_iter': 1000,
        'momentum': 0.9,
        'n_iter_no_change': 10,
        'nesterovs_momentum': True,
        'power_t': 0.5,
        'random_state': 1,
        'shuffle': True,
        'solver': 'sgd',
        'tol': 0.0001,
        'validation_fraction': 0.1,
        'verbose': False,
        'warm_start': False,
    }
    base_pipeline = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MLPClassifier(**mlp_settings)),
    ])

    # Candidate values per parameter; keys follow the "<pipeline step>__<parameter>" convention.
    # Commented-out entries are further candidates that are currently excluded from the search.
    parameters = {
        'vect__ngram_range': [(1, 1)],
        # 'tfidf__use_idf': (True, False),
        # 'clf__hidden_layer_sizes': [(15, 15), (50, 50)],
        'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'clf__solver': ['lbfgs', 'sgd', 'adam'],
        'clf__alpha': [1e-1, 1e-2, 1e-4, 1e-6, 1e-8],
        # 'clf__batch_size': [5, 10, 20, 40, 80, 160],
        'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'clf__learning_rate_init': [1e-1, 1e-3, 1e-5],
        # 'clf__power_t': [0.1, 0.25, 0.5, 0.75, 1.0],
        # 'clf__max_iter': [200, 400, 800, 1600],
        # 'clf__shuffle': [True, False],
        # 'clf__tol': [1e-1, 1e-2, 1e-4, 1e-6, 1e-8],
        # 'clf__momentum': [0.1, 0.3, 0.6, 0.9],
        # 'clf__nesterovs_momentum': [True, False],
        # 'clf__early_stopping': [True, False],
        # 'clf__validation_fraction': [0.1, 0.2, 0.4],
        # 'clf__beta_1': [0.1, 0.2, 0.4, 0.6, 0.8],
        # 'clf__beta_2': [0.1, 0.2, 0.4, 0.6, 0.8],
        # 'clf__epsilon': [1e-1, 1e-2, 1e-4, 1e-8],
        # 'clf__n_iter_no_change': [1, 2, 4, 8, 16]
    }

    # Search the grid on all cores (n_jobs=-1).
    # NOTE(review): `iid` is only accepted by older scikit-learn releases - confirm against the installed version.
    grid_search = GridSearchCV(base_pipeline, parameters, cv=5, iid=False, n_jobs=-1)

    # Fit on the training split, then predict the test split with the search object.
    grid_search.fit(tweet_train, target_train)
    test_predictions = grid_search.predict(tweet_test)

    if debug_pipeline:
        # Dump the cross-validation bookkeeping collected during the search.
        cv_table = pd.DataFrame(grid_search.cv_results_)
        log.debug("The shape of the Multi Layer Perceptron Neural Network Classifier model's result data structure is:")
        log.debug(cv_table.shape)
        log.debug(
            "The contents of the Multi Layer Perceptron Neural Network Classifier model's result data structure is:")
        log.debug(cv_table.head())

    # Report the best value found for every searched parameter, in alphabetical order.
    log.debug("The optimal parameters found for the Multi Layer Perceptron Neural Network Classifier is:")
    winning_values = grid_search.best_params_
    for name in sorted(parameters):
        log.debug("%s: %r", name, winning_values[name])
    log.debug("\n")

    # Report the test-set accuracy obtained with those parameters.
    log.debug("Accuracy using Multi Layer Perceptron Neural Network Classifier Grid Search is: ")
    log.debug(np.mean(test_predictions == target_test))
    log.debug("\n")


################################################################################################################
def multi_layer_perceptron_classifier():
    """
    Function trains a Multi Layer Perceptron Neural Network Classifier.

    The pipeline is re-fitted on a newly randomized training/test split for a fixed number of iterations and
    the mean test-set accuracy over all iterations is logged.  The pipeline as fitted in the final iteration
    is then passed to make_predictions().

    :return: none.
    """
    from sklearn.neural_network import MLPClassifier

    mlp_classifier_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MLPClassifier(activation='identity', alpha=1e-1, batch_size='auto',
                              beta_1=0.9, beta_2=0.999, early_stopping=True,
                              epsilon=1e-08, hidden_layer_sizes=(5, 2),
                              learning_rate='constant', learning_rate_init=1e-1,
                              max_iter=1000, momentum=0.9, n_iter_no_change=10,
                              nesterovs_momentum=True, power_t=0.5, random_state=1,
                              shuffle=True, solver='lbfgs', tol=0.1,
                              validation_fraction=0.1, verbose=False, warm_start=False)),
    ])

    # Predict n iterations and calculate mean accuracy.
    mean_accuracy = 0.0
    iterations = 1000
    for _ in range(iterations):
        # Create randomized training and test set using our dataset.
        create_training_and_test_set()

        # Disabled experiment: explicit feature scaling before training.
        # from sklearn.preprocessing import StandardScaler
        # scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        # scaler.fit(tweet_train)
        # tweet_train_scaled = scaler.transform(tweet_train)
        # tweet_test_scaled = scaler.transform(tweet_test)

        mlp_classifier_clf.fit(tweet_train, target_train)
        mlp_classifier_predictions = mlp_classifier_clf.predict(tweet_test)

        # Calculate the accuracy of our predictions (fraction of predictions equal to the expected target).
        accuracy = np.mean(mlp_classifier_predictions == target_test)

        if debug_classifier_iterations:
            # Per-iteration diagnostics: accuracy, classification report and confusion matrix.
            log.debug("\n")
            log.debug("Accuracy for test set predictions using Multi Layer Perceptron Neural Network Classifier:")
            log.debug(str(accuracy))
            log.debug("\n")

            log.debug("MLP_classifier Metrics")
            log.debug(metrics.classification_report(target_test, mlp_classifier_predictions,
                                                    target_names=['economic', 'environmental', 'social']))

            log.debug("MLP_classifier confusion matrix:")
            log.debug(metrics.confusion_matrix(target_test, mlp_classifier_predictions))

        mean_accuracy += accuracy

    mean_accuracy = mean_accuracy / iterations
    log.debug("Multi Layer Perceptron Neural Network Classifier:")
    log.debug("Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy))
    log.debug("\n")

    # Make predictions using trained model (as fitted on the last split).
    log.debug("Prediction statistics using Multi Layer Perceptron Neural Network Classifier:")
    make_predictions(mlp_classifier_clf)


################################################################################################################
def logistic_regression_classifier_grid_search():
    """
    Function runs an exhaustive grid search over the hyper-parameters of the Logistic Regression pipeline.

    Steps: build a new randomized training/test split, cross-validate (5 folds) every combination of the grid
    on the training data, then log the winning combination and its accuracy on the test data.

    :return: Nothing.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # Create randomized training and test set using our dataset.
    create_training_and_test_set()

    starting_estimator = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', n_jobs=-1)
    base_pipeline = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', starting_estimator),
    ])

    # Candidate values per parameter; keys follow the "<pipeline step>__<parameter>" convention.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'clf__penalty': ['l2'],
        'clf__tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
        'clf__C': [0.2, 0.4, 0.6, 0.8, 1.0],
        'clf__fit_intercept': [True, False],
        'clf__class_weight': ['balanced', None],
        'clf__solver': ['saga', 'newton-cg', 'sag', 'lbfgs'],
        'clf__max_iter': [2000, 4000, 8000, 16000],
        'clf__multi_class': ['ovr', 'multinomial'],
    }

    # Search the grid on all cores (n_jobs=-1).
    # NOTE(review): `iid` is only accepted by older scikit-learn releases - confirm against the installed version.
    grid_search = GridSearchCV(base_pipeline, parameters, cv=5, iid=False, n_jobs=-1)

    # Fit on the training split, then predict the test split with the search object.
    grid_search.fit(tweet_train, target_train)
    test_predictions = grid_search.predict(tweet_test)

    if debug_pipeline:
        # Dump the cross-validation bookkeeping collected during the search.
        cv_table = pd.DataFrame(grid_search.cv_results_)
        log.debug("The shape of the  Logistic Regression Classifier model's result data structure is:")
        log.debug(cv_table.shape)
        log.debug("The contents of the Logistic Regression Classifier model's result data structure is:")
        log.debug(cv_table.head())

    # Report the best value found for every searched parameter, in alphabetical order.
    log.debug("The optimal parameters found for the Logistic Regression Classifier is:")
    winning_values = grid_search.best_params_
    for name in sorted(parameters):
        log.debug("%s: %r", name, winning_values[name])
    log.debug("\n")

    # Report the test-set accuracy obtained with those parameters.
    log.debug("Accuracy using Logistic Regression Classifier Grid Search is: ")
    log.debug(np.mean(test_predictions == target_test))
    log.debug("\n")


################################################################################################################
def logistic_regression_classifier():
    """
    Function trains and evaluates a Logistic Regression Classifier pipeline.

    The pipeline is re-fitted on a newly randomized training/test split for a fixed number of iterations and
    the mean test-set accuracy over all iterations is logged.  The pipeline as fitted in the final iteration
    is then passed to make_predictions().

    :return: none.
    """
    from sklearn import linear_model

    regression_estimator = linear_model.LogisticRegression(penalty='l2', tol=1e-1, C=1.0, fit_intercept=False,
                                                           class_weight=None, solver='sag', max_iter=2000,
                                                           multi_class='ovr')
    logistic_regression_classifier_clf = Pipeline(steps=[
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', regression_estimator),
    ])

    # Sum up the accuracy of every iteration so the mean can be reported afterwards.
    iterations = 1000
    accuracy_total = 0.0
    for _ in range(iterations):
        # Each pass works on a new random split of the dataset.
        create_training_and_test_set()

        logistic_regression_classifier_clf.fit(tweet_train, target_train)
        predicted_topics = logistic_regression_classifier_clf.predict(tweet_test)

        # Fraction of test-set predictions that equal the expected target.
        accuracy = np.mean(predicted_topics == target_test)
        accuracy_total += accuracy

        if not debug_classifier_iterations:
            continue

        # Per-iteration diagnostics: accuracy, classification report and confusion matrix.
        log.debug("\n")
        log.debug("Accuracy for test set predictions using Logistic Regression Classifier:")
        log.debug(str(accuracy))
        log.debug("\n")

        log.debug("LogisticRegression_classifier Metrics")
        report = metrics.classification_report(target_test, predicted_topics,
                                               target_names=['economic', 'environmental', 'social'])
        log.debug(report)

        log.debug("LogisticRegression_classifier confusion matrix:")
        log.debug(metrics.confusion_matrix(target_test, predicted_topics))

    mean_accuracy = accuracy_total / iterations
    summary = "Mean accuracy over " + str(iterations) + " iterations is: " + str(mean_accuracy)
    log.debug("Logistic Regression Classifier:")
    log.debug(summary)
    log.debug("\n")

    # Hand the pipeline (as fitted on the last split) over for prediction statistics.
    log.debug("Prediction statistics using Logistic Regression Classifier:")
    make_predictions(logistic_regression_classifier_clf)


################################################################################################################
def keras_deep_neural_network():
    """
    Placeholder for a Keras Deep Neural Network Model.

    TODO - most likely won't be implemented until summer research with much larger labeled datasets.

    The function is intentionally a no-op.  The Keras imports (Sequential, layers) are deliberately
    left out until there is an actual model to build, so that calling this stub does not require
    Keras to be installed.

    :return: none.
    """
    return None


################################################################################################################

############################################################################################
"""
Program entry point.  Runs each pipelined classifier training function in turn and, when
pipeline debugging is enabled, logs the total elapsed wall-clock time.
"""

# Debug variable.
debug_main = 0

if __name__ == '__main__':

    import time

    # Training functions in execution order.  Un-comment a grid search variant (or the
    # non-pipelined Naive Bayes function) to include it in the run.
    classifier_runs = (
        # scikit_learn_multinomialnb_classifier_non_pipeline,
        # multinomial_naive_bayes_classifier_grid_search,
        multinomial_naive_bayes_classifier,
        # sgd_classifier_grid_search,
        sgd_classifier,
        # svm_support_vector_classification_grid_search,
        svm_support_vector_classification,
        # svm_linear_support_vector_classification_grid_search,
        svm_linear_support_vector_classification,
        # nearest_kneighbor_classifier_grid_search,
        nearest_kneighbor_classifier,
        # decision_tree_classifier_grid_search,
        decision_tree_classifier,
        # multi_layer_perceptron_classifier_grid_search,
        multi_layer_perceptron_classifier,
        # logistic_regression_classifier_grid_search,
        logistic_regression_classifier,
    )

    start_time = time.time()
    for run_classifier in classifier_runs:
        run_classifier()
    end_time = time.time()

    # Only report the timing when pipeline debugging is switched on.
    if debug_pipeline:
        total_time = end_time - start_time
        log.debug("The time taken to train the classifier(s) is:")
        log.debug(str(total_time))
        log.debug("\n")

    # For debug purposes.
    # my_set = create_prediction_set()

############################################################################################


DEBUG:root:Multinomial Naive Bayes Classifier:
DEBUG:root:Mean accuracy over 1000 iterations is: 0.49251086956521645
DEBUG:root:

DEBUG:root:Prediction statistics using Multinomial Naive Bayes Classifier:
DEBUG:root:The number of Tweets identified as social is :648413
DEBUG:root:The % of Tweets identified as social in the entire dataset is: 98.3961625658971
DEBUG:root:The number of Tweets identified as economic is :10569
DEBUG:root:The % of Tweets identified as economic in the entire dataset is: 1.6038374341029042
DEBUG:root:The number of Tweets identified as environmental is :0
DEBUG:root:The % of Tweets identified as environmental in the entire dataset is: 0.0
DEBUG:root:

DEBUG:root:Stochastic Gradient Descent Classifier:
DEBUG:root:Mean accuracy over 1000 iterations is: 0.49178260869565205
DEBUG:root:

DEBUG:root:Prediction statistics using Stochastic Gradient Descent Classifier:
DEBUG:root:The number of Tweets identified as social is :658982
DEBUG:root:The % of Tweets identified a

# Final Project Proposal - Draft 2 - Preliminary Draft of Final Paper:

Social License to Operate Triple-Bottom-Line Tweet Classification</p>


The application domain is the Triple-Bottom-Line (TBL) classification of Tweets in the context of the Social License to Operate (SLO) of mining companies.  The objective of this project is to continue and extend the earlier work on Tweet TBL classification done at CSIRO – the Commonwealth Scientific and Industrial Research Organization (Australia’s National Science Agency).  The goal is to set up a prototype machine learning system that is capable of identifying the topic classification of a Tweet as either Environmental, Social, or Economic.  The initial milestone is to achieve at an absolute minimum a 50% accuracy metric or higher, indicating the ability to at least guess on par with a flip of a coin.</p>
	
    
The Social License to Operate is defined as when an existing project has the ongoing approval of the local community and other stakeholders within the domain the project operates in.  It is the ongoing social acceptance of that project in regards to a favorable or dis-favorable disposition by those who are concerned.  The SLO must not only be earned but also maintained as the beliefs, opinions, and perceptions of people tend to be dynamic over the course of time.  It is beneficial to the project owners and managers to maintain an agreeable relationship with the local population and their stakeholders.</p>
	
    
The Triple Bottom Line is defined as a framework where organizations and companies dedicate themselves not only to profit but also the social and environmental impact of their operation.  The phrase was coined by the British management consultant John Elkington as a metric to measure the performance of corporate America.  According to Investopedia, the corporate business should be done according to:</p>


Profit – the traditional measure of corporate profit – the profit and loss (P & L) account.</p>

People – the measure of how socially responsible an organization has been throughout its operations.</p>

Planet – the measure of how environmentally responsible a firm has been.</p>

These are the three elements of TBL which are then sourced into the terms Economy (profit), Environmental (planet), and Social (people).</p>
	
    
Twitter data (Tweets) can be obtained in 4 distinct ways – retrieval from the Twitter public API, use of an existing Twitter dataset, purchase from Twitter directly, or purchase access from a 3rd party Twitter service provider.  For the purposes of this project, we will be using existing Twitter datasets provided by Professor VanderLinden via access to Calvin College’s Borg supercomputer.  Specifically, we will be using a training set consisting of crowdsourced Triple Bottom Line labeled Tweets used by CSIRO in their preliminary topic classification research.  For the test set, we will be using a small dataset consisting of TBL labeled Tweets hand-labeled by Professor VanderLinden.  With the machine learning model trained on these two sets, we will then generalize the model to make predictions on the dataset used for stance classification of Tweets in earlier research by Professor VanderLinden and Roy Adams.</p>
	
    
As our research is a continuation of prior research from CSIRO and based on the foundation laid by Professor VanderLinden’s “Machine Learning for Social Media” project, we see no reason to not use machine learning.  While we might consider symbolic artificial intelligence (GOFAI – Good, Old-Fashioned AI), we learned in CS-344 that symbolic reasoning implementations resulted in rules engines, also known as expert systems or knowledge graphs.  These proved to be too brittle and became unmanageable as the knowledge base grew beyond a few thousand rules.  Considering the nature of Tweets, GOFAI seems not to be a viable solution.  The language of Tweets is often informal, prone to slang, misuse of established grammatical rules, and in general a chimeric bastardization of known human languages (insofar in my experience).  It is doubtful a purely symbolic AI would be computationally feasible.  Perhaps as Professor VanderLinden mentioned, a hybrid A.I. combining symbolic reasoning and deep neural networks is the future of A.I. and would prove to be a feasible approach.</p>


Preliminary analysis of the two provided datasets indicates that they will require significant pre-processing before becoming useable as input features for machine learning.  The Tweets are stored as comma delimited CSV files.  The training dataset consists of 299 total Tweets, of which 198 are unlabeled due to not being associated with any TBL classification.  The test dataset consists of 31 hand-labeled Tweets.  Based on the size of the datasets we are working with neural networks may not be the best choice to start with.  Neural networks typically require larger datasets in order to train and as we barely have 330 total examples to work with, the results may be less than optimal.  Therefore, we will start with Bayesian models and SVM’s – Support Vector Machines.  Later, we will expand to using supervised neural networks just to see if we can tune hyperparameters to obtain results closely comparable to our non-NN models.</p>
	 
     
For fast prototyping, we will be using Scikit-Learn in Python rather than Keras or straight Tensorflow, at least until we have established which baseline supervised learning algorithm will provide us with the potential for the best results.  We will also use Pandas, built on NumPy, for data-frame manipulation and matplotlib for visualizations.  To encode our categorical Tweet data into useable numerical Tweet data, we will be using the tools provided by Scikit-Learn.</p>
	
    
Our Bayesian model will be the MultinomialNB classifier that implements the naïve Bayes algorithm for multinomially distributed data.  Scikit-Learn.org indicates that it is one of the two classic Naïve Bayes variants used in textual classification problems.  This indicates it will be an excellent starting point as we have decided our two datasets are too small to initially warrant the use of a supervised neural network training algorithm.  “Naïve” in this case indicates the application of Bayes’ theorem with the “naïve” assumption of conditional independence between every pair of features given the value of the class variable (4).  Further information indicates the classifier performs fast and works in many real-world applications, including document classification and spam filtering.  We built a spam filter based on Paul Graham’s “A Plan for Spam” and indeed it worked well.</p> 
	
    
Our SVM classification model will be the LinearSVC Classifier– Linear Support Vector Classification.  Sci-Kit Learn indicates it is effective in high dimensional spaces and when the number of dimensions is greater than the number of samples.  This will be the case for us as we have a limited 330 samples and after multi-hot encoding to form a feature vector to create a bag-of-words vocabulary, our dimensionality is bound to be pretty high in comparison to the samples.  The memory efficiency of this algorithm should also help as we will no doubt have sparse vectors in comparison to the total vocabulary present across all of the Tweets.  Of note, is that SVM algorithms are not scaling invariant, so data scaling is required, which will matter in our case as encoding our categorical word data will result in word occurrence values for the input feature vector (unless we choose to simply represent as binary: 0 – word not present and 1- word is present). API documentation indicates that the classifier supports sparse input (good for us) and supports multi-class using the one-vs-the-rest scheme.</p>
	
    
Our deep neural network will be the MLP Classifier – multi-layer perceptron.  Scikit-Learn indicates it uses a Softmax layer as the output function to perform multi-class classification and uses the cross-entropy loss function.  MLP also supports multi-label classification through use of the logistic activation function where values > 0.5 → 1 and values < 0.5 → 0.  Given this, it would be possible for us to perform multi-class multi-label TBL classification on our training dataset.  Our training dataset does possess Tweets that have been given multiple topic classifications, although some are redundant duplicates of either economic, social, or environmental.  We will leave this possibility for the future, time permitting.  Effective use of the MLP classifier would most likely require us to hand-label additional training examples from the larger Twitter datasets present on Calvin’s Borg supercomputer.  Crowdsourcing does not seem a viable option so this task would be tediously time-consuming.</p>
	
    
The application of machine learning to Social License to Operate on Triple-Bottom-Line topic classification can potentially assist any organization or company in evaluating their current level of acceptability by the local population and relevant stakeholders.  Specifically, it could help evaluate whether people are more concerned about the economic, social, or environmental aspects of the project.  In conjunction with stance and sentiment SLO machine learning models, it should be plausible that the level of acceptability of a project can be accurately judged.</p>
	
    
With social media so prevalent in this day and age, it is a simple matter to obtain fresh new datasets on a daily basis to gauge the SLO.  As such, the synchronicity between the dynamism of maintaining the SLO and new Tweets pertaining to the associated project works well.  Rather than conduct old fashioned mail surveys, which is time-consuming and potentially expensive, the entire procedure can be automated.  Extract Twitter data using the Twitter API, pre-process the dataset, post-process the dataset, insert into the machine learning model(s) as input feature vectors, and predict the level of approval.  Given a good model, any organization, corporation, or other entity, can perform a pseudo-real-time estimate on how accepted their current operations and activities are.</p>
	
    
The initial investment would be in adjusting hyperparameters with the validation set to achieve the optimal results while avoiding overfitting and ensuring the model generalizes well to new data.  Once this is achieved, the model should be relevant and usable as an SLO predictor for a given period of time for a particular project and organization.  Of course, even with a good model perhaps the best way to judge SLO would still be to do a face-to-face interview with the individuals in the community and stakeholders and simply ask how they feel about the project.  Then again, the anonymity of the Internet does provide an outlet for people to vent and voice their opinions with less fear of reprisal than in reality.  So perhaps anonymous Tweeters are more honest.  But, anonymity could also cause people to simply say whatever they desire with little regard to how their words actually correlate to their own personal beliefs and opinions on the matter.  Either way, an SLO TBL machine learned prediction model won’t be the be all and end all in estimating Social License to Operate.  But, it can be a useful cog in the whole machine in order to generate the necessary analysis required to measure the components of SLO.</p>
 
 
Works Referenced:


1)	Anonymous ACL submission. “Classifying Stance Using Profile Texts”.</p>

2)	“1. Supervised Learning¶.” Scikit, scikit-learn.org/stable/supervised_learning.html#supervised-learning.</p>

3)	“A Gentle Introduction to the Bag-of-Words Model.” Machine Learning Mastery, 12 Mar. 2019, machinelearningmastery.com/gentle-introduction-bag-words-model/.</p>

4)	“Introduction to Machine Learning  |  Machine Learning Crash Course  |  Google Developers.” Google, Google, developers.google.com/machine-learning/crash-course/ml-intro.</p>

5)	Kenton, Will. “How Can There Be Three Bottom Lines?” Investopedia, Investopedia, 9 Apr. 2019, www.investopedia.com/terms/t/triple-bottom-line.asp.</p>

6)	Littman, Justin. “Where to Get Twitter Data for Academic Research.” Social Feed Manager, 14 Sept. 2017, gwu-libraries.github.io/sfm-ui/posts/2017-09-14-twitter-data.</p>

7)	Mohammad, Saif, et al. “SemEval-2016 Task 6: Detecting Stance in Tweets.” Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval-2016), 2016, doi:10.18653/v1/s16-1003.</p>

8)	“Multiclass Classification.” Wikipedia, Wikimedia Foundation, 18 Apr. 2019, en.wikipedia.org/wiki/Multiclass_classification.</p>

9)	“Symbolic Reasoning (Symbolic AI) and Machine Learning.” Skymind, skymind.ai/wiki/symbolic-reasoning.</p>

10)	Walker, Leslie. “Learn Tweeting Slang: A Twitter Dictionary.” Lifewire, Lifewire, 8 Nov. 2017, www.lifewire.com/twitter-slang-and-key-terms-explained-2655399.</p>

11)	“What Is the Social License?” The Social License To Operate, socialicense.com/definition.html.</p>

12)	“Working With Text Data¶.” Scikit, scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html.</p>

# Social License to Operate: Triple-Bottom-Line Topic Classification Report

Vision:</p>


The general purpose of the project is to perform Social License to Operate Triple-Bottom-Line topic classification on Twitter data associated with various mining companies.  Social License to Operate indicates the ongoing acceptance of a company or industry’s standard business practices and operating procedures by its employees, stakeholders, and the general public (Investopedia).  Triple Bottom Line is a framework or theory that recommends that companies commit to focus on social and environmental concerns just as they do on profits (Investopedia).  We will use supervised machine learning algorithms to perform multi-class single-label classification of Tweets to predict whether their topic of discussion corresponds to social, environmental, or economic concerns.</p>


Background:</p>


Our work is a revival and continuation of the work initially done at the Commonwealth Scientific and Industrial Research Organization (CSIRO) by (insert name here) on TBL topic classification.  We are not directly referencing that research but instead basing our initial data pre-processing on the anonymous ACL submission titled “Classifying Stance Using Profile Text”.  We are however using the exact same labeled training dataset that was used in the prior research for TBL topic classification on SLO for mining companies.  Our work will also involve use of the datasets available on Calvin College’s Borg Supercomputer and will be uploaded to the Calvin-CS / slo-classifiers GitHub Repository.  This project will be a prelude to continued research on topic, stance, and sentiment analysis utilizing machine learning for Social License to Operate of mining companies in connection with Professor VanderLinden’s “Machine Learning for Social Media” research project.</p>


As of the current status of this report, we are currently rapid prototyping using Scikit-Learn machine learning classifiers.  These classifiers require minimal effort to initially setup with default hyperparameters.  They train speedily and provide results in a timely manner, allowing us to adjust our hyper-parameters on-the-fly to see if there are any noticeable differences.  It is also quite simple to add additional Classifiers as the Pipeline class allows literal copy/paste of a code template.  All that is required is the addition of a new import statement for that Classifier and to replace the name of the old Classifier and its corresponding parameters with the new one.  This design feature is one of the reasons we chose to utilize Scikit-Learn; that and it was recommended by Professor VanderLinden as the starting point.</p>  


Of note is that Scikit-Learn provides automated parameter tuning via the Grid Search and Random Search classes.  Grid search methodically builds and evaluates a model for each combination of algorithm parameters specified in a grid.  Random search methodically builds and evaluates a model for each combination of algorithm parameters sampled from a random distribution for a fixed number of iterations.  We plan to utilize one or both of these parameter tuning methods in order to expedite the search for optimal hyperparameters for all of the Scikit-Learn Classifiers we are prototyping with.  As we add additional Classifiers to our codebase, it becomes time-saving to automate parameter tuning as much as possible.</p>


Once we have established which classifiers have the most potential to provide favorable metrics, we may migrate towards Keras and Tensorflow for GPU support and more versatility.  Scikit-Learn does not provide GPU support for its machine learning algorithms.  This does not matter at the moment as we are working with two very small datasets which in total only provide us with 330 samples.  That and GPU support will primarily benefit deep neural networks while we are also using non-NN algorithms.  However, if we wish to crowdsource TBL classification on significantly larger Twitter datasets and work with those, then GPU support will become necessary.  We have heard it requires approximately 24 hours utilizing one Nvidia Geforce Titan on the Borg supercomputer to perform stance analysis training on the larger Twitter datasets consisting of 500k+ examples.  It would be expedient to parallelize this process utilizing all 4 Nvidia Geforce Titans on Borg to cut the training time down to a quarter.</p>


We plan to implement metric visualizations via the use of the matplotlib library and SciView in Pycharm.  The Scikit-learn online documentation has a section on “Classification of text documents using sparse features” that can hopefully be modified to suit our purposes.  Their codebase constructs a bar plot comparing a variety of Classifiers side-by-side visualizing the accuracy score, training time, and test time.  As we are also training multiple Classifiers in the hopes of finding a suitable one(s) to further explore in the Keras and Tensorflow API, this type of visualization would be very useful.  Individual charts detailing a metric summarization of the micro/macro average, weighted average and associated precision, recall, f1-score, and support values are also planned.</p>


Implementation:</p>


These sections will describe in detail (perhaps too much detail) our current implementation for SLO TBL topic classification in Python in association with the current state of the codebase.  We have decided to keep all debug output statements in the meantime as this is far from the final system that will be implemented.</p>


We are performing text pre-processing on the training dataset that consists of 299 Tweet examples.  Not all of them are labeled with a TBL topic classification and those are dropped from consideration.  The data is shuffled randomly upon importation to ensure there is no biased structure to the import order.  We do so by utilizing Numpy’s “random.permutation” function.  Then, a Pandas dataframe is constructed to store the dataset.  Custom column names are added for clarity of purpose as none originally exist.  The “Tweet” column stores the Tweet, “SLO1” stores the first assigned topic label, and “SLO2” and “SLO3” store any additional assigned topic labels.</p>


Pandas provides a “dropna()” method by which we drop all rows without at least 2 non-NaN values.  A row with fewer than 2 non-NaN values lacks any TBL classification labels and can be safely discarded.  We use Boolean indexing via bitwise operations, the “.notna()” method, to construct a mask by which we isolate those examples with only a single TBL classification.  These examples are placed in a new dataframe and afterward, we drop the SLO2 and SLO3 columns as they are obviously just NaN values.  This procedure is effective as a preliminary analysis of the CSV file indicates that all labeled examples definitely have a label in the “SLO1” column.  Our objective is to construct a dataframe consisting of a column storing the raw Tweet and another column storing a single topic classification.  We rename this new dataframe to columns “Tweet” and “SLO”.</p>


Next, we construct another mask to isolate all examples with multiple SLO TBL classifications and apply the mask to construct a new dataframe containing only those examples.  We then perform a “drop()” operation on the new dataframe to construct 3 separate dataframes.  The first from dropping SLO2 and SLO3, second dropping SLO1 and SLO3, and third dropping SLO1 and SLO2.  This inefficient but workable solution effectively create duplicates of all examples with multiple SLO TBL classifications with just a single label per example.  We then name the columns “Tweet” and “SLO”.  This is done so that our machine learning model can take into consideration those examples that can be classified as multiple topics.</p>


The multiple separate dataframes constructed from the above operations are then concatenated back together as a single whole Pandas dataframe.  Any rows with a NaN value in any column are then dropped via “dropna()” to effectively remove all examples with multiple topic classifications that might have had a topic in SLO2 but not SLO3 or vice versa.  Last, we drop all duplicated examples possessing the same TBL classification values in the “SLO” column.  We do this as the initial imported dataset sometimes contained duplicate labels for the same example.  We surmise this is because multiple people were manually hand-tagging the Tweets and sometimes they were in agreement.</p>


Using the “shape” attribute, we find that our final training dataframe contains a total of 245 Tweets with a single TBL topic classification label.  It should be noted that as of our current implementation the second TBL labeled dataset provided by Professor VanderLinden is not currently in use.  There are 31 additional Tweets and we plan to include these in the future to help alleviate our issue of a small training and test dataset.  We are also using a large Twitter dataset that has already been pre-processed and tokenized as the set we will make predictions on in order to test the generalization of our model(s) to new data.  This set does not contain any target labels and thus we cannot use part of it to supplement our small training and test sets.  There are a total of 658983 Tweets included.  The CMU Tweet Tagger was used to pre-process the text but unfortunately, this is not a feasible option for us as we are working solely on Windows OS workstation(s).</p>


As we are incapable of using the Linux/Mac only CMU Tweet Tagger for pre-processing, our decision was to manually clean the raw Tweet using Python regular expressions and other libraries.  The Natural Language Toolkit was considered as an alternative but ultimately we chose to just use built-in Python libraries and functions.  A for loop is used to send each Tweet to a preprocessing function that does the following:</p>


a)	Removes “RT” tags indicating retweets.</p>

b)	Removes URLs (e.g. https://…) and replaces with slo_url.</p>

c)	Removes Tweet mentions (e.g. @mention) and replaces with slo_mention.</p>

d)	Removes Tweet hashtags (e.g. #hashtag) and replaces with slo_hashtag.</p>

e)	Removes all punctuation from the Tweet.</p>


We also down-case all text from upper to lower case letters.  On our TODO list is to implement regular expressions or other methods in order to:</p>


a)	Shrink character elongations (e.g. “yeees” → “yes”)</p>

b)	Remove non-English tweets</p>

c)	Remove non-company associated Tweets.</p>

d)	Remove year and time.</p>


For our current two datasets, the yet-to-be-implemented preprocessing features do not seem to be an issue as the preliminary analysis indicates those elements are not present or have already been considered.</p>


The next step was the input feature creation using the “Tweet” column and a target label set using the “SLO” column.  Scikit-Learn included a handy function “train_test_split()” which allowed us to easily split our input feature and target labels into a training and test set.  The target label train and test sets were then encoded using the Scikit-Learn LabelEncoder class.  This converted our categorical labels of “economic”, “environmental”, and “social”, into associated integer values of 0, 1, and 2, respectively.  A necessary step as most machine learning algorithms we are interested in prototyping with require and support only numerical data.</p>


The Scikit-Learn CountVectorizer class was used to convert the processed Tweet training and test set into feature vectors with binary values of 0 and 1.  Documentation indicates that the class converts a collection of text documents to a matrix of token counts and produces a sparse representation of the counts.  As we did not provide an a-priori dictionary and analyzer for feature selection, the total number of features is equal to the vocabulary size of the analyzed data.  Hence, we have a very high dimensionality in our feature vectors compared to our small number of samples.  This effectively creates the bag-of-words that we used to represent our categorical Tweet data.  The occurrences of each word are stored in the feature vector.  Console output shows that we are dealing with a vocabulary size of 809 in comparison to 164 examples for the training set and 81 examples for the test set.</p>


The Scikit-Learn TfidfTransformer class was then used to convert the vectorized categorical Tweet data into term-frequency * inverse document-frequency.  The purpose of this is to scale down the impact of tokens that occur very frequently and are therefore empirically less informative than features that occur in a small fraction of the training set.  Term frequencies, in general, are better than raw occurrences as larger corpuses will have higher average word occurrence values than smaller corpuses.  So, normalization of this kind provides better input feature vectors for training our model.</p>


It is at this point in the code base that we also import a very large Tweet dataset consisting of some 600k+ Tweets that are unlabeled to be used as the input feature for making predictions and seeing how well our model generalizes to new data.  These Tweets have already been preprocessed and tokenized by the CMU Tweet Tagger.  We simply have to load the entire dataset into a Pandas dataframe, isolate the “tweet_t” column that contains the Tweet, and use the CountVectorizer and TfidfTransformer classes to normalize from categorical to numerical data.  The details are similar to what is described above.  For the future, we plan to do further post-processing on these Tweets in order to minimize the discrepancies between how we pre-processed and post-processed our training and test datasets and how it was done on this Twitter dataset.  Two things we have noticed are that those Tweets still seem to contain hashtag items and some punctuation.  These should be removed as we removed both in our training and test sets.  The predictive ability of our trained model may otherwise be compromised when using these Tweets.</p>


With the training, test, and generalization sets properly prepared, we utilized Scikit-Learn’s Pipeline class in order to set up various Classifiers.  These currently include:</p>


a)	Multinomial Naïve Bayes’</p>

b)	Stochastic Gradient Descent (SGD)</p>

c)	Support Vector Machine – Support Vector Classifier.</p>

d)	Support Vector Machine – Linear Support Vector Classifier.</p>

e)	Nearest Neighbor KNeighbors Classifier.</p>

f)	Decision Tree Classifier.</p>

g)	Multi-layer Perceptron Neural Network Classifier.</p>

h)	Logistic Regression Classifier.</p>


These are all Classifiers capable of multi-class single-label topic classification.  As such, we have decided to implement as many as we can to see which one will be the most performant and worthy of further consideration in the Keras and Tensorflow API, provided those API’s support or can be made to support that Classifier.</p>


Results:</p>


As we have just begun initial implementation of our machine learning system, most of these classifiers have been using default hyperparameters and thus our results have been pretty dismal, at best.  The highest accuracy metric obtained was almost 56% with the lowest dipping into the 20% range.  It is our plan to use parameter tuning via Grid Search or Random Search to assist in finding hyperparameters that will improve our metrics.  As of the moment, the predictive ability of our Scikit-Learn trained models is less useful than flipping a coin.</p>


Of particular concern to us is performing the proper and necessary pre-processing and post-processing of the Twitter data into useable sparse feature vectors.  Regretfully, we will need to obtain the assistance of other researchers with a Linux/Mac workstation and the proper set up in order to use the CMU Tweet Tagger on the labeled TBL datasets.  Otherwise, we can only find other alternatives.</p>


It is also within our planned schedule to implement matplotlib visualizations of our metric summaries to display the results of training our models and their predictive abilities in generalizing to new data.  As of the current writing of this report, this is where we are in our research efforts.  Please refer to the code modules included in this Jupyter Notebook for further details.</p>


Placeholder – discuss comparison with similar works.</p>




 
Works Referenced:</p>


1)	“1. Supervised Learning.” Scikit, scikit-learn.org/stable/supervised_learning.html#supervised-learning.</p>

2)	“A Gentle Introduction to the Bag-of-Words Model.” Machine Learning Mastery, 12 Mar. 2019, machinelearningmastery.com/gentle-introduction-bag-words-model/.</p>

3)	“Classification of Text Documents Using Sparse Features.” Scikit, scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py.</p>

4)	“Introduction to Machine Learning  |  Machine Learning Crash Course  |  Google Developers.” Google, Google, developers.google.com/machine-learning/crash-course/ml-intro.</p>

5)	“How to Tune Algorithm Parameters with Scikit-Learn.” Machine Learning Mastery, 1 Nov. 2018, machinelearningmastery.com/how-to-tune-algorithm-parameters-with-scikit-learn/.</p>

6)	Kenton, Will. “How Can There Be Three Bottom Lines?” Investopedia, Investopedia, 9 Apr. 2019, www.investopedia.com/terms/t/triple-bottom-line.asp.</p>

7)	Littman, Justin. “Where to Get Twitter Data for Academic Research.” Social Feed Manager, 14 Sept. 2017, gwu-libraries.github.io/sfm-ui/posts/2017-09-14-twitter-data.</p>

8)	Mohammad, Saif, et al. “SemEval-2016 Task 6: Detecting Stance in Tweets.” Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval-2016), 2016, doi:10.18653/v1/s16-1003.</p>

9)	“Multiclass Classification.” Wikipedia, Wikimedia Foundation, 18 Apr. 2019, en.wikipedia.org/wiki/Multiclass_classification.</p>

10)	“Symbolic Reasoning (Symbolic AI) and Machine Learning.” Skymind, skymind.ai/wiki/symbolic-reasoning.</p>

11)	Walker, Leslie. “Learn Tweeting Slang: A Twitter Dictionary.” Lifewire, Lifewire, 8 Nov. 2017, www.lifewire.com/twitter-slang-and-key-terms-explained-2655399.</p>

12)	“What Is the Social License?” The Social License To Operate, socialicense.com/definition.html.</p>

13)	“Working With Text Data.” Scikit, scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html.</p>

