In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# These four values feed the matplotlib rcParams and the inline figure format below.
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96

# matplotlib defaults / format
# Best-effort: any failure (e.g. matplotlib or IPython missing) is swallowed so the
# rest of the setup cell still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort as above: skipped silently when plotly cannot be imported.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
# With fig_format set to 'retina' above, the set_option call is not reached.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
# Builds a {source path: modification time} map for every module currently loaded
# and prints it as a single JSON line.
kernel_deps = dict()
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if not path:
    continue
  # map a compiled-bytecode path back to its .py source by dropping the last character
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true and the working
# directory is always changed to this absolute, machine-specific path.
# NOTE(review): later cells read "sona.csv" and write to EDA/ relative to this directory.
if r'/Users/heiletjevanzyl/Desktop/DSFI/STA5073Z_FinalAssignment1':
  os.chdir(r'/Users/heiletjevanzyl/Desktop/DSFI/STA5073Z_FinalAssignment1')

# reset state
%reset

def ojs_define(**kwargs):
  """Emit the given keyword arguments as an ``ojs-define`` script tag.

  Each keyword becomes a ``{"name": ..., "value": ...}`` entry in a JSON
  payload that is displayed as HTML with ``ojs_define=True`` metadata.

  Values whose type is exactly ``pandas.Series`` or ``pandas.DataFrame`` are
  converted to a plain dict mapping each column label to the list of that
  column's values; every other value is passed to ``json.dumps`` unchanged
  and therefore must be JSON-serialisable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # only fall back on a failed import; a bare except would also swallow
    # KeyboardInterrupt/SystemExit
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ImportError: # don't do the magic when pandas is not available
      return value
    if type(value) == pd.Series:
      value = pd.DataFrame(value)
    if type(value) == pd.DataFrame:
      # transposing first makes the 'split' index hold the column labels and
      # each data row hold that column's values
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
#---------------------------------------------------------------------------------------------------------------------------
# preliminaries: load relevant libraries; import data; define colour palette
#---------------------------------------------------------------------------------------------------------------------------
# load libraries
import brewer2mpl
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import re
import joblib
import numpy as np
import nltk
import tensorflow as tf
import random
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from kerastuner.tuners import RandomSearch
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import History
from tensorflow.keras.layers import LSTM, SpatialDropout1D
import matplotlib.ticker as ticker
from mpl_toolkits.mplot3d import Axes3D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tabulate import tabulate

# global plot typography, applied to every figure created after this cell
plt.rcParams.update({
    'font.family': 'Andale Mono',
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 16,
    'legend.fontsize': 14,
})

# seed each random number source with the same value for reproducibility
random.seed(5)
np.random.seed(5)
tf.random.set_seed(5)
os.environ['PYTHONHASHSEED'] = '5'

# raw speeches, read relative to the current working directory
data = pd.read_csv("sona.csv")

# ten-step RdGy diverging palette (reversed), as a list of matplotlib colours
num_colors = 10
rdgy_palette = brewer2mpl.get_map('RdGy', 'Diverging', num_colors, reverse=True).mpl_colors

#---------------------------------------------------------------------------------------------------------------------------

In [3]:
#---------------------------------------------------------------------------------------------------------------------------
# data pre-processing: prepare data  ~ subsetting by presidents, cleaning, and segmenting speeches into sentences
#---------------------------------------------------------------------------------------------------------------------------
# select subset of four out of six presidents
subset = data.loc[data['president'].isin(['Mandela', 'Mbeki', 'Zuma', 'Ramaphosa'])]

# one record per sentence; each record repeats the year, president and date of
# the speech the sentence was taken from
sentences_data = [
    {
        'sentence': sentence,
        'year': row['year'],
        'president': row['president'],
        'date': row['date'],
    }
    for index, row in subset.iterrows()
    for sentence in sent_tokenize(row['speech'])
]

# sentence-level dataframe used by all later cells
sona_sentences = pd.DataFrame(sentences_data)

# lookup sets used by filter_text: the NLTK English word list and the NLTK
# English stop-word list
english_words = set(nltk.corpus.words.words())
stop_words = set(stopwords.words('english'))

def filter_text(text, stop_word_set=None, vocabulary=None):
    """Return the lower-cased words of ``text`` that survive the vocabulary filter.

    Each whitespace-separated token is first stripped of every non-letter
    character. It is kept when, after that normalisation, it

    * is longer than three characters,
    * is not a stop word (compared in lower case), and
    * occurs in the vocabulary either as written or in lower case.

    Normalising *before* filtering matters: previously a token such as
    ``"government,"`` failed the vocabulary lookup because of the trailing
    comma, and a capitalised stop word such as ``"Their"`` slipped past the
    lower-case stop-word set.

    Parameters
    ----------
    text : str
        Raw sentence.
    stop_word_set : collection of str, optional
        Lower-case stop words. Defaults to the module-level ``stop_words``.
    vocabulary : collection of str, optional
        Accepted words. Defaults to the module-level ``english_words``.

    Returns
    -------
    str
        The kept words, lower-cased and joined by single spaces (``""`` when
        nothing survives).
    """
    stops = stop_words if stop_word_set is None else stop_word_set
    vocab = english_words if vocabulary is None else vocabulary

    kept = []
    for raw_token in text.split():
        token = re.sub(r'[^A-Za-z]', '', raw_token)
        lowered = token.lower()
        if len(lowered) <= 3 or lowered in stops:
            continue
        # accept the as-written form too, so capitalised vocabulary entries still match
        if token in vocab or lowered in vocab:
            kept.append(lowered)
    return ' '.join(kept)

# apply the filter function to the cleaned sentences
# filter each raw sentence, then drop every character that is neither a letter
# nor whitespace and lower-case the result
sona_sentences['cleaned_sentence'] = (
    sona_sentences['sentence']
    .apply(filter_text)
    .str.replace(r'[^A-Za-z\s]', '', regex=True)
    .str.lower()
)

#---------------------------------------------------------------------------------------------------------------------------

In [4]:
#----------------------------------------------------------------------------------------------------------------------------
# exploratory data analysis: plot speech counts
#----------------------------------------------------------------------------------------------------------------------------
# # create dataframe with counts of speeches for each president
# president_num_speeches = data['president'].value_counts().reset_index()
# president_num_speeches.columns = ['president', 'num_speeches']

# # setting a specific order for the presidents
# ordered_presidents = ['Mandela', 'Mbeki', 'Zuma', 'Ramaphosa']
# president_num_speeches['president'] = pd.Categorical(president_num_speeches['president'], categories=ordered_presidents, ordered=True)

# # sort the DataFrame based on the defined order
# president_num_speeches.sort_values('president', inplace=True)

# # plot the sentence counts
# plt.bar(president_num_speeches['president'], president_num_speeches['num_speeches'], color=rdgy_palette[2])

# # Customizing the plot
# plt.xlabel("President", fontweight='bold', family='Andale Mono')
# plt.ylabel("Number of Speeches", fontweight='bold', family='Andale Mono')
# plt.xticks(rotation=55, horizontalalignment='right', fontweight='bold', family='Andale Mono')
# plt.yticks(fontweight='bold', family='Andale Mono')
# plt.title("", fontweight='bold', ha='center')

# # Additional theme customizations
# plt.grid(False)
# plt.tight_layout()
# plt.savefig(f'EDA/president_num_speeches.png', bbox_inches='tight')
# plt.close() 
#---------------------------------------------------------------------------------------------------------------------------

In [5]:
#---------------------------------------------------------------------------------------------------------------------------
# exploratory data analysis: plot sentence counts 
#---------------------------------------------------------------------------------------------------------------------------
# count the occurrences of each president
president_num_sentences = sona_sentences['president'].value_counts().reset_index()
president_num_sentences.columns = ['president', 'num_sentences']

# put the presidents into a fixed display order
ordered_presidents = ['Mandela', 'Mbeki', 'Zuma', 'Ramaphosa']
president_num_sentences['president'] = pd.Categorical(president_num_sentences['president'], categories=ordered_presidents, ordered=True)

# sort by that order; reassigning (rather than inplace=True) keeps the cell safe to re-run
president_num_sentences = president_num_sentences.sort_values('president').reset_index(drop=True)

# the output directory is not created anywhere else, so make sure it exists before saving
os.makedirs('EDA', exist_ok=True)

# creating the sentence-count plot
plt.bar(president_num_sentences['president'], president_num_sentences['num_sentences'], color=rdgy_palette[0])

# customizing the sentence-count plot
plt.xlabel("President", fontweight='bold', family='Andale Mono')
plt.ylabel("Number of Sentences", fontweight='bold', family='Andale Mono')
plt.xticks(rotation=55, horizontalalignment='right', fontweight='bold', family='Andale Mono')
plt.yticks(fontweight='bold', family='Andale Mono')
plt.title("", fontweight='bold', ha='center')
plt.grid(False)
plt.tight_layout()
plt.savefig('EDA/president_num_sentences.png', bbox_inches='tight')
plt.close()
#---------------------------------------------------------------------------------------------------------------------------

In [6]:
#---------------------------------------------------------------------------------------------------------------------------
# exploratory data analysis:  plot average sentence lengths
#---------------------------------------------------------------------------------------------------------------------------
# # calculate the average length of sentences for each president
# sona_sentences['sentence_length'] = sona_sentences['sentence'].apply(len)
# avg_sentence_length = sona_sentences.groupby('president')['sentence_length'].mean().reset_index()
# avg_sentence_length['av_sen_length'] = avg_sentence_length['sentence_length'].apply(lambda x: int(x))
# avg_sentence_length.drop('sentence_length', axis=1, inplace=True)
# # sort the dataframe based on the defined order
# avg_sentence_length.sort_values('president', inplace=True)

# # plot the average sentence lengths 
# plt.bar(avg_sentence_length['president'], avg_sentence_length['av_sen_length'], color=rdgy_palette[9])

# # Customizing the plot
# plt.xlabel("President", fontweight='bold', family='Andale Mono')
# plt.ylabel("Mean Sentence Length (in words)", fontweight='bold', family='Andale Mono')
# plt.xticks(rotation=55, horizontalalignment='right', fontweight='bold', family='Andale Mono')
# plt.yticks(fontweight='bold', family='Andale Mono')
# plt.title("", fontweight='bold', ha='center')
# plt.grid(False)
# plt.tight_layout()
# plt.savefig(f'EDA/avg_sentence_length.png', bbox_inches='tight')
# plt.close() 
#---------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------
# exploratory data analysis: plot top frequent words across all speeches and stratified by president
#---------------------------------------------------------------------------------------------------------------------------------------------------------
# add a new column 'sentenceID' which is the row number
sona_sentences['sentenceID'] = range(1, len(sona_sentences) + 1)

# one row per (sentenceID, word). Indexing by sentenceID before exploding makes
# the 'sentenceID' column hold the real 1-based ID; previously it held the
# 0-based dataframe index, which is off by one from sona_sentences['sentenceID'].
sona_words = (
    sona_sentences.set_index('sentenceID')['cleaned_sentence']
    .apply(word_tokenize)
    .explode()
    .reset_index()
)
sona_words.columns = ['sentenceID', 'word']

# counting the occurrences of each word
word_counts = sona_words['word'].value_counts().reset_index()
word_counts.columns = ['word', 'n']

# top 15 most frequent words, sorted ascending by count for plotting
top_words = word_counts.head(15).sort_values(by='n', ascending=True)

# the output directory is not created anywhere else, so make sure it exists before saving
os.makedirs('EDA', exist_ok=True)

# plot the top 15 most frequent words across all speeches
plt.figure(figsize=(10, 6))
sns.barplot(x='n', y='word', data=top_words, color=rdgy_palette[0])

# Customizing the plot
plt.xlabel("Number of times word appears", fontweight='bold')
plt.ylabel("", fontweight='bold', family='Andale Mono')
plt.xticks(fontweight='bold', family='Andale Mono')
plt.yticks(fontweight='bold', family='Andale Mono')
plt.title("", fontweight='bold', ha='center')
plt.grid(False)
# NOTE(review): set after the axes already exist, so this presumably only
# changes the style of figures created later — confirm that is intended.
sns.set_style("whitegrid")
plt.tight_layout()
plt.savefig('EDA/top_words.png', bbox_inches='tight')
plt.close()

# # Assuming sona_words DataFrame is already defined and contains the 'word' column
# # Count the occurrences of each word
# word_counts_all = sona_words['word'].value_counts().head(15)

# # Sort the words for better visualization
# word_counts_all = word_counts_all.sort_values()

# # Define a RdGy-like color palette
# rdgy_palette = sns.color_palette("RdGy", n_colors=10)

# # Assuming 'president' is a column in the sona_words DataFrame
# # Filter the top 15 words for each president
# top_words_per_president = sona_words.groupby('president')['word'].value_counts().groupby(level=0).head(15).reset_index(name='count')

# # Create a facetted bar plot
# g = sns.catplot(x='count', y='word', col='president', col_wrap=4, data=top_words_per_president, kind='bar', palette=rdgy_palette)
# g.set_axis_labels("Number of times word appears", "")
# g.set_titles("{col_name}")
# plt.savefig(f'EDA/top_words_per_president.png', bbox_inches='tight')
# plt.close() 
# #---------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
#---------------------------------------------------------------------------------------------------------------------------
# data pre-processing: create three different data structures for analysis
#---------------------------------------------------------------------------------------------------------------------------
# create BoW ~ using top 150 words
bow_vectorizer = CountVectorizer(max_features=150)
bow_features = bow_vectorizer.fit_transform(sona_sentences['cleaned_sentence']).toarray()

# create tf-idf ~ using 150 words
tfidf_vectorizer = TfidfVectorizer(max_features=150)
tfidf_features = tfidf_vectorizer.fit_transform(sona_sentences['cleaned_sentence']).toarray()

# create embeddings for other models
tokenized_speeches = [text.split() for text in sona_sentences['cleaned_sentence']]

# Word2Vec training is stochastic: pass the notebook's seed (5) explicitly and
# train on a single worker thread, since multi-threaded training orders updates
# non-deterministically. NOTE(review): gensim additionally documents that
# PYTHONHASHSEED must be set before the interpreter starts for runs to be
# exactly repeatable.
model = Word2Vec(sentences=tokenized_speeches, vector_size=150, window=5, min_count=1, workers=1, seed=5)

# sentence embedding = mean of its word vectors; a sentence with no known words
# falls back to a 150-dimensional zero vector
embeddings_features = np.array([
    np.mean([model.wv[word] for word in tokens if word in model.wv] or [np.zeros(150)], axis=0)
    for tokens in tokenized_speeches
])

def sentence_to_avg_vector(sentence, model):
    """Average the word vectors of the whitespace-separated words in ``sentence``.

    Words that are not found in ``model.wv`` are skipped. When no word is found
    at all, a zero vector of length ``model.vector_size`` is returned instead.
    """
    known_vectors = [model.wv[token] for token in sentence.split() if token in model.wv]
    if not known_vectors:
        known_vectors = [np.zeros(model.vector_size)]
    return np.mean(known_vectors, axis=0)

# convert each cleaned sentence to the sequence of its known word vectors
vector_sequences = [
    [model.wv[token] for token in cleaned.split() if token in model.wv]
    for cleaned in sona_sentences['cleaned_sentence']
]

# the longest sequence sets the common padded length
max_seq_length = max(map(len, vector_sequences))

# zero-pad at the end of each sequence so that all have the same length
padded_sequences = pad_sequences(
    vector_sequences,
    maxlen=max_seq_length,
    padding='post',
    dtype='float32',
    value=0,
)

#---------------------------------------------------------------------------------------------------------------------------

In [9]:
#---------------------------------------------------------------------------------------------------------------------------
# create splits for the data ~ 60-20-20 = training-validation-test
#---------------------------------------------------------------------------------------------------------------------------
seed = 5
labels = sona_sentences['president']

# stage 1: 60% training / 40% held out, stratified by president. All feature
# matrices go through one call, so every representation gets the same rows.
(
    X_train_bow, X_temp_bow,
    X_train_tfidf, X_temp_tfidf,
    X_train_emb, X_temp_emb,
    X_train_rnnemb, X_temp_rnnemb,
    y_train, y_temp,
) = train_test_split(
    bow_features, tfidf_features, embeddings_features, padded_sequences, labels,
    test_size=0.4, random_state=seed, stratify=labels,
)

# stage 2: split the held-out 40% in half -> 20% validation / 20% test
(
    X_val_bow, X_test_bow,
    X_val_tfidf, X_test_tfidf,
    X_val_emb, X_test_emb,
    X_val_rnnemb, X_test_rnnemb,
    y_val, y_test,
) = train_test_split(
    X_temp_bow, X_temp_tfidf, X_temp_emb, X_temp_rnnemb, y_temp,
    test_size=0.5, random_state=seed, stratify=y_temp,
)

#---------------------------------------------------------------------------------------------------------------------------

In [10]:
#---------------------------------------------------------------------------------------------------------------------------
# Classification tree ~ define general function for all approaches
#---------------------------------------------------------------------------------------------------------------------------
def decision_tree_analysis(X_train, X_val, X_test, y_train, y_val, y_test, feature_set_name, results_dir='results', random_state=5):
    """Grid-search a decision tree on one feature set and report test-set results.

    A fitted ``GridSearchCV`` is cached at
    ``<results_dir>/tree_clf_<feature_set_name>.pkl``; when that file exists it
    is loaded instead of re-fitting. The function then prints the best
    parameters and a formatted classification report for the test set, shows
    the test confusion matrix, and shows and saves a heatmap of the mean
    cross-validation score per parameter combination to
    ``<results_dir>/cv_results_<feature_set_name>.png``.

    Parameters
    ----------
    X_train, y_train : training features / labels used for the grid search.
    X_val, y_val : accepted for signature symmetry; not used by this function.
    X_test, y_test : test features / labels used for the report and matrix.
    feature_set_name : str
        Tag used in file names, printed headers and plot titles.
    results_dir : str
        Directory for the cached model and the saved heatmap.
    random_state : int
        Seed for ``DecisionTreeClassifier``. Without it the trees are fitted
        unseeded inside the ``n_jobs=-1`` worker processes, so results can
        differ between runs. Only applies when the model is fitted, not when a
        cached model is loaded.

    Returns
    -------
    None
    """
    # Define file paths
    model_path = os.path.join(results_dir, f'tree_clf_{feature_set_name}.pkl')
    plot_path = os.path.join(results_dir, f'cv_results_{feature_set_name}.png')
    os.makedirs(results_dir, exist_ok=True)

    # Check if results already exist
    if os.path.exists(model_path):
        # NOTE: joblib.load unpickles the file; only load caches this notebook wrote
        tree_clf = joblib.load(model_path)
    else:
        # Hyperparameter grid search
        tree_params = {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]}
        tree_clf = GridSearchCV(DecisionTreeClassifier(random_state=random_state), tree_params, cv=5, n_jobs=-1)
        tree_clf.fit(X_train, y_train)

        # Save the trained model
        joblib.dump(tree_clf, model_path)

    # Predictions and evaluation on test set
    y_pred_tree_test = tree_clf.predict(X_test)
    print(f"Best parameters ({feature_set_name}):", tree_clf.best_params_)

    # Generate and display formatted classification report
    report = classification_report(y_test, y_pred_tree_test, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    formatted_report = format_classification_report(report_df)
    print(f"\nClassification Report on Test Set (Classification Tree - {feature_set_name}):")
    print(formatted_report)

    # Plot and display confusion matrix on its own figure, with the class
    # names on both axes so rows/columns can be read without guessing the order
    class_labels = tree_clf.classes_
    conf_matrix_tree_test = confusion_matrix(y_test, y_pred_tree_test, labels=class_labels)
    plt.figure()
    sns.heatmap(conf_matrix_tree_test, annot=True, fmt='g', cmap=rdgy_palette, xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix on Test Set - {feature_set_name}')
    plt.show()

    # Extract, plot, and save CV results heatmap
    results = tree_clf.cv_results_
    df = pd.DataFrame({
        'max_depth': results['param_max_depth'],
        'min_samples_split': results['param_min_samples_split'],
        'mean_test_score': results['mean_test_score']
    })
    pivoted = df.pivot(index='max_depth', columns='min_samples_split', values='mean_test_score')

    plt.figure(figsize=(8, 6))
    sns.heatmap(pivoted, annot=True, fmt=".4f", cmap=rdgy_palette)
    plt.title(f'Mean CV Score Heatmap - {feature_set_name}')
    plt.xlabel('Min Samples Split')
    plt.ylabel('Max Depth')
    plt.savefig(plot_path)
    plt.show()

def format_classification_report(report_df):
    """Render a transposed ``classification_report(..., output_dict=True)`` frame as a text table.

    ``precision``, ``recall`` and ``f1-score`` are shown with two decimals and
    ``support`` as an integer. The caller's dataframe is left untouched.

    The ``accuracy`` entry of the report is a single number, which the
    dataframe construction repeats in every column. Formatting that row like
    the others would print the accuracy under precision/recall and a support
    of ``int(accuracy)`` (normally 0). For that row the precision and recall
    cells are therefore blanked and, when a ``macro avg`` row is present, its
    support (the total sample count) is shown instead.

    Parameters
    ----------
    report_df : pandas.DataFrame
        One row per class/summary entry with columns ``precision``,
        ``recall``, ``f1-score`` and ``support``.

    Returns
    -------
    str
        The table produced by ``tabulate`` in ``psql`` format.
    """
    formatted = report_df.copy()
    for column in ('precision', 'recall', 'f1-score'):
        formatted[column] = formatted[column].apply(lambda x: f"{x:.2f}")
    formatted['support'] = formatted['support'].apply(lambda x: f"{int(x)}")

    if 'accuracy' in formatted.index:
        formatted.loc['accuracy', ['precision', 'recall']] = ''
        if 'macro avg' in report_df.index:
            formatted.loc['accuracy', 'support'] = f"{int(report_df.loc['macro avg', 'support'])}"

    # Convert to a table format using tabulate
    return tabulate(formatted, headers='keys', tablefmt='psql', showindex=True)
#---------------------------------------------------------------------------------------------------------------------------

In [11]:
#---------------------------------------------------------------------------------------------------------------------------
# Classification tree  ~ for BoW approach
#---------------------------------------------------------------------------------------------------------------------------

# Fit (or load the cached) grid-searched tree on the bag-of-words features and
# report test-set results. The validation split is passed to match the
# signature; decision_tree_analysis does not use it.
decision_tree_analysis(X_train_bow, X_val_bow, X_test_bow, y_train, y_val, y_test, 'bow')

#---------------------------------------------------------------------------------------------------------------------------

In [12]:
#---------------------------------------------------------------------------------------------------------------------------
# Classification tree  ~ for tf-idf approach
#---------------------------------------------------------------------------------------------------------------------------

# Fit (or load the cached) grid-searched tree on the tf-idf features and report
# test-set results. The validation split is passed to match the signature;
# decision_tree_analysis does not use it.
decision_tree_analysis(X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test, 'tfidf')

#---------------------------------------------------------------------------------------------------------------------------

In [13]:
#---------------------------------------------------------------------------------------------------------------------------
# Classification tree  ~ for embedding approach
#--------------------------------------------------------------------------------------------------------------------------

# Fit (or load the cached) grid-searched tree on the averaged word-embedding
# features and report test-set results. The validation split is passed to match
# the signature; decision_tree_analysis does not use it.
decision_tree_analysis(X_train_emb, X_val_emb, X_test_emb, y_train, y_val, y_test, 'embedding')
#---------------------------------------------------------------------------------------------------------------------------

In [14]:
#---------------------------------------------------------------------------------------------------------------------------
# Feed-forward neural network ~ define general function for all approaches
#---------------------------------------------------------------------------------------------------------------------------
def create_model(hidden_layer_sizes, activation, input_shape, num_classes):
    """Build and compile a fully connected softmax classifier.

    One ``Dense`` layer is added per entry of ``hidden_layer_sizes``, all using
    ``activation``; only the first one is given ``input_shape``. The output
    layer has ``num_classes`` softmax units. The model is compiled with the
    Adam optimiser, sparse categorical cross-entropy loss and an accuracy
    metric, and returned.
    """
    network = Sequential()
    for position, units in enumerate(hidden_layer_sizes):
        # only the first hidden layer declares the input shape
        first_layer_kwargs = {'input_shape': input_shape} if position == 0 else {}
        network.add(Dense(units, activation=activation, **first_layer_kwargs))

    # Output layer
    network.add(Dense(num_classes, activation='softmax'))
    network.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return network

def train_and_evaluate_model(X_train, X_val, X_test, y_train, y_val, y_test, hidden_layer_sizes, activation, feature_set_name, results_dir='model_results', epochs=100, batch_size=32):
    """Train (or reload) one feed-forward configuration and report its test-set results.

    The model and its training history are cached in ``results_dir`` under
    names built from ``feature_set_name``, ``hidden_layer_sizes`` and
    ``activation``. When both files exist they are loaded; otherwise a model
    from ``create_model`` is fitted with the validation split as
    ``validation_data`` and both artefacts are saved.

    The function plots the training and validation loss curves, prints a
    formatted classification report for the test set and shows the test
    confusion matrix.

    Returns
    -------
    tuple
        ``(model, history)``. ``history`` is the object returned by
        ``model.fit`` after a fresh fit, and whatever ``joblib.load`` returns
        (the saved ``history.history``) on a cache hit.
    """
    # Encode the string labels as integers; the encoder is fitted on the training labels
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(y_train)
    val_codes = encoder.transform(y_val)
    test_codes = encoder.transform(y_test)  # computed but not used below

    n_outputs = len(np.unique(train_codes))

    # Cache locations for this configuration
    model_path = os.path.join(results_dir, f'model_{feature_set_name}_{hidden_layer_sizes}_{activation}.h5')
    history_path = os.path.join(results_dir, f'history_{feature_set_name}_{hidden_layer_sizes}_{activation}.pkl')

    cache_complete = os.path.exists(model_path) and os.path.exists(history_path)
    if cache_complete:
        model = tf.keras.models.load_model(model_path)
        history = joblib.load(history_path)
    else:
        # Build, fit and persist a fresh model
        model = create_model(hidden_layer_sizes, activation, (X_train.shape[1],), n_outputs)
        history = model.fit(
            X_train,
            train_codes,
            validation_data=(X_val, val_codes),
            epochs=epochs,
            batch_size=batch_size,
            verbose=0,
        )
        os.makedirs(results_dir, exist_ok=True)
        model.save(model_path)
        joblib.dump(history.history, history_path)

    # The history is a plain dict on a cache hit and a fit-result object otherwise
    curves = history if isinstance(history, dict) else history.history
    loss = curves['loss']
    val_loss = curves['val_loss']

    # Training vs. validation loss per epoch
    plt.plot(loss, label='Train', linestyle='-', color='r')
    plt.plot(val_loss, label='Validation', linestyle='--', color='gray')
    plt.title(f'Model Loss - {feature_set_name} - {hidden_layer_sizes} - {activation}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()

    # Predict the test set and map the integer codes back to the original labels
    predicted_codes = np.argmax(model.predict(X_test), axis=1)
    y_pred = encoder.inverse_transform(predicted_codes)
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    report_table = format_classification_report(report_df)
    print(f"\nClassification Report ({feature_set_name} - {hidden_layer_sizes} - {activation}):")
    print(report_table)

    # Test-set confusion matrix
    test_confusion = confusion_matrix(y_test, y_pred)
    sns.heatmap(test_confusion, annot=True, fmt='g', cmap=rdgy_palette)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {feature_set_name} - {hidden_layer_sizes} - {activation}')
    plt.show()

    return model, history

def format_classification_report(report_df):
    """Render a transposed ``classification_report(..., output_dict=True)`` frame as a text table.

    NOTE: a function with this name is also defined in the classification-tree
    cell; this later definition replaces it once this cell runs, so the two
    must stay in agreement.

    ``precision``, ``recall`` and ``f1-score`` are shown with two decimals and
    ``support`` as an integer. The caller's dataframe is left untouched.

    The ``accuracy`` entry of the report is a single number, which the
    dataframe construction repeats in every column. Formatting that row like
    the others would print the accuracy under precision/recall and a support
    of ``int(accuracy)`` (normally 0). For that row the precision and recall
    cells are therefore blanked and, when a ``macro avg`` row is present, its
    support (the total sample count) is shown instead.

    Parameters
    ----------
    report_df : pandas.DataFrame
        One row per class/summary entry with columns ``precision``,
        ``recall``, ``f1-score`` and ``support``.

    Returns
    -------
    str
        The table produced by ``tabulate`` in ``psql`` format.
    """
    formatted = report_df.copy()
    for column in ('precision', 'recall', 'f1-score'):
        formatted[column] = formatted[column].apply(lambda x: f"{x:.2f}")
    formatted['support'] = formatted['support'].apply(lambda x: f"{int(x)}")

    if 'accuracy' in formatted.index:
        formatted.loc['accuracy', ['precision', 'recall']] = ''
        if 'macro avg' in report_df.index:
            formatted.loc['accuracy', 'support'] = f"{int(report_df.loc['macro avg', 'support'])}"

    # Convert to a table format using tabulate
    return tabulate(formatted, headers='keys', tablefmt='psql', showindex=True)
#---------------------------------------------------------------------------------------------------------------------------

In [15]:
#---------------------------------------------------------------------------------------------------------------------------
# Feed-forward neural network ~ for BoW approach
#---------------------------------------------------------------------------------------------------------------------------

fnn_params = [(100,), (100, 50)]
activations = ['relu', 'tanh']

# every (architecture, activation) pair on the BoW features; the architecture varies slowest
for hidden_layers, activation in [(sizes, act) for sizes in fnn_params for act in activations]:
    print(f"\nTraining model with {hidden_layers} hidden layers and {activation} activation")
    train_and_evaluate_model(
        X_train_bow, X_val_bow, X_test_bow,
        y_train, y_val, y_test,
        hidden_layers, activation, 'bow',
    )

#---------------------------------------------------------------------------------------------------------------------------

In [16]:
#---------------------------------------------------------------------------------------------------------------------------
# Feed-forward neural network ~ for tf-idf approach
#---------------------------------------------------------------------------------------------------------------------------

fnn_params = [(100,), (100, 50)]
activations = ['relu', 'tanh']

# every (architecture, activation) pair on the tf-idf features; the architecture varies slowest
for hidden_layers, activation in [(sizes, act) for sizes in fnn_params for act in activations]:
    print(f"\nTraining model with {hidden_layers} hidden layers and {activation} activation")
    train_and_evaluate_model(
        X_train_tfidf, X_val_tfidf, X_test_tfidf,
        y_train, y_val, y_test,
        hidden_layers, activation, 'tf-idf',
    )

#---------------------------------------------------------------------------------------------------------------------------

In [17]:
#---------------------------------------------------------------------------------------------------------------------------
# Feed-forward neural network ~ for embedding approach
#---------------------------------------------------------------------------------------------------------------------------

fnn_params = [(100,), (100, 50)]
activations = ['relu', 'tanh']

# every (architecture, activation) pair on the averaged embeddings; the architecture varies slowest
for hidden_layers, activation in [(sizes, act) for sizes in fnn_params for act in activations]:
    print(f"\nTraining model with {hidden_layers} hidden layers and {activation} activation")
    train_and_evaluate_model(
        X_train_emb, X_val_emb, X_test_emb,
        y_train, y_val, y_test,
        hidden_layers, activation, 'embedding',
    )

#---------------------------------------------------------------------------------------------------------------------------

In [18]:
#---------------------------------------------------------------------------------------------------------------------------
# Convolutional neural network ~ define a general function for all approaches
#---------------------------------------------------------------------------------------------------------------------------
def create_cnn_model(input_shape, num_classes, filters, kernel_size, dropout_rate):
    """Build and compile a single-convolution 1D CNN classifier.

    The input of shape ``input_shape`` is reshaped to
    ``(input_shape[0], 1)`` so that ``Conv1D`` can slide over it, followed by
    global max pooling, dropout and a softmax output layer with
    ``num_classes`` units. The model is compiled with the Adam optimiser,
    sparse categorical cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample; only ``input_shape[0]`` is used for the reshape.
    num_classes : int
        Number of output classes.
    filters, kernel_size : int
        Passed to ``Conv1D``.
    dropout_rate : float
        Passed to ``Dropout``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Input and Reshape are not part of the notebook's import cell; import them
    # here so the function does not fail with a NameError when called.
    from tensorflow.keras.layers import Input, Reshape

    model = Sequential()
    model.add(Input(shape=input_shape))  # Adjust to the shape of your features
    model.add(Reshape((input_shape[0], 1)))  # Add a reshape layer to make it compatible with Conv1D
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))  # Output layer
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate_cnn_model(X_train, X_val, X_test, y_train, y_val, y_test, filters, kernel_size, dropout_rate, feature_set_name, results_dir='model_results', epochs=100, batch_size=32):
    """Train (or reload) a CNN classifier, plot its loss curves and report test performance.

    A model/history pair is cached in ``results_dir``. When both files exist
    they are loaded instead of retraining.

    NOTE(review): the cache file names contain only ``feature_set_name``,
    ``filters``, ``kernel_size`` and ``dropout_rate``. Changing ``epochs`` or
    ``batch_size`` therefore silently reuses a previously saved model; delete
    the cached files to force a retrain.

    Args:
        X_train, X_val, X_test: Feature matrices; ``X_train.shape[1]`` is used
            as the per-sample input length.
        y_train, y_val, y_test: Class labels. The encoder is fitted on
            ``y_train`` only, so labels unseen in training raise ``ValueError``.
        filters, kernel_size, dropout_rate: Forwarded to ``create_cnn_model``.
        feature_set_name: Used in cache file names, plot titles and the report header.
        results_dir: Directory for the cached model and history.
        epochs, batch_size: Forwarded to ``model.fit`` when training.

    Returns:
        ``(model, history)``. ``history`` is the object returned by
        ``model.fit`` after a fresh training run, and whatever was stored with
        ``joblib.dump(history.history, ...)`` when loaded from cache, so
        callers must accept both forms.
    """
    # Encoding target variables (integer ids; the model uses a sparse loss)
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)
    # Result unused for scoring; the call is kept so unseen test labels fail before training
    label_encoder.transform(y_test)

    class_labels = label_encoder.classes_
    num_classes = len(class_labels)

    # Define model and history paths
    model_path = os.path.join(results_dir, f'cnn_model_{feature_set_name}_{filters}_{kernel_size}_{dropout_rate}.h5')
    history_path = os.path.join(results_dir, f'cnn_history_{feature_set_name}_{filters}_{kernel_size}_{dropout_rate}.pkl')

    if os.path.exists(model_path) and os.path.exists(history_path):
        model = tf.keras.models.load_model(model_path)
        # joblib.load unpickles the file: only load caches written by this notebook
        history = joblib.load(history_path)
    else:
        # Create, train, and save the model
        model = create_cnn_model((X_train.shape[1],), num_classes, filters, kernel_size, dropout_rate)
        history = model.fit(X_train, y_train_encoded, validation_data=(X_val, y_val_encoded), epochs=epochs, batch_size=batch_size, verbose=0)
        os.makedirs(results_dir, exist_ok=True)
        model.save(model_path)
        joblib.dump(history.history, history_path)

    # Resolve the per-epoch metrics once, whichever form `history` has
    history_values = history if isinstance(history, dict) else history.history
    loss = history_values['loss']
    val_loss = history_values['val_loss']

    # Plot training & validation loss values
    plt.plot(loss, label='Train', linestyle='-', color='r')
    plt.plot(val_loss, label='Validation', linestyle='--', color='gray')
    plt.title(f'CNN Model Loss - {feature_set_name} - Filters:{filters} Kernel:{kernel_size} Dropout:{dropout_rate}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()

    # Evaluate the model and print classification report
    y_pred_encoded = np.argmax(model.predict(X_test), axis=1)
    y_pred = label_encoder.inverse_transform(y_pred_encoded)
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    formatted_report = format_classification_report(report_df)
    print(f"\nClassification Report (CNN - {feature_set_name} - Filters:{filters} Kernel:{kernel_size} Dropout:{dropout_rate}):")
    print(formatted_report)

    # Plot and display confusion matrix.
    # Passing the encoder's classes fixes the row/column order and keeps the matrix
    # num_classes x num_classes even if a class is absent from the test data; the
    # same labels are used as ticks so the axes show class names, not positions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap=rdgy_palette, xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'CNN Confusion Matrix - {feature_set_name} - Filters:{filters} Kernel:{kernel_size} Dropout:{dropout_rate}')
    plt.show()

    return model, history

def format_classification_report(report_df):
    """Render a classification-report DataFrame as a psql-style text table.

    The metric columns are formatted on a copy, so the caller's frame keeps
    its numeric values and the function can be called repeatedly on the same
    frame. (Formatting in place turned the columns into strings, which made a
    second call fail on the ``.2f`` format.)

    Args:
        report_df: DataFrame with ``precision``, ``recall``, ``f1-score`` and
            ``support`` columns (a ``KeyError`` is raised if one is missing).

    Returns:
        The value returned by ``tabulate`` for the formatted frame.
    """
    formatted = report_df.copy()

    # Two decimals for the score columns
    for column in ('precision', 'recall', 'f1-score'):
        formatted[column] = formatted[column].apply(lambda x: f"{x:.2f}")
    # Support is a count: show it as an integer
    formatted['support'] = formatted['support'].apply(lambda x: f"{int(x)}")

    # Convert to a table format using tabulate
    return tabulate(formatted, headers='keys', tablefmt='psql', showindex=True)


# Small CNN hyperparameter grid used by the BoW, tf-idf and embedding searches
filters_options, kernel_size_options, dropout_rate_options = (
    [64, 128],   # number of convolution filters
    [3, 5],      # size of the convolutional kernels
    [0.2, 0.5],  # dropout rates
)

#---------------------------------------------------------------------------------------------------------------------------

In [19]:
#---------------------------------------------------------------------------------------------------------------------------
# Convolutional neural network ~ for BoW approach
#---------------------------------------------------------------------------------------------------------------------------

# Run settings for the BoW search: report label, cache folder, training length
feature_set_name, results_dir = "BoW", "cnn_bow_results"
epochs, batch_size = 10, 32

# One record per hyperparameter combination
search_results = []

# Walk the full grid; filters varies slowest and dropout_rate fastest
for filters, kernel_size, dropout_rate in (
    (f, k, d)
    for f in filters_options
    for k in kernel_size_options
    for d in dropout_rate_options
):
    print(f"Training model with filters={filters}, kernel_size={kernel_size}, dropout_rate={dropout_rate}")

    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_cnn_model(
        X_train_bow, X_val_bow, X_test_bow,
        y_train, y_val, y_test,
        filters, kernel_size, dropout_rate,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        filters=filters,
        kernel_size=kernel_size,
        dropout_rate=dropout_rate,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    table_data.append([
        result['filters'],
        result['kernel_size'],
        result['dropout_rate'],
        f"{result['best_val_accuracy']:.4f}",
    ])

# Print the formatted table
print(tabulate(table_data, headers=['Filters', 'Kernel Size', 'Dropout Rate', 'Best Val Accuracy'], tablefmt='psql'))

#---------------------------------------------------------------------------------------------------------------------------

In [20]:
#---------------------------------------------------------------------------------------------------------------------------
# Convolutional neural network ~ for tf-idf approach
#---------------------------------------------------------------------------------------------------------------------------
# Run settings for the tf-idf search: report label, cache folder, training length
feature_set_name, results_dir = "tf-idf", "cnn_tfidf_results"
epochs, batch_size = 10, 32

# One record per hyperparameter combination
search_results = []

# Walk the full grid; filters varies slowest and dropout_rate fastest
for filters, kernel_size, dropout_rate in (
    (f, k, d)
    for f in filters_options
    for k in kernel_size_options
    for d in dropout_rate_options
):
    print(f"Training model with filters={filters}, kernel_size={kernel_size}, dropout_rate={dropout_rate}")

    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_cnn_model(
        X_train_tfidf, X_val_tfidf, X_test_tfidf,
        y_train, y_val, y_test,
        filters, kernel_size, dropout_rate,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        filters=filters,
        kernel_size=kernel_size,
        dropout_rate=dropout_rate,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    table_data.append([
        result['filters'],
        result['kernel_size'],
        result['dropout_rate'],
        f"{result['best_val_accuracy']:.4f}",
    ])

# Print the formatted table
print(tabulate(table_data, headers=['Filters', 'Kernel Size', 'Dropout Rate', 'Best Val Accuracy'], tablefmt='psql'))
#---------------------------------------------------------------------------------------------------------------------------

In [21]:
#---------------------------------------------------------------------------------------------------------------------------
# Convolutional neural network ~ for embedding approach
#---------------------------------------------------------------------------------------------------------------------------
# Run settings for the embedding search: report label, cache folder, training length
feature_set_name, results_dir = "Embedding", "cnn_emb_results"
epochs, batch_size = 10, 32

# One record per hyperparameter combination
search_results = []

# Walk the full grid; filters varies slowest and dropout_rate fastest
for filters, kernel_size, dropout_rate in (
    (f, k, d)
    for f in filters_options
    for k in kernel_size_options
    for d in dropout_rate_options
):
    print(f"Training model with filters={filters}, kernel_size={kernel_size}, dropout_rate={dropout_rate}")

    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_cnn_model(
        X_train_emb, X_val_emb, X_test_emb,
        y_train, y_val, y_test,
        filters, kernel_size, dropout_rate,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        filters=filters,
        kernel_size=kernel_size,
        dropout_rate=dropout_rate,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    table_data.append([
        result['filters'],
        result['kernel_size'],
        result['dropout_rate'],
        f"{result['best_val_accuracy']:.4f}",
    ])

# Print the formatted table
print(tabulate(table_data, headers=['Filters', 'Kernel Size', 'Dropout Rate', 'Best Val Accuracy'], tablefmt='psql'))

In [22]:
#---------------------------------------------------------------------------------------------------------------------------
# Recurrent neural network ~ define a general function for all approaches
#---------------------------------------------------------------------------------------------------------------------------
def build_rnn_model(input_shape, num_classes, lstm_units, spatial_dropout, dropout, recurrent_dropout):
    """Build and compile an LSTM classifier for flat (2-D) feature matrices.

    The input is declared with an explicit ``Input`` layer, matching
    ``create_cnn_model``, instead of passing ``input_shape=`` to the first
    layer (Keras 3 emits a warning for that form in Sequential models).

    Args:
        input_shape: Shape tuple of one sample; only ``input_shape[0]`` is used
            to size the reshape, so a 1-tuple ``(n_features,)`` is expected.
        num_classes: Number of units in the softmax output layer.
        lstm_units: Number of LSTM units.
        spatial_dropout: Rate for the ``SpatialDropout1D`` layer.
        dropout: ``dropout`` argument of the LSTM layer.
        recurrent_dropout: ``recurrent_dropout`` argument of the LSTM layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric). The non-sparse loss means
        targets must be one-hot encoded.
    """
    model = Sequential()
    model.add(Input(shape=input_shape))
    # Reshape layer to convert 2D input to 3D: each feature becomes one
    # timestep with a single channel, which is what the LSTM consumes
    model.add(Reshape((input_shape[0], 1)))
    model.add(SpatialDropout1D(spatial_dropout))
    model.add(LSTM(units=lstm_units, dropout=dropout, recurrent_dropout=recurrent_dropout))
    model.add(Dense(units=num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_rnn_model2(num_classes, lstm_units, spatial_dropout, dropout, recurrent_dropout, input_shape):
    """Build and compile an LSTM classifier for inputs that are already sequences.

    Unlike ``build_rnn_model`` there is no reshape: ``input_shape`` is handed
    to the network as-is, so it must already be the per-sample shape that
    ``SpatialDropout1D`` and ``LSTM`` accept (presumably
    ``(timesteps, features)`` — confirm against the caller).

    The input is declared with an explicit ``Input`` layer instead of passing
    ``input_shape=`` to the first layer (Keras 3 emits a warning for that form
    in Sequential models).

    Args:
        num_classes: Number of units in the softmax output layer.
        lstm_units: Number of LSTM units.
        spatial_dropout: Rate for the ``SpatialDropout1D`` layer.
        dropout: ``dropout`` argument of the LSTM layer.
        recurrent_dropout: ``recurrent_dropout`` argument of the LSTM layer.
        input_shape: Shape tuple of one sample.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(SpatialDropout1D(spatial_dropout))
    model.add(LSTM(units=lstm_units, dropout=dropout, recurrent_dropout=recurrent_dropout))
    model.add(Dense(units=num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate_rnn_model(X_train, X_val, X_test, y_train, y_val, y_test, lstm_units, spatial_dropout, dropout, recurrent_dropout, feature_set_name, results_dir='model_results', epochs=100, batch_size=32):
    """Train (or reload) an LSTM classifier, plot its accuracy curves and report test performance.

    A model/history pair is cached in ``results_dir``. When both files exist
    they are loaded instead of retraining.

    NOTE(review): the cache file names contain only ``feature_set_name`` and
    the four LSTM/dropout settings. Changing ``epochs`` or ``batch_size``
    therefore silently reuses a previously saved model; delete the cached
    files to force a retrain.

    Args:
        X_train, X_val, X_test: Feature arrays. A 2-D ``X_train`` is fed
            through ``build_rnn_model`` (which reshapes it into a sequence);
            a 3-D ``X_train`` is fed to ``build_rnn_model2`` unchanged.
        y_train, y_val, y_test: Class labels. The encoder is fitted on
            ``y_train`` only, so labels unseen in training raise ``ValueError``.
        lstm_units, spatial_dropout, dropout, recurrent_dropout: Forwarded to
            the model builder.
        feature_set_name: Used in cache file names, plot titles and the report header.
        results_dir: Directory for the cached model and history.
        epochs, batch_size: Forwarded to ``model.fit`` when training.

    Returns:
        ``(model, history)``. ``history`` is the object returned by
        ``model.fit`` after a fresh training run, and whatever was stored with
        ``joblib.dump(history.history, ...)`` when loaded from cache, so
        callers must accept both forms.
    """
    # Encoding target variables: integer ids first, one-hot rows second
    label_encoder = LabelEncoder()
    train_ids = label_encoder.fit_transform(y_train)
    class_labels = label_encoder.classes_
    num_classes = len(class_labels)

    # Pin the one-hot width to num_classes. Left to itself, to_categorical sizes
    # the matrix from the largest id present, so a validation split that lacks
    # the last class would come out narrower than the model's output layer.
    y_train_encoded = to_categorical(train_ids, num_classes=num_classes)
    y_val_encoded = to_categorical(label_encoder.transform(y_val), num_classes=num_classes)
    # Result unused for scoring; the call is kept so unseen test labels fail before training
    label_encoder.transform(y_test)

    # Define model and history paths
    model_path = os.path.join(results_dir, f'rnn_model_{feature_set_name}_{lstm_units}_{spatial_dropout}_{dropout}_{recurrent_dropout}.h5')
    history_path = os.path.join(results_dir, f'rnn_history_{feature_set_name}_{lstm_units}_{spatial_dropout}_{dropout}_{recurrent_dropout}.pkl')

    if os.path.exists(model_path) and os.path.exists(history_path):
        model = tf.keras.models.load_model(model_path)
        # joblib.load unpickles the file: only load caches written by this notebook
        history = joblib.load(history_path)
    else:
        # Create, train, and save the model
        if len(X_train.shape) == 3:
            # Already sequence-shaped input: the reshaping builder would reject it,
            # so use the builder that takes the per-sample shape directly.
            # NOTE(review): assumes axes are (samples, timesteps, features) — confirm.
            model = build_rnn_model2(num_classes, lstm_units, spatial_dropout, dropout, recurrent_dropout, input_shape=tuple(X_train.shape[1:]))
        else:
            model = build_rnn_model((X_train.shape[1],), num_classes, lstm_units, spatial_dropout, dropout, recurrent_dropout)
        history = model.fit(X_train, y_train_encoded, validation_data=(X_val, y_val_encoded), epochs=epochs, batch_size=batch_size, verbose=0)
        os.makedirs(results_dir, exist_ok=True)
        model.save(model_path)
        joblib.dump(history.history, history_path)

    # Resolve the per-epoch metrics once, whichever form `history` has
    history_values = history if isinstance(history, dict) else history.history
    accuracy = history_values['accuracy']
    val_accuracy = history_values['val_accuracy']

    # Plot training & validation accuracy values
    plt.plot(accuracy, label='Train', linestyle='-', color='r')
    plt.plot(val_accuracy, label='Validation', linestyle='--', color='gray')
    plt.title(f'RNN Model Accuracy - {feature_set_name} - LSTM:{lstm_units} SpatialDropout:{spatial_dropout} Dropout:{dropout} RecurrentDropout:{recurrent_dropout}')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()

    # Evaluate the model and print classification report
    y_pred_encoded = np.argmax(model.predict(X_test), axis=1)
    y_pred = label_encoder.inverse_transform(y_pred_encoded)
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    formatted_report = format_classification_report(report_df)
    print(f"\nClassification Report (RNN - {feature_set_name} - LSTM:{lstm_units} SpatialDropout:{spatial_dropout} Dropout:{dropout} RecurrentDropout:{recurrent_dropout}):")
    print(formatted_report)

    # Plot and display confusion matrix.
    # Passing the encoder's classes fixes the row/column order and keeps the matrix
    # num_classes x num_classes even if a class is absent from the test data; the
    # same labels are used as ticks so the axes show class names, not positions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap=rdgy_palette, xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'RNN Confusion Matrix - {feature_set_name} - LSTM:{lstm_units} SpatialDropout:{spatial_dropout} Dropout:{dropout} RecurrentDropout:{recurrent_dropout}')

    plt.show()

    return model, history

def format_classification_report(report_df):
    """Render a classification-report DataFrame as a psql-style text table.

    NOTE(review): a function with this name is also defined in the CNN helper
    cell earlier in the notebook; whichever definition runs last is the one in
    effect, so the two should be kept equivalent (or one of them removed).

    The metric columns are formatted on a copy, so the caller's frame keeps
    its numeric values and the function can be called repeatedly on the same
    frame. (Formatting in place turned the columns into strings, which made a
    second call fail on the ``.2f`` format.)

    Args:
        report_df: DataFrame with ``precision``, ``recall``, ``f1-score`` and
            ``support`` columns (a ``KeyError`` is raised if one is missing).

    Returns:
        The value returned by ``tabulate`` for the formatted frame.
    """
    formatted = report_df.copy()

    # Two decimals for the score columns
    for column in ('precision', 'recall', 'f1-score'):
        formatted[column] = formatted[column].apply(lambda x: f"{x:.2f}")
    # Support is a count: show it as an integer
    formatted['support'] = formatted['support'].apply(lambda x: f"{int(x)}")

    # Convert to a table format using tabulate
    return tabulate(formatted, headers='keys', tablefmt='psql', showindex=True)


#---------------------------------------------------------------------------------------------------------------------------

In [23]:
#---------------------------------------------------------------------------------------------------------------------------
# Recurrent neural network ~ for BoW approach
#---------------------------------------------------------------------------------------------------------------------------

# Run settings for the BoW search: report label, cache folder, training length
feature_set_name, results_dir = "BoW", "rnn_bow_results"
epochs, batch_size = 10, 32

# Hyperparameter options for RNN
lstm_units_options = [50, 75]
spatial_dropout_options = [0.0, 0.2]
dropout_options = [0.2]
recurrent_dropout_options = [0.2]

# One record per hyperparameter combination
search_results = []

# Walk the full grid; lstm_units varies slowest and recurrent_dropout fastest
for lstm_units, spatial_dropout, dropout, recurrent_dropout in (
    (units, spatial, drop, recurrent)
    for units in lstm_units_options
    for spatial in spatial_dropout_options
    for drop in dropout_options
    for recurrent in recurrent_dropout_options
):
    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_rnn_model(
        X_train_bow, X_val_bow, X_test_bow,
        y_train, y_val, y_test,
        lstm_units, spatial_dropout, dropout, recurrent_dropout,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        lstm_units=lstm_units,
        spatial_dropout=spatial_dropout,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    settings = [result[name] for name in ('lstm_units', 'spatial_dropout', 'dropout', 'recurrent_dropout')]
    table_data.append(settings + [f"{result['best_val_accuracy']:.4f}"])

# Print the formatted table
print(tabulate(table_data, headers=['LSTM Units', 'Spatial Dropout', 'Dropout', 'Recurrent Dropout', 'Best Val Accuracy'], tablefmt='psql'))

#---------------------------------------------------------------------------------------------------------------------------

In [24]:
#---------------------------------------------------------------------------------------------------------------------------
# Recurrent neural network ~ for tf-idf approach
#---------------------------------------------------------------------------------------------------------------------------
# Run settings for the tf-idf search: report label, cache folder, training length
feature_set_name, results_dir = "tf-idf", "rnn_tfidf_results"
epochs, batch_size = 10, 32

# Hyperparameter options for RNN
lstm_units_options = [50, 75]
spatial_dropout_options = [0.0, 0.2]
dropout_options = [0.2]
recurrent_dropout_options = [0.2]

# One record per hyperparameter combination
search_results = []

# Walk the full grid; lstm_units varies slowest and recurrent_dropout fastest
for lstm_units, spatial_dropout, dropout, recurrent_dropout in (
    (units, spatial, drop, recurrent)
    for units in lstm_units_options
    for spatial in spatial_dropout_options
    for drop in dropout_options
    for recurrent in recurrent_dropout_options
):
    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_rnn_model(
        X_train_tfidf, X_val_tfidf, X_test_tfidf,
        y_train, y_val, y_test,
        lstm_units, spatial_dropout, dropout, recurrent_dropout,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        lstm_units=lstm_units,
        spatial_dropout=spatial_dropout,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    settings = [result[name] for name in ('lstm_units', 'spatial_dropout', 'dropout', 'recurrent_dropout')]
    table_data.append(settings + [f"{result['best_val_accuracy']:.4f}"])

# Print the formatted table
print(tabulate(table_data, headers=['LSTM Units', 'Spatial Dropout', 'Dropout', 'Recurrent Dropout', 'Best Val Accuracy'], tablefmt='psql'))
#---------------------------------------------------------------------------------------------------------------------------

In [25]:
#---------------------------------------------------------------------------------------------------------------------------
# Recurrent neural network ~ for embedding approach
#---------------------------------------------------------------------------------------------------------------------------
# Run settings for the embedding search: report label, cache folder, training length
feature_set_name, results_dir = "Embedding", "rnn_emb_results"
epochs, batch_size = 10, 32

# Hyperparameter options for RNN
lstm_units_options = [50, 75]
spatial_dropout_options = [0.0, 0.2]
dropout_options = [0.2]
recurrent_dropout_options = [0.2]

# One record per hyperparameter combination
search_results = []

# Walk the full grid; lstm_units varies slowest and recurrent_dropout fastest
for lstm_units, spatial_dropout, dropout, recurrent_dropout in (
    (units, spatial, drop, recurrent)
    for units in lstm_units_options
    for spatial in spatial_dropout_options
    for drop in dropout_options
    for recurrent in recurrent_dropout_options
):
    # Train (or reload) and evaluate the model for this combination
    model, history = train_and_evaluate_rnn_model(
        X_train_rnnemb, X_val_rnnemb, X_test_rnnemb,
        y_train, y_val, y_test,
        lstm_units, spatial_dropout, dropout, recurrent_dropout,
        feature_set_name, results_dir, epochs, batch_size,
    )

    # history is either a plain dict or an object exposing the dict as .history
    best_val_accuracy = max((history if isinstance(history, dict) else history.history)['val_accuracy'])

    search_results.append(dict(
        lstm_units=lstm_units,
        spatial_dropout=spatial_dropout,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        best_val_accuracy=best_val_accuracy,
    ))

# Rank the combinations, highest validation accuracy first
search_results.sort(key=lambda entry: entry['best_val_accuracy'], reverse=True)

# One table row per combination, accuracy shown with four decimals
table_data = []
for result in search_results:
    settings = [result[name] for name in ('lstm_units', 'spatial_dropout', 'dropout', 'recurrent_dropout')]
    table_data.append(settings + [f"{result['best_val_accuracy']:.4f}"])

# Print the formatted table
print(tabulate(table_data, headers=['LSTM Units', 'Spatial Dropout', 'Dropout', 'Recurrent Dropout', 'Best Val Accuracy'], tablefmt='psql'))


#---------------------------------------------------------------------------------------------------------------------------