# Strategies

This notebook aims to test a neural network using different word embedding methods.

**Word embedding methods:**

| Method       | Definition                                                                 | Advantages                                                                                     | Limitations                                                                 |
|--------------|-----------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|
| Word2Vec     | Represents words as dense vectors in a continuous space, capturing semantic relationships based on context. | Effective for capturing semantic relationships; widely used in NLP tasks.                     | Requires large datasets; struggles with out-of-vocabulary words.            |
| GloVe        | Generates word embeddings by factorizing a co-occurrence matrix, capturing both local and global semantic relationships. | Captures both local and global context; effective for text classification and sentiment analysis. | Computationally expensive; requires pre-computed co-occurrence statistics.  |
| USE (Universal Sentence Encoder) | Produces embeddings for entire sentences rather than individual words, leveraging deep learning models. | Captures sentence-level semantics; pre-trained models available for quick use.                | Higher computational cost; less effective for word-level tasks.             |
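| BERT (Bidirectional Encoder Representations from Transformers) | Produces contextual embeddings with a bidirectional Transformer encoder pre-trained on large text corpora. | Word representations depend on the surrounding context; pre-trained models are readily available and can be fine-tuned. | High computational and memory cost; inputs are limited to a fixed maximum number of tokens. |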

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import time
import warnings
from collections import Counter
import pickle
from tqdm import tqdm

# NLTK
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, make_scorer, matthews_corrcoef, balanced_accuracy_score,
                             classification_report, confusion_matrix, roc_auc_score, roc_curve)


import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Add Hugging Face transformers for BERT
try:
    from transformers import BertTokenizer, TFBertModel
except ImportError:
    print("Installing transformers library...")
    import sys
    !{sys.executable} -m pip install transformers
    from transformers import BertTokenizer, TFBertModel

print("Version:", tf.__version__)
print("GPU disponible :", tf.config.list_physical_devices('GPU'))


import mlflow
import mlflow.keras


# Global plotting defaults: ggplot theme, 12x8 figure size, DejaVu Sans font
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.family'] = 'DejaVu Sans'
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

Version: 2.16.2
GPU disponible : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Let wide frames wrap across several lines in plain-text output
pd.set_option('display.expand_frame_repr', True)

# Data preparation

## Import

In [None]:
# Location of the sampled tweets file, relative to the notebook's working directory
path_to_sample = "../data/processed_sample_tweets.csv"

# Load the sample; later cells read its 'processed_text_lem',
# 'processed_text_stem' and 'target' columns.
sample_df = pd.read_csv(path_to_sample, encoding='utf-8')
# Display the first few rows of the dataframe
sample_df.head(10)

Unnamed: 0,target,ids,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all...."


Shape of the DataFrame: (1600000, 5)
Number of unique target values: target
0    800000
4    800000
Name: count, dtype: int64
Number of unique target values after replacement: target
0    800000
1    800000
Name: count, dtype: int64


## Split

In [18]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X_lem = sample_df['processed_text_lem']
X_stem = sample_df['processed_text_stem']

y = sample_df['target']

# Lemmatized text: 80/20 split into (train + validation) / test, then 75/25 into
# train / validation, i.e. 60/20/20 overall. Both splits are stratified on the label.
X_temp_lem, X_test_lem, y_temp_lem, y_test_lem = train_test_split(
    X_lem, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
X_train_lem, X_val_lem, y_train_lem, y_val_lem = train_test_split(
    X_temp_lem, y_temp_lem,
    test_size=0.25,  # 0.25 x 0.8 = 0.2
    random_state=42,
    stratify=y_temp_lem
)

# Display the shapes of the resulting datasets
print(f"X_train_lem shape: {X_train_lem.shape}")
print(f"X_val_lem shape: {X_val_lem.shape}")
print(f"X_test_lem shape: {X_test_lem.shape}")
print(f"y_train_lem shape: {y_train_lem.shape}")
print(f"y_val_lem shape: {y_val_lem.shape}")
print(f"y_test_lem shape: {y_test_lem.shape}")

# Stemmed text: same training / validation / test procedure and the same seed,
# so both text variants are split over identical rows.
X_temp_stem, X_test_stem, y_temp_stem, y_test_stem = train_test_split(
    X_stem, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
X_train_stem, X_val_stem, y_train_stem, y_val_stem = train_test_split(
    X_temp_stem, y_temp_stem,
    test_size=0.25,  # 0.25 x 0.8 = 0.2
    random_state=42,
    stratify=y_temp_stem
)


# Display the shapes of the resulting datasets
print(f"X_train_stem shape: {X_train_stem.shape}")
print(f"X_val_stem shape: {X_val_stem.shape}")
print(f"X_test_stem shape: {X_test_stem.shape}")
print(f"y_train_stem shape: {y_train_stem.shape}")
# Fixed: this line previously printed X_val_stem a second time under the wrong label
print(f"y_val_stem shape: {y_val_stem.shape}")
print(f"y_test_stem shape: {y_test_stem.shape}")

# Shapes of the lemmatized splits, kept for later reference
X_train_shape = X_train_lem.shape
X_val_shape = X_val_lem.shape
X_test_shape = X_test_lem.shape

X_train_lem shape: (30000,)
X_test_lem shape: (10000,)
y_train_lem shape: (30000,)
y_test_lem shape: (10000,)
X_train_stem shape: (30000,)
X_val_stem shape: (10000,)
X_test_stem shape: (10000,)
y_train_stem shape: (30000,)
X_val_stem shape: (10000,)
y_test_stem shape: (10000,)


## MLFlow setup

In [9]:
import os
from dotenv import load_dotenv
import time

import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient


from tqdm import tqdm


# Enable MLflow autologging for scikit-learn estimators
mlflow.sklearn.autolog()

# Configuring MLflow: load variables from a .env file, then read the tracking server URI
load_dotenv()
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
# NOTE(review): tracking_uri is None when MLFLOW_TRACKING_URI is unset — confirm the intended fallback
mlflow.set_tracking_uri(tracking_uri)
print(f"MLflow Tracking URI: {tracking_uri}")

MLflow Tracking URI: http://localhost:8080


In [10]:
# Activate the experiment that subsequent runs are recorded under
# (MLflow creates it when it does not exist yet)
mlflow.set_experiment("P7-Sentiments_Analysis")

<Experiment: artifact_location='mlflow-artifacts:/474291672324820532', creation_time=1757079621462, experiment_id='474291672324820532', last_update_time=1757079621462, lifecycle_stage='active', name='P7-Sentiments_Analysis', tags={}>

# Functions

In [None]:
class TweetVectorizer:
    """
    Cleans tweets and converts them into fixed-size embedding vectors.

    Supported values for ``vectoriser``:
        'w2v'      -- gensim Word2Vec trained in ``fit_transform``; a tweet is the mean of its word vectors.
        'fasttext' -- gensim FastText trained in ``fit_transform``; a tweet is the mean of its word vectors.
        'use'      -- Universal Sentence Encoder v4 loaded from TF Hub.
        'bert'     -- 'bert-base-uncased'; a tweet is the last-layer hidden state of its first token.

    Args:
        preprocessor: 'lemmatization', 'stemming' or None (tokens are left untouched).
        vectoriser: One of 'w2v', 'fasttext', 'use', 'bert'.
        max_length: Token length every tweet is padded/truncated to before being sent to BERT.
        vector_size: Dimension of the Word2Vec / FastText vectors.
        batch_size: Number of tweets sent to USE / BERT per forward pass.

    Raises:
        ValueError: If ``vectoriser`` is not one of the supported values.
    """

    # Negations carry sentiment, so they are never dropped with the other stop words.
    _NEGATIVE_WORDS = frozenset({
        'no', 'not', 'nor', "don't", "aren't", "couldn't", "didn't", "doesn't",
        "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
        "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't",
        "never", "none", "nobody", "nothing", "nowhere", "neither"
    })
    _UNSUPPORTED_MESSAGE = "Unsupported vectoriser. Choose from 'w2v', 'fasttext', 'use', or 'bert'."

    def __init__(self, preprocessor: str = None, vectoriser: str = 'w2v',
                 max_length: int = 64, vector_size: int = 100, batch_size: int = 32):
        self.preprocessor = preprocessor
        self.vectoriser = vectoriser
        # Previously read by the BERT branches but never assigned (AttributeError).
        self.max_length = max_length
        self.vector_size = vector_size
        self.batch_size = batch_size
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self._stop_words = None  # built lazily by _stop_words_to_remove()

        # Initialize vectorizers
        if vectoriser in ('w2v', 'fasttext'):
            # Trained on the corpus given to fit_transform; None marks "not fitted yet".
            self.model = None
        elif vectoriser == 'use':
            import tensorflow_hub as hub
            self.model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        elif vectoriser == 'bert':
            # Initialize BERT tokenizer and model
            from transformers import BertTokenizer, TFBertModel
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.model = TFBertModel.from_pretrained('bert-base-uncased')
        else:
            raise ValueError(self._UNSUPPORTED_MESSAGE)

    def preprocess(self, text: str) -> str:
        """
        Cleans and preprocesses a single text string by replacing URLs, mentions, and hashtags,
        converting to lowercase, removing special characters, optionally lemmatizing or
        stemming, and removing stopwords (negations are kept).

        Args:
            text (str): The text string to process.

        Returns:
            str: The processed tokens joined by single spaces ("" for a missing value).
        """
        if pd.isna(text):
            return ""
        # Replace URLs with <URL>
        processed = re.sub(r'https?://\S+', '<URL>', text)
        # Replace mentions with <MENTION>
        processed = re.sub(r'@[A-Za-z0-9_]+', '<MENTION>', processed)
        # Keep the '#' and replace the hashtag word with <HASHTAG>
        processed = re.sub(r'#([A-Za-z0-9_]+)', r'#<HASHTAG>', processed)

        # Convert text to lowercase (this also lowercases the placeholders)
        processed = processed.lower()

        # Drop every character except letters, digits, whitespace and . ! ? < > #
        processed = re.sub(r'[^a-z0-9\s.!?<>#]', '', processed)

        # Tokenize the text
        tokens = word_tokenize(processed)

        # Optional normalisation of each token
        if self.preprocessor == 'lemmatization':
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        if self.preprocessor == 'stemming':
            tokens = [self.stemmer.stem(token) for token in tokens]

        # Remove stopwords, and join back to string
        stop_words_to_remove = self._stop_words_to_remove()
        filtered_tokens = [word for word in tokens if word not in stop_words_to_remove]

        return ' '.join(filtered_tokens)

    def fit_transform(self, texts: pd.Series):
        """
        Fits the vectorizer model (if applicable) and transforms the input texts into vectors.

        Word2Vec and FastText are trained from scratch on ``texts``; USE and BERT are
        pre-trained, so nothing is fitted for them.

        Args:
            texts (pd.Series): Series of text strings to fit and transform.

        Returns:
            np.ndarray: Array of vectorized texts, one row per input text.

        Raises:
            ValueError: If the configured vectoriser is not supported.
        """
        processed_texts = texts.apply(self.preprocess).tolist()

        if self.vectoriser in ('w2v', 'fasttext'):
            from gensim.models import FastText, Word2Vec
            model_class = Word2Vec if self.vectoriser == 'w2v' else FastText
            tokenized_texts = [text.split() for text in processed_texts]
            # Passing the corpus to the constructor builds the vocabulary and trains in
            # one step. The previous FastText path called build_vocab(update=True) on a
            # model that had no vocabulary yet, which gensim rejects.
            self.model = model_class(
                sentences=tokenized_texts,
                vector_size=self.vector_size,
                window=5,
                min_count=1,
                workers=4,
            )

        return self._embed(processed_texts)

    def transform(self, texts: pd.Series):
        """
        Transforms the input texts into vectors using the fitted vectorizer model.

        Args:
            texts (pd.Series): Series of text strings to transform.

        Returns:
            np.ndarray: Array of vectorized texts, one row per input text.

        Raises:
            ValueError: If a Word2Vec/FastText model has not been fitted yet, or the
                configured vectoriser is not supported.
        """
        processed_texts = texts.apply(self.preprocess).tolist()
        return self._embed(processed_texts)

    def _stop_words_to_remove(self):
        """Returns the English stop words minus the negations, built once per instance."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = set(stopwords.words('english')) - self._NEGATIVE_WORDS
            self._stop_words = cached
        return cached

    def _embed(self, processed_texts):
        """Dispatches already-preprocessed texts to the configured embedding back-end."""
        if self.vectoriser in ('w2v', 'fasttext'):
            if self.model is None:
                label = 'Word2Vec' if self.vectoriser == 'w2v' else 'FastText'
                raise ValueError(f"The {label} model has not been fitted. Call fit_transform first.")
            return self._mean_word_vectors(processed_texts)
        if self.vectoriser == 'use':
            return self._embed_in_batches(processed_texts, lambda batch: self.model(batch).numpy())
        if self.vectoriser == 'bert':
            return self._embed_in_batches(processed_texts, self._bert_cls_embeddings)
        raise ValueError(self._UNSUPPORTED_MESSAGE)

    def _mean_word_vectors(self, processed_texts):
        """Averages the vectors of the known words of each text (zeros when none is known)."""
        word_vectors = self.model.wv
        rows = []
        for text in processed_texts:
            known = [word_vectors[word] for word in text.split() if word in word_vectors]
            rows.append(np.mean(known, axis=0) if known else np.zeros(self.vector_size))
        if not rows:
            return np.empty((0, self.vector_size))
        return np.array(rows)

    def _embed_in_batches(self, processed_texts, embed_batch):
        """
        Applies ``embed_batch`` to slices of ``batch_size`` texts and stacks the results,
        so the whole corpus is never pushed through the network in one forward pass.
        """
        if not processed_texts:
            # Nothing to embed; the embedding width is unknown without a forward pass.
            return np.empty((0, 0))
        step = max(1, int(self.batch_size))
        chunks = [
            embed_batch(processed_texts[start:start + step])
            for start in range(0, len(processed_texts), step)
        ]
        return np.concatenate(chunks, axis=0)

    def _bert_cls_embeddings(self, batch):
        """Encodes one batch with BERT and returns the first-token hidden state per text."""
        encoded_inputs = self.tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='tf'
        )
        outputs = self.model(encoded_inputs)
        # First token of the last hidden layer is used as the sentence representation
        return outputs.last_hidden_state[:, 0, :].numpy()

In [13]:
def plot_confusion_matrix(cm, classes_name: list, title: str = 'Confusion matrix'):
    """
    Plots a binary confusion matrix using a seaborn heatmap.

    Args:
        cm: 2x2 confusion matrix of integer counts, laid out as [[tn, fp], [fn, tp]].
        classes_name: The two class names, in the same order as the matrix rows/columns.
        title: Title of the plot.

    Returns:
        fig: The matplotlib figure object containing the confusion matrix plot.

    Raises:
        ValueError: If ``cm`` is not 2x2 or ``classes_name`` does not hold two names.
    """
    cm = np.asarray(cm)
    if cm.shape != (2, 2):
        raise ValueError(f"Expected a 2x2 confusion matrix, got shape {cm.shape}.")
    if len(classes_name) != 2:
        raise ValueError(f"Expected two class names, got {len(classes_name)}.")

    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Real label')
    ax.set_title(f"{title}\nSpecificity: {specificity:.4f} | Sensitivity: {sensitivity:.4f}")

    # Add text annotations for each cell of the matrix below the plot
    ax.text(0.05, -0.1, f"True Negatives: {tn}", transform=ax.transAxes, fontsize=9)
    ax.text(0.05, -0.15, f"False Positives: {fp}", transform=ax.transAxes, fontsize=9)
    ax.text(0.55, -0.1, f"False Negatives: {fn}", transform=ax.transAxes, fontsize=9)
    ax.text(0.55, -0.15, f"True Positives: {tp}", transform=ax.transAxes, fontsize=9)

    # The heatmap has exactly one tick per class, so pass exactly one label per tick.
    # The previous leading ' ' placeholder supplied three labels for two ticks.
    ax.set_xticklabels(list(classes_name))
    ax.set_yticklabels(list(classes_name))

    plt.tight_layout()
    return fig

In [None]:
from sklearn.model_selection import learning_curve, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix)
import mlflow
import mlflow.sklearn
import numpy as np
import time
import matplotlib.pyplot as plt  # fixed typo: was "mqtplotlib"

class TweetClassifier:
    """
    Logistic-regression baseline for binary tweet sentiment.

    Args:
        vectorizer: Name of the vectorizer that produced the features (stored for reporting only).
        preprocessor: Name of the text pre-processing that was applied (stored for reporting only).
    """

    def __init__(self, vectorizer:str = "TF-IDF", preprocessor: str = "lemmatization"):
        # `max_features` is not a LogisticRegression parameter (passing it raised
        # TypeError), so it is no longer forwarded to the estimator.
        self.model = LogisticRegression(max_iter=1000, solver="lbfgs", C=1.0)
        self.vectorizer_name = vectorizer
        self.preprocessor = preprocessor

    def train(self, X_train, y_train):
        """Fits the logistic regression on already-vectorized features."""
        self.model.fit(X_train, y_train)

    def predict(self, X):
        """Returns the predicted class label for each row of ``X``."""
        return self.model.predict(X)

    def evaluate(self, X, y):
        """
        Scores the fitted model on ``X`` against the true labels ``y``.

        Returns:
            tuple: (accuracy, precision, recall, f1, roc_auc, confusion_matrix, y_pred).
        """
        y_pred = self.predict(X)
        acc = accuracy_score(y, y_pred)
        prec = precision_score(y, y_pred)
        rec = recall_score(y, y_pred)
        f1 = f1_score(y, y_pred)
        # ROC AUC needs a ranking score, so use the positive-class probability
        # instead of the hard 0/1 predictions.
        y_score = self.model.predict_proba(X)[:, 1]
        roc_auc = roc_auc_score(y, y_score)
        cm = confusion_matrix(y, y_pred)
        return acc, prec, rec, f1, roc_auc, cm, y_pred

In [14]:
def evaluate_model(model, X_test: pd.Series, y_test: pd.Series, classes_name: list = ['Negative', 'Positive']):
    """
    Evaluates a binary neural-network classifier using various metrics.

    Args:
        model: The trained model; ``model.predict`` must return one positive-class
            probability per sample.
        X_test: The test features.
        y_test: The true labels (0/1) for the test set.
        classes_name: Class names used in the classification report, negative class first.

    Returns:
        metrics: Dictionary containing evaluation metrics.
        cm: 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].
        report: Classification report as text.
        y_pred: Predicted labels for the test set (list of 0/1).
        y_pred_prob: Predicted probabilities exactly as returned by ``model.predict``.
    """
    # Predict probabilities, then threshold at 0.5 to obtain labels
    y_pred_prob = model.predict(X_test)
    # Always derive a flat score array; it used to be defined only when the
    # prediction happened to be an ndarray, leaving y_score unbound otherwise.
    y_score = np.asarray(y_pred_prob).ravel()
    y_pred = (y_score > 0.5).astype("int32").tolist()

    # Fix the label order so the matrix stays 2x2 even if one class is never seen
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate specificity (true negative rate)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Calculate other metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)  # Same as sensitivity/true positive rate
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    # Balanced accuracy is the average of sensitivity and specificity
    balanced_acc = (recall + specificity) / 2

    # Compile metrics into a dictionary
    metrics = {
        'Accuracy': accuracy,
        'Balanced_Accuracy': balanced_acc,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1_Score': f1,
        'ROC_AUC': roc_auc
    }

    # Generate classification report
    report = classification_report(y_test, y_pred, labels=[0, 1], target_names=classes_name)

    return metrics, cm, report, y_pred, y_pred_prob

In [15]:
def log_model_mlflow(model, model_name: str, vectorizer_name: str, metrics: dict, cm, report: str, X_test: pd.Series, y_test: pd.Series, classes_name: list = ['Negative', 'Positive']):
    """
    Logs a Keras model, its metrics and its evaluation artifacts to a new MLflow run.

    When another run is already active, the new run is created as a nested child of it.

    Args:
        model: The trained Keras model to log.
        model_name: The name of the model being logged.
        vectorizer_name: The name of the vectorizer used for feature extraction.
        metrics: A dictionary of evaluation metrics to log.
        cm: Confusion matrix to log as a figure.
        report: Classification report to log as a text artifact.
        X_test: The test features used for inferring the model signature.
        y_test: The test labels used for inferring the model signature.
        classes_name: List of class names for the confusion matrix.

    Returns:
        run_id: The MLflow run ID of the run created by this call.
    """
    # start_run refuses to open a second top-level run while one is active,
    # so nest under the caller's run when there is one.
    has_parent_run = mlflow.active_run() is not None

    with mlflow.start_run(run_name=f"Advanced_model_{model_name}_{vectorizer_name}", nested=has_parent_run) as run:
        # Save metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Plot and log confusion matrix
        fig_cm = plot_confusion_matrix(cm, classes_name, title=f'Confusion Matrix - {model_name} with {vectorizer_name}')
        mlflow.log_figure(fig_cm, f"Confusion Matrix - {model_name} with {vectorizer_name}.png")
        plt.close(fig_cm)

        # Log classification report as a text file
        report_path = f"content/advanced_model/{model_name}_{vectorizer_name}/classification_report.txt"
        os.makedirs(os.path.dirname(report_path), exist_ok=True)
        with open(report_path, 'w') as f:
            f.write(report)
        mlflow.log_artifact(report_path)

        # Infer the model signature
        # NOTE(review): the output schema is inferred from the labels, not from the
        # model's predicted probabilities — confirm this is the intended signature.
        signature = infer_signature(X_test, y_test)
        # Log the model with its signature
        mlflow.keras.log_model(model, artifact_path=f"{model_name}_{vectorizer_name}", signature=signature)

        # Also upload everything saved locally for this model/vectorizer pair
        artifact_dir = f"content/advanced_model/{model_name}_{vectorizer_name}"
        if os.path.exists(artifact_dir):
            mlflow.log_artifacts(artifact_dir, "local_artifacts")

    # Use the run object captured above: once the `with` block exits this run is
    # closed, so mlflow.active_run() is None (or the parent run, with another id).
    return run.info.run_id

In [16]:
def plot_training_history(history, model_name: str, vectorizer_name: str, run_id: str):
    """
    Plots the training and validation accuracy and loss from a Keras history object,
    saves the figure locally and logs it to MLflow.

    Args:
        history: Keras History object; ``history.history`` must contain 'accuracy',
            'val_accuracy', 'loss' and 'val_loss'.
        model_name: The name of the model being evaluated.
        vectorizer_name: The name of the vectorizer used for feature extraction.
        run_id: The MLflow run ID to log the plot to. When empty/None a new run is started.

    Returns:
        fig: The matplotlib figure object containing the accuracy and loss plots
            (already closed, so it is not rendered a second time by the notebook).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Plot accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy - ' + model_name + ' with ' + vectorizer_name)
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Accuracy')
    ax1.legend()

    # Plot loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss - ' + model_name + ' with ' + vectorizer_name)
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Loss')
    ax2.legend()

    plt.tight_layout()

    # Save file locally
    plot_path = f"content/advanced_model/{model_name}_{vectorizer_name}/training_history.png"
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)
    fig.savefig(plot_path)

    # Log the figure to MLflow
    artifact_name = f"Training History - {model_name} with {vectorizer_name}.png"
    active_run = mlflow.active_run()
    try:
        if run_id and active_run is not None and active_run.info.run_id == run_id:
            # The target run is already the active one: log straight into it.
            mlflow.log_figure(fig, artifact_name)
        else:
            if run_id:
                start_kwargs = {'run_id': run_id}
            else:
                start_kwargs = {'run_name': f"Advanced_model_{model_name}_{vectorizer_name}"}
            # nested=True is required to open a run while another one is active.
            with mlflow.start_run(nested=active_run is not None, **start_kwargs):
                mlflow.log_figure(fig, artifact_name)
    finally:
        plt.close(fig)

    return fig

In [None]:
def plot_metrics_comparison(metrics_list, model_names, metric_name='Specificity'):
    """
    Draws a bar chart of one metric for several models.

    Args:
        metrics_list: One metrics dictionary per model.
        model_names: Model names/identifiers, in the same order as ``metrics_list``.
        metric_name: Key of the metric to compare (default is 'Specificity');
            a model without that key is drawn with a value of 0.

    Returns:
        fig: The matplotlib figure object containing the metrics comparison
    """
    # One score per model, falling back to 0 when the metric is absent
    scores = [model_metrics.get(metric_name, 0) for model_metrics in metrics_list]

    fig, ax = plt.subplots(figsize=(12, 6))
    rectangles = ax.bar(model_names, scores, color='skyblue')

    # Write each score just above its bar
    for rectangle, score in zip(rectangles, scores):
        x_center = rectangle.get_x() + rectangle.get_width()/2
        y_label = rectangle.get_height() + 0.01
        ax.text(x_center, y_label, f'{score:.4f}', ha='center', va='bottom')

    # Axis labels and title
    ax.set_xlabel('Models')
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} Comparison Across Models')

    # Dashed reference line at 0.8
    ax.axhline(y=0.8, color='r', linestyle='--', alpha=0.3)

    plt.tight_layout()
    return fig

# Training setup

In [None]:
def compare_models_by_metrics(metrics_dict, metric_names=None):
    """
    Compare multiple models based on specified metrics.

    Args:
        metrics_dict: Dictionary where keys are model identifiers and values are metric dictionaries
        metric_names: List of metric names to compare (defaults to all metrics in the first model)

    Returns:
        comparison_df: DataFrame indexed by 'Model' with one column per metric; a metric
            a model does not have is left missing. An empty ``metrics_dict`` yields an
            empty frame instead of raising.
    """
    # If no metric names are specified, use all metrics from the first model
    if metric_names is None:
        first_model_metrics = next(iter(metrics_dict.values()), {})
        metric_names = list(first_model_metrics)
    # Preserve the requested order while dropping repeated names
    metric_names = list(dict.fromkeys(metric_names))

    rows = [
        {'Model': model_name, **{metric: metrics.get(metric) for metric in metric_names}}
        for model_name, metrics in metrics_dict.items()
    ]

    # Naming the columns explicitly keeps 'Model' available even when there are
    # no rows, so set_index no longer fails on an empty comparison.
    comparison_df = pd.DataFrame(rows, columns=['Model', *metric_names])

    return comparison_df.set_index('Model')

In [None]:
# One experiment per (embedding, text pre-processing) pair.
# Each embedding fixes the network type and the embedding width:
#   w2v / fasttext -> 'rnn', 100-dimensional word vectors
#   use            -> 'dense', 512-dimensional sentence vectors
#   bert           -> 'dense', 768-dimensional BERT base vectors
config = [
    {
        'vectorizer_name': embedding,
        'preprocessor': text_variant,
        'model_type': network,
        'embedding_dim': width
    }
    for embedding, network, width in (
        ('w2v', 'rnn', 100),
        ('fasttext', 'rnn', 100),
        ('use', 'dense', 512),
        ('bert', 'dense', 768),
    )
    for text_variant in ('lemmatization', 'stemming')
]

In [None]:
# For Word2Vec and FastText (sequence-based embeddings)
def create_rnn_model(input_shape, embedding_dim=100):
    """
    Builds an (uncompiled) bidirectional-LSTM binary classifier.

    The flat input of width ``input_shape`` is reshaped into a sequence of
    ``input_shape // embedding_dim`` steps of ``embedding_dim`` features each.

    Args:
        input_shape: Number of features per sample (an int, not a tuple).
        embedding_dim: Width of one sequence step.

    Returns:
        An uncompiled ``keras.Sequential`` model with a single sigmoid output.

    Raises:
        ValueError: If ``input_shape`` is not a positive multiple of ``embedding_dim``.
    """
    if embedding_dim <= 0 or input_shape <= 0 or input_shape % embedding_dim != 0:
        raise ValueError(
            f"input_shape ({input_shape}) must be a positive multiple of embedding_dim ({embedding_dim})."
        )

    # NOTE(review): when input_shape == embedding_dim the sequence has a single
    # step, so the LSTM layers see no temporal context.
    model = keras.Sequential([
        # keras.Input replaces InputLayer(input_shape=...), whose argument is deprecated in Keras 3
        keras.Input(shape=(input_shape,)),
        keras.layers.Reshape((-1, embedding_dim)),  # Reshape to sequence format
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(32)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

# For USE and BERT (already contextual)
def create_dense_model(input_shape):
    """
    Builds an (uncompiled) feed-forward binary classifier for sentence embeddings.

    Args:
        input_shape: Number of features per sample (an int, not a tuple).

    Returns:
        An uncompiled ``keras.Sequential`` model with a single sigmoid output.
    """
    model = keras.Sequential([
        # keras.Input replaces InputLayer(input_shape=...), whose argument is deprecated in Keras 3
        keras.Input(shape=(input_shape,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

# Classification head for pre-computed BERT embeddings
def create_bert_model(input_shape, fine_tune=False):
    """
    Creates a small classification head that is trained on pre-computed BERT embeddings.

    Args:
        input_shape: Number of features per sample (an int, not a tuple).
        fine_tune: Currently ignored. BERT itself is not part of this model, so its
            weights cannot be fine-tuned here; the parameter is kept so existing
            callers keep working.

    Returns:
        An uncompiled ``keras.Sequential`` model with a single sigmoid output
        (the caller is responsible for calling ``compile``).
    """
    # A shallow head is used because the embeddings already carry rich information
    model = keras.Sequential([
        # keras.Input replaces InputLayer(input_shape=...), whose argument is deprecated in Keras 3
        keras.Input(shape=(input_shape,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.2),  # Lower dropout for BERT
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [None]:
# Text and label splits for each pre-processing flavour, keyed like config['preprocessor'].
# Each value is ((X_train, y_train), (X_val, y_val), (X_test, y_test)).
splits_by_preprocessor = {
    'lemmatization': (
        (X_train_lem, y_train_lem),
        (X_val_lem, y_val_lem),
        (X_test_lem, y_test_lem),
    ),
    'stemming': (
        (X_train_stem, y_train_stem),
        (X_val_stem, y_val_stem),
        (X_test_stem, y_test_stem),
    ),
}

# Train, evaluate and log one network per entry of `config`
for test in config:
    with mlflow.start_run(run_name=f"Advanced_model_{test['model_type']}_{test['vectorizer_name']}", nested=False):
        vectorizer_name = test['vectorizer_name']
        preprocessor = test['preprocessor']
        print(f"Processing with vectorizer: {vectorizer_name} and preprocessor: {preprocessor}")

        if preprocessor not in splits_by_preprocessor:
            raise ValueError("Unsupported preprocessor. Choose from 'lemmatization' or 'stemming'.")
        (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = \
            splits_by_preprocessor[preprocessor]

        # Save configuration as tags in MLflow
        mlflow.set_tag("model_type", test['model_type'])
        mlflow.set_tag("vectorizer", vectorizer_name)
        mlflow.set_tag("preprocessor", preprocessor)

        # Build the vectorizer. The class defined above is TweetVectorizer;
        # the previous name, TweetPreprocessor, is not defined anywhere.
        preprocessor_instance = TweetVectorizer(preprocessor=preprocessor, vectoriser=vectorizer_name)

        # Fit on the training texts only, then reuse the fitted model for validation and test
        X_train_vectors = preprocessor_instance.fit_transform(train_texts)
        X_val_vectors = preprocessor_instance.transform(val_texts)
        X_test_vectors = preprocessor_instance.transform(test_texts)

        # Model selection based on embedding type
        if test['model_type'] == 'rnn':
            model = create_rnn_model(X_train_vectors.shape[1], test['embedding_dim'])
        elif vectorizer_name == 'bert':
            model = create_bert_model(X_train_vectors.shape[1], fine_tune=False)
        else:
            model = create_dense_model(X_train_vectors.shape[1])

        # Stop when the validation loss stalls and halve the learning rate on plateaus
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=2
            )
        ]

        # Compile the model
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # Training with callbacks
        history = model.fit(
            X_train_vectors,
            train_labels,
            epochs=20,  # Upper bound; early stopping usually ends training sooner
            batch_size=32,
            validation_data=(X_val_vectors, val_labels),
            callbacks=callbacks
        )

        # Evaluate the model
        metrics, cm, report, y_pred, y_pred_prob = evaluate_model(
            model,
            X_test_vectors,
            test_labels,
            classes_name=['Negative', 'Positive']
        )

        print(f"Evaluation Metrics: {metrics}")
        print(f"Classification Report:\n{report}")

        run_id = log_model_mlflow(
            model,
            f"NN_{test['model_type']}",
            vectorizer_name,
            metrics,
            cm,
            report,
            X_test_vectors,
            test_labels
        )

        # Log training history
        plot_training_history(history, f"NN_{test['model_type']}", vectorizer_name, run_id)

## Word2Vec