## Setup

Load Libraries:

In [2]:
import os
import gc
import numpy as np
import pandas as pd
from typing import Optional, Union

import tensorflow_hub as hub

import torch
from transformers import AutoTokenizer, AutoModel


from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, PredefinedSplit
from sklearn.metrics import (
    f1_score, 
    accuracy_score, 
    confusion_matrix,
    precision_recall_curve,
    ConfusionMatrixDisplay,
    balanced_accuracy_score, 
    average_precision_score
) 

# NOTE(review): this assignment runs after `import tensorflow_hub` above. If TensorFlow reads
# the variable when it is first imported, setting it here has no effect (INFO-level TensorFlow
# lines still appear in a later cell's output) — TODO confirm and move it above the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # set to 1 to avoid tensorflow warnings (NOTE: the value assigned here is '3', not 1 — TODO confirm the intended level)

## Load Data

In [3]:
# Read the pickled train and test frames from the local data directory
train_data, test_data = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('./data/train_data.pkl', './data/test_data.pkl')
)

# Print the schema summary, then preview the first rows of the training frame
train_data.info()
train_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5452 entries, 0 to 5451
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      5452 non-null   object
 1   category      5452 non-null   object
 2   sub_category  5452 non-null   object
 3   cleaned       5452 non-null   object
 4   wordlengths   5452 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 213.1+ KB


Unnamed: 0,question,category,sub_category,cleaned,wordlengths
0,How did serfdom develop in and then leave Russ...,DESC,manner,serfdom develop leave russia,10
1,What films featured the character Popeye Doyle ?,ENTY,cremat,film feature character popeye doyle,8
2,How can I find a list of celebrities ' real na...,DESC,manner,list celebrity real name,12
3,What fowl grabs the spotlight after the Chines...,ENTY,animal,fowl grab spotlight chinese year monkey,13
4,What is the full form of .com ?,ABBR,exp,form com,8


## Create Embedding Transformer

We will now create a class for generating embeddings from text data using pre-trained language models. Specifically, we will use two language models:

1. Bidirectional Encoder Representations from Transformers (BERT) Model by Google (read more [here](https://huggingface.co/google-bert/bert-base-uncased)).
2. google/nnlm: Token based text embeddings trained on various Google News datasets (read more [here](https://www.kaggle.com/models/google/nnlm/frameworks/tensorFlow2/variations/en-dim50/versions/1?tfhub-redirect=true)).

In [4]:
class EmbeddingTransformer(TransformerMixin, BaseEstimator):
    """
    A custom transformer for generating embeddings from text data using pre-trained language models.

    Two backends are supported: a TensorFlow Hub NNLM model (selected when the
    NNLM marker substring occurs in ``model_name``) and any Hugging Face
    Transformers model loadable through ``AutoTokenizer`` / ``AutoModel``.
    The model is loaded eagerly in the constructor.

    Args:
        model_name (str): The name of the pre-trained language model to use for generating embeddings.
                          If "nnlm-en-dim50" is in the model_name, the TensorFlow Hub NNLM model is used.
                          Otherwise, a Hugging Face Transformers model will be used.
        max_length (int): Maximum number of tokens kept per text by the Hugging Face tokenizer
                          (longer inputs are truncated). Defaults to 22, the value that was
                          previously hard-coded. Ignored by the NNLM backend.

    Raises:
        ValueError: If ``model_name`` is not a string.
    """

    # Substring of `model_name` that routes to the TensorFlow Hub backend.
    _NNLM_MARKER = "nnlm-en-dim50"
    # NOTE(review): the marker says "dim50" but this URL points at the en-dim128 variant.
    # Behaviour is kept as-is because existing callers label this transformer as dim128;
    # confirm which variant is intended and align the marker, URL and docs.
    _NNLM_URL = "https://www.kaggle.com/models/google/nnlm/frameworks/TensorFlow2/variations/en-dim128/versions/1"

    def __init__(self, model_name: str = "bert-base-uncased", max_length: int = 22):
        self.model_name = model_name
        self.max_length = max_length
        self._validate_model_name()

        if self._uses_nnlm():
            self.embed = hub.load(self._NNLM_URL)
        else:
            self._setup_transformers_model()

    def _uses_nnlm(self) -> bool:
        """Return True when `model_name` selects the TensorFlow Hub NNLM backend."""
        return self._NNLM_MARKER in self.model_name

    def _validate_model_name(self):
        """Check if the specified model_name is valid."""
        if not isinstance(self.model_name, str):
            raise ValueError("model_name must be a string")

    def _setup_transformers_model(self):
        """Load the Hugging Face Transformers model and set up the appropriate device."""
        if torch.cuda.is_available():
            # Keep the original preference for the second GPU, but fall back to the
            # first one so single-GPU hosts do not fail with an invalid device ordinal.
            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
            self.device = torch.device(f'cuda:{gpu_index}')
        else:
            self.device = torch.device('cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)

    def fit(self, X: Union[pd.Series, pd.DataFrame], y=None):
        """
        Fit the transformer (required for scikit-learn compatibility).

        Args:
            X (Union[pd.Series, pd.DataFrame]): The input data (not used for this transformer).
            y (optional): The target data (not used for this transformer).

        Returns:
            self: The EmbeddingTransformer instance.
        """
        return self

    def transform(self, X: Union[pd.Series, pd.DataFrame]):
        """
        Generate embeddings from input text data using the specified pre-trained language model.

        Args:
            X (Union[pd.Series, pd.DataFrame]): The input text data. A DataFrame must
                contain exactly one column, which is used as the text column.

        Returns:
            pd.DataFrame: A Pandas DataFrame containing the generated embeddings,
                one row per input text, with a default integer index.

        Raises:
            ValueError: If X is not a Series/DataFrame, or is a DataFrame whose
                column count is not exactly one.
        """
        self._validate_input_data(X)

        if self._uses_nnlm():
            return self._transform_with_nnlm(X)
        else:
            return self._transform_with_transformers(X)

    def _validate_input_data(self, X: Union[pd.Series, pd.DataFrame]):
        """Check if the input data is valid."""
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            raise ValueError("Input data must be a Pandas Series or DataFrame")
        if isinstance(X, pd.DataFrame) and X.shape[1] != 1:
            raise ValueError("Input DataFrame must contain exactly one text column")

    @staticmethod
    def _to_text_list(X: Union[pd.Series, pd.DataFrame]) -> list:
        """Return the texts as a plain list (DataFrame has no `tolist`, so use its only column)."""
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]
        return X.tolist()

    def _transform_with_nnlm(self, X: Union[pd.Series, pd.DataFrame]):
        """Generate embeddings using the TensorFlow Hub NNLM model."""
        text_data = self._to_text_list(X)
        embeddings = self.embed(text_data).numpy()
        return pd.DataFrame(embeddings)

    def _transform_with_transformers(self, X: Union[pd.Series, pd.DataFrame]):
        """Generate embeddings using a Hugging Face Transformers model."""
        batch_text = self._to_text_list(X)
        # All texts are encoded and run through the model as a single batch.
        inputs = self.tokenizer.batch_encode_plus(
            batch_text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_length
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Hidden state at the first token position of each sequence
            # (presumably the [CLS] token for BERT-style models).
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        return pd.DataFrame(embeddings)

## create DenseTransformer class to convert sparse matrix to dense matrix
class DenseTransformer(TransformerMixin, BaseEstimator):
    """
    Pipeline step that converts a sparse feature matrix into a dense array.

    Inherits from ``BaseEstimator`` (placed last, after the mixin) so the step
    exposes ``get_params`` / ``set_params`` like other pipeline components.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` for sparse input; pass anything else through unchanged."""
        # Objects without `toarray` are assumed to be dense already, so they are
        # returned as-is instead of raising AttributeError.
        return X.toarray() if hasattr(X, "toarray") else X

## Initialize the vectorizers

In [5]:
# Initialize the vectorizers
# The two bag-of-words vectorizers share one configuration; the embedding
# transformers are keyed by a model label.
# NOTE(review): the last entry is labelled 'nnlm-en-dim128' but constructed with
# 'nnlm-en-dim50'; EmbeddingTransformer routes on the 'nnlm-en-dim50' substring and
# loads the en-dim128 TF-Hub URL — confirm the intended variant.
default_vectorizers = {
    **{
        label: vectorizer_cls(stop_words='english', ngram_range=(1, 2), max_features=1024)
        for label, vectorizer_cls in (
            ('CountVectorizer', CountVectorizer),
            ('TfidfVectorizer', TfidfVectorizer),
        )
    },
    'bert-base-uncased': EmbeddingTransformer('bert-base-uncased'),
    'nnlm-en-dim128': EmbeddingTransformer('nnlm-en-dim50'),
}

2024-03-10 12:34:59.800380: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1073 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:17:00.0, compute capability: 8.6
2024-03-10 12:34:59.801035: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 21282 MB memory:  -> device: 1, name: NVIDIA RTX A5000, pci bus id: 0000:65:00.0, compute capability: 8.6


## Create Model Training Function

In [None]:
def train_and_evaluate_models(
    trad_model,
    parameters,
    X_train,
    y_train,
    X_test,
    y_test,
    vectorizers=None,
    cv: Optional[PredefinedSplit] = None,
):
    """
    Grid-search `trad_model` on top of each vectorizer and collect train/test metrics.

    For every (name, vectorizer) pair a pipeline of the vectorizer followed by a
    ``GridSearchCV`` over `trad_model` is fitted on the training data, then used to
    predict on both the training and test sets.

    Args:
        trad_model: The scikit-learn estimator to tune.
        parameters: Parameter grid passed to ``GridSearchCV``.
        X_train, y_train: Training features (raw text) and labels.
        X_test, y_test: Test features (raw text) and labels.
        vectorizers (dict, optional): Mapping of display name -> feature transformer.
            Defaults to the module-level ``default_vectorizers``.
        cv (PredefinedSplit, optional): Cross-validation splitter passed to ``GridSearchCV``.

    Returns:
        dict: with keys
            - 'overall_best_model_result': dict holding the best test 'accuracy', the
              fitted pipeline ('model') and its best grid parameters ('params').
            - 'metrics_df': DataFrame with one row of metrics per vectorizer.
            - 'best_preds_train' / 'best_preds_test': predictions keyed by the
              vectorizer object.
    """
    results = []
    best_preds_train = {}
    best_preds_test = {}
    current_best = {
        'accuracy': 0,
        'model': None,
        'params': None
    }

    if vectorizers is None:
        vectorizers = default_vectorizers

    model_label = trad_model.__class__.__name__

    for name, vectorizer in vectorizers.items():
        print(f'Hyperparameter Tuning for {name}:\n')

        pipeline_steps = [
            ('feature_engineering', vectorizer),
            ('crossvalidate', GridSearchCV(trad_model, parameters,
                                            cv=cv, refit=True, verbose=0, n_jobs=-1))
        ]

        # GaussianNB cannot consume the sparse matrices produced by the bag-of-words
        # vectorizers. Detect them by type as well as by name, so the same vectorizers
        # registered under a custom dictionary key are densified too.
        is_bow_vectorizer = (
            name in ('CountVectorizer', 'TfidfVectorizer')
            or isinstance(vectorizer, (CountVectorizer, TfidfVectorizer))
        )
        if model_label == 'GaussianNB' and is_bow_vectorizer:
            pipeline_steps.insert(1, ('to_dense', DenseTransformer()))

        model = Pipeline(steps=pipeline_steps)
        model.fit(X_train, y_train)
        best_params = model.named_steps['crossvalidate'].best_params_

        # Make prediction
        ypred_train = model.predict(X_train)
        ypred_test = model.predict(X_test)

        # NOTE(review): predictions are keyed by the vectorizer *object*, while the
        # metrics use its name; kept for backward compatibility — consider keying by `name`.
        best_preds_train[vectorizer] = ypred_train
        best_preds_test[vectorizer] = ypred_test

        print('=====================')
        print(f'(1.) Vectorizer = {name}')
        print(f'(2.) Model = {model_label}')
        print(f'(3.) Best Estimator = {best_params}')
        print('=====================\n')

        res = {'Vectorizer': name,
               'Model': model_label,
               'Train ACC': accuracy_score(y_train, ypred_train),
               'Train B ACC': balanced_accuracy_score(y_train, ypred_train),
               'Train F1': f1_score(y_train, ypred_train, average='weighted'),
               'Test ACC': accuracy_score(y_test, ypred_test),
               'Test B ACC': balanced_accuracy_score(y_test, ypred_test),
               'Test F1': f1_score(y_test, ypred_test, average='weighted')}

        results.append(res)

        # NOTE(review): the overall "best" pipeline is selected on test accuracy,
        # so that test score is no longer an unbiased estimate for the chosen model.
        if res['Test ACC'] > current_best['accuracy']:
            current_best['accuracy'] = res['Test ACC']
            current_best['model'] = model
            current_best['params'] = best_params

        # Drop the local reference; a pipeline stored in `current_best` stays alive.
        del model
        gc.collect()

    print('Hyperparameter Tuning Complete!\n')
    results_df = pd.DataFrame(results)

    return {
        'overall_best_model_result': current_best,
        'metrics_df': results_df,
        'best_preds_train': best_preds_train,
        'best_preds_test': best_preds_test
    }

# Example usage (illustrative only — kept as comments because `trad_model`, `parameters`,
# `X_train`, `y_train`, `X_test`, `y_test`, `pds` and `MyCustomVectorizer` are not defined
# at this point in the notebook, so executing these calls here raises NameError on a
# fresh kernel).
#
# # Call the function with the pre-initialized vectorizers
# results = train_and_evaluate_models(
#     trad_model,
#     parameters,
#     X_train,
#     y_train,
#     X_test,
#     y_test,
#     vectorizers=None,  # Use the default vectorizers
#     cv=pds
# )
#
# # Or, provide your own custom vectorizers
# custom_vectorizers = {
#     'CustomVectorizer': MyCustomVectorizer(),
#     # Add other custom vectorizers as needed
# }
#
# custom_results = train_and_evaluate_models(
#     trad_model,
#     parameters,
#     X_train,
#     y_train,
#     X_test,
#     y_test,
#     vectorizers=custom_vectorizers,
#     cv=pds
# )

## Create Validation Splits

I will divide the training data into:

1. Training Set: 90\%
2. Validation Set: 10\%

The validation set will be used for hyperparameter optimization.

In [12]:
## Hold out 10% of the training rows as the validation part
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=42)

## Features and labels (the full training frame is used; the split is conveyed through `pds`)
X_train = train_data['question']
y_train = train_data['category']
X_test = test_data['question']
y_test = test_data['category']

## Fold labels: -1 for rows whose index is in `train_df`, 0 for the remaining (validation) rows
split_index = np.where(train_data.index.isin(train_df.index), -1, 0).tolist()
pds = PredefinedSplit(test_fold=split_index)

## Logistic Regression

In [None]:
# Base estimator tuned by the grid search below
log_reg = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)

# Hyperparameter grid: penalty type crossed with regularisation strength C
log_reg_params = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.1, 1, 10, 100, 1000],
)

# Tune and evaluate on every default vectorizer, using the predefined validation split
lr_results = train_and_evaluate_models(
    trad_model=log_reg,
    parameters=log_reg_params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    vectorizers=None,
    cv=pds,
)