## Setup

Load Libraries:

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from typing import Optional, Union
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow_hub as hub
from transformers import AutoTokenizer, AutoModel

## Load Data

In [5]:
# Load the train and test sets from pickled DataFrames.
# NOTE(review): unpickling can execute arbitrary code — only read .pkl files
# that come from a trusted source.
train_data = pd.read_pickle('./data/train_data.pkl')
test_data = pd.read_pickle('./data/test_data.pkl')

# Print the column/dtype summary, then show the first rows as the cell output.
train_data.info()
train_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5452 entries, 0 to 5451
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      5452 non-null   object
 1   category      5452 non-null   object
 2   sub_category  5452 non-null   object
 3   cleaned       5452 non-null   object
 4   wordlengths   5452 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 213.1+ KB


Unnamed: 0,question,category,sub_category,cleaned,wordlengths
0,How did serfdom develop in and then leave Russ...,DESC,manner,serfdom develop leave russia,10
1,What films featured the character Popeye Doyle ?,ENTY,cremat,film feature character popeye doyle,8
2,How can I find a list of celebrities ' real na...,DESC,manner,list celebrity real name,12
3,What fowl grabs the spotlight after the Chines...,ENTY,animal,fowl grab spotlight chinese year monkey,13
4,What is the full form of .com ?,ABBR,exp,form com,8


## Create Embedding Transformer

We will now create a class for generating embeddings from text data using pre-trained language models. Specifically, we will use two language models:

1. BERT (Bidirectional Encoder Representations from Transformers), a model by Google (read more [here](https://huggingface.co/google-bert/bert-base-uncased)).
2. google/nnlm, token-based text embeddings trained on various Google News datasets (read more [here](https://www.kaggle.com/models/google/nnlm/frameworks/tensorFlow2/variations/en-dim50/versions/1?tfhub-redirect=true)).

In [None]:
class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    """
    A custom transformer for generating embeddings from text data using pre-trained language models.

    Args:
        model_name (str): The name of the pre-trained language model to use for generating embeddings.
                          If "nnlm-en-dim50" is in the model_name, the corresponding TensorFlow Hub model will be used.
                          Otherwise, a Hugging Face Transformers model will be used.
        batch_size (int): Number of texts tokenized and passed through the Hugging Face model per
                          forward pass. Only used on the Transformers path. Defaults to 32.

    Raises:
        ValueError: If model_name is not a string.
    """

    def __init__(self, model_name: str = "bert-base-uncased", batch_size: int = 32):
        self.model_name = model_name
        self.batch_size = batch_size
        self._validate_model_name()

        if "nnlm-en-dim50" in self.model_name:
            self.embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
        else:
            self._setup_transformers_model()

    def _validate_model_name(self) -> None:
        """Check if the specified model_name is valid."""
        if not isinstance(self.model_name, str):
            raise ValueError("model_name must be a string")

    def _setup_transformers_model(self) -> None:
        """Load the Hugging Face Transformers model and set up the appropriate device."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)

    def fit(self, X: Union[pd.Series, pd.DataFrame], y=None) -> "EmbeddingTransformer":
        """
        Fit the transformer (required for scikit-learn compatibility).

        Args:
            X (Union[pd.Series, pd.DataFrame]): The input data (not used for this transformer).
            y (optional): The target data (not used for this transformer).

        Returns:
            self: The EmbeddingTransformer instance.
        """
        return self

    def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        """
        Generate embeddings from input text data using the specified pre-trained language model.

        Args:
            X (Union[pd.Series, pd.DataFrame]): The input text data. A DataFrame must
                contain exactly one column.

        Returns:
            pd.DataFrame: One row of embedding values per input text, in input order,
                with a default RangeIndex. Empty input yields an empty DataFrame.

        Raises:
            ValueError: If X is not a Series/DataFrame, or is a DataFrame whose
                column count is not exactly one.
        """
        self._validate_input_data(X)

        if "nnlm-en-dim50" in self.model_name:
            return self._transform_with_nnlm(X)
        return self._transform_with_transformers(X)

    def _validate_input_data(self, X: Union[pd.Series, pd.DataFrame]) -> None:
        """Check if the input data is valid."""
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            raise ValueError("Input data must be a Pandas Series or DataFrame")

    @staticmethod
    def _to_text_list(X: Union[pd.Series, pd.DataFrame]) -> list:
        """Return the texts in X as a plain Python list.

        A DataFrame has no ``tolist`` method, so a single-column DataFrame is
        reduced to its only column first; any other column count is rejected
        because it is ambiguous which column holds the text.
        """
        if isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError(
                    "DataFrame input must contain exactly one text column"
                )
            X = X.iloc[:, 0]
        return X.tolist()

    def _transform_with_nnlm(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        """Generate embeddings using the TensorFlow Hub NNLM model."""
        text_data = self._to_text_list(X)
        if not text_data:
            # Nothing to embed; skip the model call entirely.
            return pd.DataFrame()
        embeddings = self.embed(text_data).numpy()
        return pd.DataFrame(embeddings)

    def _transform_with_transformers(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        """Generate embeddings using a Hugging Face Transformers model.

        Texts are processed in chunks of ``batch_size`` so that memory use is
        bounded by the batch rather than by the whole dataset.
        """
        batch_text = self._to_text_list(X)
        if not batch_text:
            # Nothing to embed; skip tokenization and the forward pass.
            return pd.DataFrame()

        # Validated here rather than in __init__ so that a value assigned
        # after construction (e.g. via set_params) is checked too.
        if not isinstance(self.batch_size, int) or self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        chunks = []
        with torch.no_grad():
            for start in range(0, len(batch_text), self.batch_size):
                batch = batch_text[start:start + self.batch_size]
                # Pad to the longest text in this batch; truncate at 512 tokens.
                inputs = self.tokenizer(
                    batch, return_tensors="pt", padding=True, truncation=True, max_length=512
                )
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = self.model(**inputs)
                # Keep only the hidden state at the first token position of
                # each sequence (the [CLS] token for BERT-style tokenizers).
                chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

        return pd.DataFrame(np.concatenate(chunks, axis=0))

In [8]:
# tensorflow_hub is already imported as `hub` in the setup cell; repeated here
# so this cell can be run on its own.
import tensorflow_hub as hub

# Quick check: load a TF Hub NNLM model and embed two sample sentences.
# NOTE(review): this loads the en-dim128 variant, whereas EmbeddingTransformer
# loads nnlm-en-dim50 — confirm which dimensionality is intended.
embed = hub.load("https://www.kaggle.com/models/google/nnlm/frameworks/TensorFlow2/variations/en-dim128/versions/1")
embeddings = embed(["cat is on the mat", "dog is in the fog"])