# Notes

In [None]:
# ref: https://huggingface.co/transformers/v3.3.1/pretrained_models.html

# Import modules

In [None]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import sys
sys.path.append('../')
from src.config import *
from src.helper_visualization import *
from src.helper_pred import *
from src.helper_pipeline import *

from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report

# Configuration

In [None]:
# Notebook-wide settings: the tunable constants used by the cells below are defined here.

# DATA PREPARATION
TITLE_WORDS_MIN = 4             # the minimum number of words in the title
TITLE_WORDS_MAX = 15            # the maximum number of words in the title (also reused as the tokenizer max_length)
TEXT_COL = 'Title_Translated'   # the text column to be used for training
TARGET_COL = 'Product Name'     # the target column to be used for training
FILER_COL = 'Length'            # the column passed as filter_name to TitleLengthFilter and plotted as a histogram
PRODUCT_SIZE_MAX_TRAIN = 4000   # the maximum number of samples for each product in training set to balance the data
PRODUCT_SIZE_MAX_TEST = None    # the maximum number of samples for each product in test set to balance the data
TOP_N_PRODUCTS = 25             # the top n products to be used for training, the rest will be lumped into 'Others'
PRODUCT_OTHERS = f'Other Products (not in Top {TOP_N_PRODUCTS})'  # the name of the 'Others' product

# TRAINING
# ref: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
# Alternative (larger) checkpoint, kept for reference:
# BERT_MODEL = 'bert-base-uncased' # 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on lower-cased English text.
CLASS_WEIGHT_FACTOR = 3         # the times of the largest class as the weight of the minor classes. set to 1 to disable class weight

BERT_MODEL = 'distilbert-base-uncased' # 6-layer, 768-hidden, 12-heads, 66M parameters. Trained on lower-cased English text.
MAX_EPOCH = 50                  # upper bound on training epochs (passed as num_train_epochs)
EARLY_STOP_PATIENCE = 5         # patience handed to EarlyStoppingCallback
BATCH_SIZE = 64                 # per-device batch size for both training and evaluation
FRACTION = 1                    # fraction of the train/eval rows that is sampled for training (1 = use everything)
# CONGRATULATIONS, ready to go 🚀

# Load Train Data

In [None]:

# Load the training data only when it is not already in the kernel namespace,
# so re-running this cell does not re-read the Excel file.
if 'df_train' not in locals():
    excel_file_train = f'{DATA_FOLDER_PATH_PROCESSED}/data_train.xlsx'
    df_train = pd.read_excel(excel_file_train)
print(f'df_train has \033[94m{df_train.shape[0]}\033[0m records, memory usage: \033[94m{df_train.memory_usage().sum()//(1024*1024)}\033[0mMB')

# Distribution of the title-length column and of the target column
hist_by_labels(df_train, FILER_COL, log=False, left=TITLE_WORDS_MIN-.5, right=TITLE_WORDS_MAX+.5)
hist_by_labels(df_train, TARGET_COL, log=True, right=TOP_N_PRODUCTS-.5)

# This is not the last expression of the cell, so a bare expression would be
# silently discarded; display() makes the sample actually show up.
display(df_train.sample(10, random_state=42))

# Get the Product Name list of Top N products
def get_top_n_products(df, target_col, n):
    """Return the `n` most frequent values of `df[target_col]`, most frequent first.

    Args:
        df: DataFrame holding the target column.
        target_col: name of the column whose values are counted.
        n: number of top values to keep.

    Returns:
        A plain list with at most `n` values, ordered by descending count.
    """
    frequencies = df[target_col].value_counts()
    # Two-column ranking table: the value itself and how often it occurs
    ranking = pd.DataFrame({target_col: frequencies.index, 'count': frequencies.values})
    ranking = ranking.sort_values(by='count', ascending=False)
    return ranking[target_col].head(n).tolist()

# Rank the products on the full training frame; the resulting list is handed to
# the OtherProductsCombiner step of both the train and the test pipeline.
top_n_products = get_top_n_products(df_train, TARGET_COL, TOP_N_PRODUCTS)

# Data Processing

Define the data preparation pipeline

In [None]:
# Build the scikit-learn data preparation pipelines.
# Step 1 filters titles by length using TITLE_WORDS_MIN / TITLE_WORDS_MAX
# (the exact boundary handling lives in TitleLengthFilter, which is not visible here).
# Step 2 receives the top-N product list and the name of the 'Others' bucket.
# Step 3 receives the per-product sample cap; step 4 receives the text column.
pipleline_data_prep_train = Pipeline([
    ('title_length_filter', TitleLengthFilter(filter_name=FILER_COL, min_words=TITLE_WORDS_MIN, max_words=TITLE_WORDS_MAX)),
    ('other_products_combiner', OtherProductsCombiner(top_products=top_n_products, target_col=TARGET_COL, product_name=PRODUCT_OTHERS)),
    ('sample_capper', SampleCapper(max_samples=PRODUCT_SIZE_MAX_TRAIN, target_col=TARGET_COL)),
    ('text_lower', TextLower(text_col=TEXT_COL))
])

# Same steps as the train pipeline; the only difference is the sample cap (PRODUCT_SIZE_MAX_TEST).
pipleline_data_prep_test = Pipeline([
    ('title_length_filter', TitleLengthFilter(filter_name=FILER_COL, min_words=TITLE_WORDS_MIN, max_words=TITLE_WORDS_MAX)),
    ('other_products_combiner', OtherProductsCombiner(top_products=top_n_products, target_col=TARGET_COL, product_name=PRODUCT_OTHERS)),
    ('sample_capper', SampleCapper(max_samples=PRODUCT_SIZE_MAX_TEST, target_col=TARGET_COL)),
    ('text_lower', TextLower(text_col=TEXT_COL))
])

display(pipleline_data_prep_train)
display(pipleline_data_prep_test)

Prepare data for the trainer

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the target labels (the same encoder is reused later to encode the test labels)
le = LabelEncoder()

# Train data: run the preparation pipeline, then expose the columns under the
# names 'text' and 'label', which are the two columns selected for the trainer.
df_train_processed = pipleline_data_prep_train.fit_transform(df_train)
df_train_processed = df_train_processed.reset_index(drop=True)
df_train_processed = df_train_processed.rename(columns={TEXT_COL: 'text'})
df_train_processed['label'] = le.fit_transform(df_train_processed[TARGET_COL])

print(f'Train data has \033[94m{df_train_processed.shape[0]}\033[0m records, memory usage: \033[94m{df_train_processed.memory_usage().sum()//(1024*1024)}\033[0m MB')
display(df_train_processed.head())

# Hugging Face Transformer

In [None]:
# import Hugging Face transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate     # helper functions to used in trainer callback to compute accuracy, precision, recall, f1 during training

## Training

In [None]:
# Tokenizer matching the pretrained checkpoint selected by BERT_MODEL
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
def tokenize(batch):
    """Tokenize the 'text' field of `batch` with the module-level `tokenizer`.

    The output is padded to, and truncated at, TITLE_WORDS_MAX.

    NOTE(review): max_length is a budget in tokens, while TITLE_WORDS_MAX is a
    word count; presumably special tokens and sub-word splits make titles near
    the word limit lose their tail — confirm whether a larger budget is wanted.
    """
    tokenizer_options = {
        'max_length': TITLE_WORDS_MAX,
        'padding': 'max_length',
        'truncation': True,
    }
    return tokenizer(batch['text'], **tokenizer_options)

# Split train data into train and eval (stratified on the label)
train_data = df_train_processed[['text', 'label']]
train_data, eval_data = train_test_split(train_data, test_size=0.15, stratify=train_data['label'], random_state=42)

# Sample the train and eval data to speed up the training
train_data = train_data.sample(frac=FRACTION, random_state=42).reset_index(drop=True)
eval_data = eval_data.sample(frac=FRACTION, random_state=42).reset_index(drop=True)

# Get the number of classes
n_classes = len(train_data.label.unique())

# Class weighting
from sklearn.utils.class_weight import compute_class_weight
# compute_class_weight returns one weight per entry of `classes`, in that order.
# unique() yields labels in order of first appearance, so the labels are sorted
# first; otherwise weight i would not belong to label id i.
sorted_labels = np.sort(train_data.label.unique())
balanced_weights = compute_class_weight('balanced', classes=sorted_labels, y=train_data.label)
# Dampen the weights with a square root and cap them at CLASS_WEIGHT_FACTOR times
# the (dampened) weight of the largest class, which has the smallest balanced weight.
class_weight_max = np.sqrt(balanced_weights.min())*CLASS_WEIGHT_FACTOR
class_weights = {
    int(label): min(np.sqrt(weight), class_weight_max)
    for label, weight in zip(sorted_labels, balanced_weights)
}
plot_class_weights(class_weights)

hg_train_data = Dataset.from_pandas(train_data)
hg_eval_data = Dataset.from_pandas(eval_data)

# Tokenize the train and eval data (batched: the tokenizer handles a list of texts per call)
hg_train_data_tokenized = hg_train_data.map(tokenize, batched=True)
hg_eval_data_tokenized = hg_eval_data.map(tokenize, batched=True)

In [None]:
# Load the model with a classification head sized to the number of classes
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=n_classes)

# Define the training arguments
# NOTE(review): logging, saving and evaluation are all set to 'epoch'; the
# accompanying *_steps=100 values are presumably ignored in that mode — confirm
# against the installed transformers version.
training_args = TrainingArguments(
    num_train_epochs=MAX_EPOCH,                 # total number of training epochs
    output_dir='./results',                     # output directory
    logging_dir='./logs',                       # directory for storing logs
    logging_strategy='epoch',                   # log every epoch
    logging_steps=100,                          # step interval (see NOTE above)
    warmup_steps=500,                           # number of warmup steps for learning rate scheduler
    per_device_train_batch_size=BATCH_SIZE,     # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,      # batch size for evaluation
    learning_rate=5e-6,                         # learning rate
    seed=42,                                    # seed for reproducibility
    save_strategy='epoch',                      # checkpoint every epoch
    save_steps=100,                             # step interval (see NOTE above)
    evaluation_strategy='epoch',                # evaluate every epoch
    eval_steps=100,                             # step interval (see NOTE above)
    load_best_model_at_end=True                 # reload the best checkpoint once training stops
)

# Define the compute_metrics function to compute the accuracy
def compute_metrics(eval_pred):
    """Compute the accuracy of one evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with a single key 'accuracy' holding the fraction of correct
        predictions as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly instead of calling evaluate.load('accuracy') here, which
    # would reload the metric on every single evaluation pass.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {'accuracy': accuracy}

import torch
from torch import nn

# Prefer the GPU when one is available.
# NOTE(review): `device` is not referenced by the visible cells below — confirm it is still needed.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the notebook-level `class_weights`
    dict (label id -> weight).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is popped before the forward pass.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted (and ignored) so the override also works
                with transformers releases that pass this keyword.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Order the weights by label id so index i of the tensor is the weight of
        # class i, and create the tensor where the logits live (a wrapped model
        # does not necessarily expose `.device`).
        ordered_weights = [class_weights[label] for label in sorted(class_weights)]
        weight = torch.tensor(ordered_weights, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Create the Trainer instance to train the model
trainer = CustomTrainer(
    model=model,                            # the instantiated 🤗 Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=hg_train_data_tokenized,  # training dataset
    eval_dataset=hg_eval_data_tokenized,    # evaluation dataset
    compute_metrics=compute_metrics,        # the callback that computes metrics of interest
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)])


# Echo the run configuration so the training log is self-describing
print()
print(f'Configuration: Classify top \033[94m{TOP_N_PRODUCTS}\033[0m products; Text column: \033[94m{TEXT_COL}\033[0m, Title words length: (\033[94m{TITLE_WORDS_MIN}\033[0m, \033[94m{TITLE_WORDS_MAX}\033[0m); Records/product caped at \033[94m{PRODUCT_SIZE_MAX_TRAIN}\033[0m')
print(f'\033[94m{FRACTION*100:.1f}%\033[0m data used - Train_data has \033[94m{train_data.shape[0]}\033[0m records; Eval_data has \033[94m{eval_data.shape[0]}\033[0m records')
print(f'Using \033[94m{BERT_MODEL}\033[0m model, Epoch=\033[94m{MAX_EPOCH}\033[0m, Early Stop Patience=\033[94m{EARLY_STOP_PATIENCE}\033[0m, Batch Size=\033[94m{BATCH_SIZE}\033[0m.')

trainer.train()

# Plot the training history and perform evaluation on the eval dataset
plot_transformer_training_history(trainer)
trainer.evaluate()

## Evaluate on test data

In [None]:
# Load test data (only when it is not already in the kernel namespace)
if 'df_test' not in locals():
    excel_file_test = f'{DATA_FOLDER_PATH_PROCESSED}/data_test.xlsx'
    df_test = pd.read_excel(excel_file_test)

# NOTE(review): fit_transform is called on the test frame; this is only harmless
# if the pipeline steps learn nothing from the data — confirm in src.helper_pipeline.
df_test_processed = pipleline_data_prep_test.fit_transform(df_test)
df_test_processed = df_test_processed.reset_index(drop=True)
df_test_processed = df_test_processed.rename(columns={TEXT_COL: 'text'})
# Reuse the encoder fitted on the training labels so label ids match the model outputs
df_test_processed['label'] = le.transform(df_test_processed[TARGET_COL])
display(df_test_processed.head())

# evaluate on test data
test_data = df_test_processed[['text', 'label']]
display(test_data.head())
print(f'Test data has \033[94m{test_data.shape[0]}\033[0m records.')

hg_test_data = Dataset.from_pandas(test_data)
hg_test_data_tokenized = hg_test_data.map(tokenize)

# Predicted class = argmax over the last axis of the returned predictions
y_actual = hg_test_data_tokenized['label']
y_pred = trainer.predict(hg_test_data_tokenized)
y_pred = np.argmax(y_pred.predictions, axis=-1)

accuracy = accuracy_score(y_actual, y_pred)
print(f'Accuracy: \033[94m{accuracy:.4f}\033[0m')

# NOTE(review): target_names lists every encoder class; presumably this assumes
# each class occurs in y_actual or y_pred — confirm for small test sets.
report = classification_report(y_actual, y_pred, digits=3, target_names=le.classes_)
print(report)

# Decode the label ids back to product names for the confusion matrix
y_actual_decoded = le.inverse_transform(y_actual)
y_pred_decoded = le.inverse_transform(y_pred)
plot_confusion_matrix(y_actual_decoded, y_pred_decoded)

# Word2Vec

In [None]:
from gensim.models import Word2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import optuna

# Load the pre-trained Word2Vec vectors
import gensim.downloader as api
word2vec_model = api.load('word2vec-google-news-300')

# api.load() returns the pretrained vectors as KeyedVectors, which are lookup-only:
# they have no train(), corpus_count or epochs, so the former fine-tuning call
# raised AttributeError. It is kept below, disabled, for reference. Fine-tuning
# would need a full gensim Word2Vec model.
# word2vec_model.build_vocab(X_train, update=True)
# word2vec_model.train(X_train, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

# Define a function to encode the text using Word2Vec
def encode_text(text):
    """Encode `text` as the mean of the Word2Vec vectors of its known words.

    Args:
        text: whitespace-separated string.

    Returns:
        1-D numpy array of length `word2vec_model.vector_size`; all zeros when
        none of the words is in the vocabulary.
    """
    vectors = [word2vec_model[word] for word in text.split() if word in word2vec_model]
    if not vectors:
        # Size the fallback from the model itself rather than from a `dim`
        # variable that is only defined in a later cell.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)  # Return the mean of the word vectors

# Vectorize the training and testing data using Word2Vec
X_train_word2vec = [encode_text(text) for text in X_train]
X_test_word2vec = [encode_text(text) for text in X_test]


# Clean up the study if it exists
study_name = 'sgd_classifier_word2vec'
storage_name = 'sqlite:///optuna_study.db'
try:
    optuna.delete_study(study_name=study_name, storage=storage_name)
except KeyError:
    # No study with this name exists yet (e.g. first run): nothing to clean up.
    # Any other failure (storage problems, etc.) is no longer silently swallowed.
    pass

# Define an objective function to optimize
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of an SGDClassifier.

    The classifier is scored on `X_train_word2vec` / `y_train` with
    hyper-parameters drawn from `trial`.
    """
    # Draw the hyper-parameters (same names, ranges and order of suggestion)
    alpha = trial.suggest_float('alpha', 1e-6, 1e-3, log=True)
    eta0 = trial.suggest_float('eta0', 1e-3, 1e-1, log=True)
    loss = trial.suggest_categorical('loss', ['log_loss', 'modified_huber'])
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    learning_rate = trial.suggest_categorical('learning_rate', ['constant', 'optimal', 'adaptive'])

    # Candidate model for this trial; max_iter and random_state are fixed
    candidate = SGDClassifier(
        alpha=alpha,
        eta0=eta0,
        loss=loss,
        penalty=penalty,
        learning_rate=learning_rate,
        max_iter=10000,
        random_state=42,
    )

    # Score with 5-fold cross-validation and report the average accuracy
    fold_accuracies = cross_val_score(candidate, X_train_word2vec, y_train, cv=5, scoring='accuracy')
    return np.mean(fold_accuracies)

# Create an Optuna study and optimize the objective function
# (direction='maximize' because the objective returns an accuracy)
study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True)

study.optimize(
    objective,
    n_trials=10, 
    n_jobs=-1, 
    show_progress_bar=True
    )  # You can adjust the number of trials

# Print the best hyperparameters and corresponding accuracy
best_params = study.best_params
best_sore = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Score:", best_sore)


In [None]:

# NOTE(review): this cell repeats the vectorisation, study clean-up and search of
# the previous cell — consider removing one of the two.
# Vectorize the training and testing data using Word2Vec
X_train_word2vec = [encode_text(text) for text in X_train]
X_test_word2vec = [encode_text(text) for text in X_test]


# Clean up the study if it exists
study_name = 'sgd_classifier_word2vec'
storage_name = 'sqlite:///optuna_study.db'
try:
    optuna.delete_study(study_name=study_name, storage=storage_name)
except KeyError:
    # No study with this name exists: nothing to clean up.
    # Any other failure is no longer silently swallowed.
    pass

# Define an objective function to optimize
def objective(trial):
    """Return the mean 5-fold CV accuracy of an SGDClassifier configured from `trial`.

    Features and labels are read from `X_train_word2vec` and `y_train`.
    """
    # Hyper-parameter search space, sampled in a fixed order
    sampled = {}
    sampled['alpha'] = trial.suggest_float('alpha', 1e-6, 1e-3, log=True)
    sampled['eta0'] = trial.suggest_float('eta0', 1e-3, 1e-1, log=True)
    sampled['loss'] = trial.suggest_categorical('loss', ['log_loss', 'modified_huber'])
    sampled['penalty'] = trial.suggest_categorical('penalty', ['l1', 'l2'])
    sampled['learning_rate'] = trial.suggest_categorical('learning_rate', ['constant', 'optimal', 'adaptive'])

    # Fixed settings shared by every trial
    sampled['max_iter'] = 10000
    sampled['random_state'] = 42

    estimator = SGDClassifier(**sampled)

    # Average accuracy over the 5 cross-validation folds
    cv_scores = cross_val_score(estimator, X_train_word2vec, y_train, cv=5, scoring='accuracy')
    return np.mean(cv_scores)

# Create an Optuna study and optimize the objective function
# (direction='maximize' because the objective returns an accuracy)
study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True)

study.optimize(
    objective,
    n_trials=10, 
    n_jobs=-1, 
    show_progress_bar=True
    )  # You can adjust the number of trials

# Print the best hyperparameters and corresponding accuracy
best_params = study.best_params
best_sore = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Score:", best_sore)


In [None]:
import gensim
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Load the pre-trained Word2Vec model from a local binary file (first 200000 vectors only)
# NOTE(review): this rebinds `model`, the name used earlier for the transformer
# classifier, and the loaded vectors are not referenced by the visible cells
# below — confirm the intended name and use.
model = gensim.models.KeyedVectors.load_word2vec_format(f'{MODEL_FOLDER_PATH}/word2vec/GoogleNews-vectors-negative300.bin', binary=True, limit=200000)


In [None]:
# Load the pre-trained GloVe embeddings into a dict: word -> float32 vector
embeddings_index = {}
dim = 300  # embedding size; selects the GloVe file and sizes the zero-vector fallback in encode_text

with open(f'{MODEL_FOLDER_PATH}/glove/glove.6B.{dim}d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is split on whitespace: first field is the word, the rest are the coefficients
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Define a function to encode the text using GloVe
def encode_text(text):
    """Encode `text` as the average GloVe vector of its known words.

    Words are obtained with `text.split()` and looked up in `embeddings_index`.
    When no word is found, a zero vector of length `dim` is returned.

    NOTE(review): this redefines the `encode_text` declared earlier for
    Word2Vec; after this cell runs, the GloVe version is the one in effect.
    """
    known_vectors = [embeddings_index[token] for token in text.split() if token in embeddings_index]
    if known_vectors:
        # Element-wise mean across the matched word vectors
        return np.mean(known_vectors, axis=0)
    # Nothing matched: fall back to an all-zero vector
    return np.zeros(dim)

# Vectorize the training and testing data using GloVe
# NOTE(review): X_train / X_test are not defined in the visible cells above —
# confirm where they come from before running on a fresh kernel.
X_train_glove = [encode_text(text) for text in X_train]
X_test_glove = [encode_text(text) for text in X_test]

# TensorFlow

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import keras_core as keras
import keras_nlp

# Record the library versions used for this run
print('Tensorflow version:', tf.__version__)
print('KerasNLP version:', keras_nlp.__version__)

In [None]:
# Settings for the TensorFlow / KerasNLP run
BATCH_SIZE = 32  # NOTE: rebinds BATCH_SIZE, which the configuration cell at the top sets to 64
NUM_TRAIN_SAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2  # used as test_size of the train/validation split
# NOTE(review): this evaluates to a float and is not referenced by the visible cells — confirm it is needed
STEPS_PER_EPOCH = NUM_TRAIN_SAMPLES * TRAIN_SPLIT // BATCH_SIZE

EPOCHS = 3
# NOTE(review): AUTO is not referenced by the visible cells — confirm it is needed
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): X_train / y_train are not defined in the visible cells above — confirm their origin.
X = X_train
y = y_train

# encode the target
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# NOTE: X_train and y_train are rebound to the training part of the split, so
# re-running this cell splits the already-split data again (and re-fits the
# encoder on already-encoded labels).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SPLIT, random_state=42)


In [None]:
# Load a DistilBERT model.
preset= "distil_bert_base_en_uncased"
# The training labels are the ids produced by `label_encoder`, so the size of the
# output layer must come from the encoder, not from the raw df_train column
# (which can contain a different set of products than the encoded target).
NUM_CLASSES = len(label_encoder.classes_)

# Use a shorter sequence length.
# NOTE(review): sequence_length=160 and the "preprocessor_4_tweets" name look
# carried over from another project — confirm they suit the product titles.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset, sequence_length=160, name="preprocessor_4_tweets")

# Pretrained classifier.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset, preprocessor = preprocessor, num_classes=NUM_CLASSES)

classifier.summary()

In [None]:
# Compile the model
# from_logits=True: the loss is told the classifier outputs raw logits.
# NOTE(review): the loss comes from `keras` (imported as keras_core) while the
# optimizer comes from tf.keras — confirm the two are compatible with the
# installed keras_nlp backend.
classifier.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=5e-5),
    metrics=['accuracy'])

# Fit the model, validating on the held-out split after each epoch
history = classifier.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val)
    )

In [None]:
# Plot the training vs. validation accuracy recorded in the fit history
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()

In [None]:
# Predict on the test texts and take the highest-scoring class index per row
y_pred_tf = classifier.predict(X_test)
y_pred_tf = np.argmax(y_pred_tf, axis=1)

# Decode the predictions back to the original label values
y_pred_tf = label_encoder.inverse_transform(y_pred_tf)

# Overall accuracy on the test data.
# The format spec is '.4f' (four decimals); the previous '4f' only set a minimum
# field width of 4 and printed the default six decimals.
accuracy = accuracy_score(y_test, y_pred_tf)
print(f'Accuracy: \033[94m{accuracy:.4f}\033[0m')

# Evaluate the classifier's performance per class
print(classification_report(y_test, y_pred_tf, digits=3))

plot_confusion_matrix(y_test, y_pred_tf, title='Confusion matrix')