[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/georgianpartners/Multimodal-Toolkit/blob/master/notebooks/text_w_tabular_classification.ipynb)

# Training a BertWithTabular Model for Clothing Review Recommendation Prediction

This guide follows closely with the [example](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb#scrollTo=bwl3I_VGAZXb) from HuggingFace for text classificaion on the GLUE dataset.

Install `multimodal-transformers`, `kaggle`  so we can get the dataset.

## All other imports are here:

In [1]:
from dataclasses import dataclass, field
import json
import logging
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder,load_data_into_folds
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'
import csv

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /data/chenxi/anaconda3/envs/myenv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


#### Let us take a look at what the dataset looks like

In [2]:
from sklearn.preprocessing import LabelEncoder
from io import StringIO
file_path='/data/chenxi/llm-feature-engeneering/dataset/diabetes.arff'
with open(file_path, 'r') as f:
    lines = f.readlines()

# Extract attribute names
attributes = []
for line in lines:
    if line.startswith("@attribute"):
        attributes.append(line.split()[1])

# Extract data
data_start_index = lines.index("@data\n") + 1
data_lines = "\n".join(lines[data_start_index:])

# Convert data lines to DataFrame
df = pd.read_csv(StringIO(data_lines), header=None, names=attributes, na_values="?")

# Replace missing values with -1
df.fillna(-1, inplace=True)

# Convert categorical string data into numbers
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of object type (string)
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])


In [3]:
df.columns.tolist()

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [4]:
data_df=df.copy()

In [5]:
column22 = pd.read_csv('/data/chenxi/llm-feature-engeneering/src/model/responses/diabetes/analysis.csv')
column22 = column22.reset_index(drop=True)
data_df = data_df.reset_index(drop=True)

data_df['response'] = column22['analysis']

In [6]:
data_df.to_csv('/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/dataset/clean.csv')

We see that the data contains both text in the `Review Text` and `Title` column as well as tabular features in the `Division Name`, `Department Name`, and `Class Name` columns.

In [7]:
data_df.describe(include=np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_df.describe(include=np.object)


Unnamed: 0,response
count,768
unique,768
top,"Based on the patient profile provided, there i..."
freq,1


In [8]:
data_df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,response
0,6,148,72,35,0,33.6,0.627,50,1,"Based on the patient profile provided, there i..."
1,1,85,66,29,0,26.6,0.351,31,0,"Additionally, the presence of a murmur, its mo..."
2,8,183,64,0,0,23.3,0.672,32,1,Based on the presence of a murmur and the most...
3,1,89,66,23,94,28.1,0.167,21,0,"Based on the given patient profile, there is a..."
4,0,137,40,35,168,43.1,2.288,33,1,"Based on the information provided, the presenc..."


In [9]:
# data_df['Outcome'] = data_df['Outcome'].apply(lambda x: np.random.randint(2) if x in [0, 1] else x)
# data_df.head(5)

In this demonstration, we split our data into 8:1:1 training splits. We also save our splits to `train.csv`, `val.csv`, and `test.csv` as this is the format our dataloader requires.


In [10]:
train_df, val_df, test_df = np.split(data_df.sample(frac=1), [int(.8*len(data_df)), int(.9 * len(data_df))])
print('Num examples train-val-test')
print(len(train_df), len(val_df), len(test_df))
train_df.to_csv('/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/dataset/train.csv')
val_df.to_csv('//data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/dataset/val.csv')
test_df.to_csv('/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/dataset/test.csv')

Num examples train-val-test
614 77 77


In [11]:
import os
from sklearn.model_selection import train_test_split

# Number of experiments
num_experiments = 5

for seed in range(num_experiments):
    # Split data into train and test sets
    train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=seed)
    
    # Further split train set into train and validation sets
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=seed)
    
    # Create the directory for the current seed if it doesn't exist
    seed_dir = f'/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/data_seed_{seed}/'
    if not os.path.exists(seed_dir):
        os.makedirs(seed_dir)
    # test_df=test_df.reset_index()
    # train_df=train_df.reset_index()
    # val_df=val_df.reset_index()
    # Save the splits
    test_df.to_csv(f'{seed_dir}test.csv')
    train_df.to_csv(f'{seed_dir}train.csv')
    val_df.to_csv(f'{seed_dir}val.csv')

In [12]:
for seed in  ['0', '1', '2', '3', '4']:

    train_df = pd.read_csv(f'/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/data_seed_{seed}/train.csv')
    val_df = pd.read_csv(f'/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/data_seed_{seed}/val.csv')
    test_df = pd.read_csv(f'/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/data_seed_{seed}/test.csv')
    # Merge the DataFrames into one DataFrame
    merged_df = pd.concat([train_df, val_df, test_df])
    merged_df.insert(0, None, range(0, len(merged_df)))
    # Rename the new column to have no name
    merged_df.columns = [None] + list(merged_df.columns[1:])
    merged_df.drop(merged_df.columns[1], axis=1, inplace=True)
    # Reset the index to ensure unique indices
    merged_df.reset_index(drop=True, inplace=True)
    # Split the merged DataFrame back into train, val, and test DataFrames
    train_len = len(train_df)
    val_len = len(val_df)
    train_df = merged_df[:train_len]
    val_df = merged_df[train_len:train_len + val_len]
    test_df = merged_df[train_len + val_len:]
    # Save the modified DataFrames back to CSV files
    train_df.to_csv(f'/data/chenxi/llm-feature-engeneering/dataset/new/diabetes/data_seed_{seed}/train.csv', index=False)
    val_df.to_csv(f'/data/chenxi/llm-feature-engeneering/dataset/new/diabetes/data_seed_{seed}/val.csv', index=False)
    test_df.to_csv(f'/data/chenxi/llm-feature-engeneering/dataset/new/diabetes/data_seed_{seed}/test.csv', index=False)

## We then our Experiment Parameters
We use Data Classes to hold each of our arguments for the model, data, and training.

In [13]:
@dataclass
class ModelArguments:
  """
  Arguments pertaining to which model/config/tokenizer we are going to Fine_tune from.
  """

  model_name_or_path: str = field(
      metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
  )
  config_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
  )
  tokenizer_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
  )
  cache_dir: Optional[str] = field(
      default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
  )


@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

  def __post_init__(self):
      assert self.column_info != self.column_info_path
      if self.column_info is None and self.column_info_path:
          with open(self.column_info_path, 'r') as f:
              self.column_info = json.load(f)

### Here are the data and training parameters we will use.
For model we can specify any supported HuggingFace model classes (see README for more details) as well as any AutoModel that are from the supported model classes. For the data specifications, we need to specify a dictionary that specifies which columns are the `text` columns, `numerical feature` columns, `categorical feature` column, and the `label` column. If we are doing classification, we can also specify what each of the labels means in the label column through the `label list`. We can also specifiy these columns using a path to a json file with the argument `column_info_path` to `MultimodalDataTrainingArguments`.

In [14]:
text_cols = ['response']
cat_cols = ['Pregnancies']

numerical_cols = ['Age',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction'
 ]
# numerical_cols = ['num']
column_info_dict = {
    'text_cols': text_cols,
    'num_cols': numerical_cols,
    'cat_cols': cat_cols,
    'label_col': 'Outcome',
    'label_list': ["The patient has diebetes", "The patient does not have diebetes"]
}


model_args = ModelArguments(
    model_name_or_path='distilbert-base-uncased'
)

data_args = MultimodalDataTrainingArguments(
    # data_path='/data/chenxi/3/3/clean.csv',
    # data_path='/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/data_seed_0',
    data_path='/data/chenxi/llm-feature-engeneering/dataset/new/diabetes/data_seed_0',
    # combine_feat_method='gating_on_cat_and_num_feats_then_sum',
    combine_feat_method='gating_on_cat_and_num_feats_then_sum',
    column_info=column_info_dict,
    task='classification'
)

training_args = TrainingArguments(
    output_dir="/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/model/1",
    logging_dir="/data/chenxi/llm-feature-engeneering/src/Fine_tune/diabetes/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    num_train_epochs=30,  # Reduced epochs to prevent overfitting
    per_device_train_batch_size=8,  # May adjust based on GPU memory
    warmup_steps=50,  # Reduced warmup steps
    weight_decay=0.01,
    logging_steps=10,  # Log more frequently to monitor training closely
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_first_step=True,
    learning_rate=2e-5,
    adafactor=True,
    gradient_accumulation_steps=1,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    seed=42,
    fp16=True,  # Ensure your GPU supports FP16 for faster training
)

set_seed(training_args.seed)



In [15]:
print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=True,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_str

## Now we can load our model and data.
### We first instantiate our HuggingFace tokenizer
This is needed to prepare our custom torch dataset. See `torch_dataset.py` for details.

In [16]:
tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
)

Specified tokenizer:  distilbert-base-uncased


### Load dataset csvs to torch datasets
The function `load_data_from_folder` expects a path to a folder that contains `train.csv`, `test.csv`, and/or `val.csv` containing the respective split datasets.

In [17]:
# Get Datasets
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info['text_cols'],
    tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    categorical_cols=data_args.column_info['cat_cols'],
    numerical_cols=data_args.column_info['num_cols'],
    sep_text_token_str=tokenizer.sep_token,
)

INFO:multimodal_transformers.data.data_utils:7 numerical columns
INFO:multimodal_transformers.data.data_utils:17 categorical columns
INFO:multimodal_transformers.data.data_utils:7 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['response']
INFO:multimodal_transformers.data.load_data:Raw text example: Based on the data provided, it is difficult to ascertain the presence or absence of a murmur, as this information is not included. However, if a murmur were present, it would be important to consider the most audible location and the characteristics of the murmur (systolic or diastolic) in relation to the patient's age and sex.

The age of the patient (46.0 years) might be relevant in terms of assessing the risk for certain cardiovascular conditions or diseases. It is well-known that some cardiac disorders tend to be more prevalent in certain age groups.

Considering the sex of the patient may also be important for analyzing the prevalence of certain diseases o

In [18]:
print(train_dataset.df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6       92             62             32      126  32.0   
1             10      108             66              0        0  32.4   
2              0      180             78             63       14  59.4   
3              8      126             88             36      108  38.5   
4              4      125             70             18      122  28.9   
..           ...      ...            ...            ...      ...   ...   
486            7      187             50             33      392  33.9   
487            2      122             60             18      106  29.8   
488            4      134             72              0        0  23.8   
489            5      128             80              0        0  34.6   
490            1       71             78             50       45  33.2   

     DiabetesPedigreeFunction  Age  Outcome  \
0                       0.085   46        0   
1                

In [19]:
num_labels = len(np.unique(train_dataset.labels))
# num_labels = 2
num_labels

2

In [20]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
tabular_config = TabularConfig(num_labels=num_labels,
                               cat_feat_dim=train_dataset.cat_feats.shape[1],
                               numerical_feat_dim=train_dataset.numerical_feats.shape[1],
                               **vars(data_args))
config.tabular_config = tabular_config

In [21]:
model = AutoModelWithTabular.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir
    )

Some weights of DistilBertWithTabular were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'tabular_classifier.bias', 'tabular_combiner.num_bn.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'tabular_combiner.num_bn.running_mean', 'tabular_combiner.g_cat_layer.bias', 'tabular_classifier.weight', 'tabular_combiner.g_cat_layer.weight', 'tabular_combiner.num_bn.num_batches_tracked', 'tabular_combiner.layer_norm.bias', 'tabular_combiner.h_cat_layer.weight', 'tabular_combiner.g_num_layer.bias', 'tabular_combiner.layer_norm.weight', 'tabular_combiner.h_bias', 'tabular_combiner.num_bn.weight', 'tabular_combiner.num_bn.running_var', 'tabular_combiner.g_num_layer.weight', 'tabular_combiner.h_num_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### We need to define a task-specific way of computing relevant metrics:

In [22]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)

def calc_classification_metrics(p: EvalPrediction):
    predictions = p.predictions[0]
    pred_labels = np.argmax(predictions, axis=1)
    pred_scores = softmax(predictions, axis=1)[:, 1]
    labels = p.label_ids
    acc = (pred_labels == labels).mean() 
    if len(np.unique(labels)) == 2:  # binary classification
        roc_auc_pred_score = roc_auc_score(labels, pred_scores)
        precisions, recalls, thresholds = precision_recall_curve(labels,
                                                                    pred_scores)
        fscore = (2 * precisions * recalls) / (precisions + recalls)
        fscore[np.isnan(fscore)] = 0
        ix = np.argmax(fscore)
        threshold = thresholds[ix].item()
        pr_auc = auc(recalls, precisions)
        tn, fp, fn, tp = confusion_matrix(labels, pred_labels, labels=[0, 1]).ravel()
        result = {'roc_auc': roc_auc_pred_score,
                    'threshold': threshold,
                    'pr_auc': pr_auc,
                    'recall': recalls[ix].item(),
                    'precision': precisions[ix].item(), 'f1': fscore[ix].item(),
                    'tn': tn.item(), 'fp': fp.item(), 'fn': fn.item(), 'tp': tp.item(),
                    'acc': acc,
                    }
    else:
        acc = (pred_labels == labels).mean()
        f1 = f1_score(y_true=labels, y_pred=pred_labels)
        result = {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
            "mcc": matthews_corrcoef(labels, pred_labels)
        }

    return result

In [23]:
trainer = Trainer(
    model=model.to(0),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_classification_metrics,
)

## Launching the training is as simple is doing trainer.train() 🤗

In [24]:
%%time
trainer.train()

In [None]:
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

{'eval_loss': 0.6158574223518372, 'eval_roc_auc': 0.5026503567787971, 'eval_threshold': 0.3240482211112976, 'eval_pr_auc': 0.2905825030082958, 'eval_recall': 1.0, 'eval_precision': 0.2922077922077922, 'eval_f1': 0.45226130653266333, 'eval_tn': 109, 'eval_fp': 0, 'eval_fn': 45, 'eval_tp': 0, 'eval_acc': 0.7077922077922078, 'eval_runtime': 0.7585, 'eval_samples_per_second': 203.037, 'eval_steps_per_second': 26.368, 'epoch': 60.0}


  fscore = (2 * precisions * recalls) / (precisions + recalls)
