# DeBERTa-v3 

### Import necessary packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.transformers import log_model
import logging 
from mlflow.sklearn import save_model

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from mlflow.models.signature import infer_signature
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelBinarizer
from scipy.special import softmax
import numpy as np

from torch import nn
import mlflow.pytorch

import sentencepiece
import os

# os.environ["TOKENIZERS_PARALLELISM"] = "false"  # This tells Hugging Face: “Don’t use parallel tokenization — avoid possible deadlocks.”

from torch.utils.data import Dataset, DataLoader
import torch

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, AutoTokenizer, AutoConfig

import config 

In [None]:
from basic_functions import (
    get_encode_tokenize_data,
    createTrainer,
    # NOTE(review): the evaluation cells call get_eval_metrics, but no visible
    # cell imports or defines it; presumably it lives in basic_functions next
    # to the other helpers - confirm.
    get_eval_metrics,
)

### Setup


In [None]:
MODEL_NAME = "deberta_v3"

# Read the MLflow tracking URI from a local file. The context manager closes
# the file handle deterministically instead of leaving it to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Configure logging format to show a timestamp before every message
logging.basicConfig(format="%(asctime)s: %(message)s")

# Root logger: show INFO and more important records (WARNING, ERROR), ignore DEBUG
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
# CSV file handed to get_encode_tokenize_data below
DATA_PATH = "../data/data_tiny.csv"
# Checkpoint name handed to get_encode_tokenize_data (presumably used for the tokenizer - confirm)
MODEL_PATH = "microsoft/deberta-v3-base"
# Checkpoint that is actually loaded and fine-tuned via from_pretrained
# NOTE(review): this is the "small" checkpoint while MODEL_PATH is "base" - confirm the mismatch is intended
MODEL_TRAINING_PATH ="microsoft/deberta-v3-small"
# Directory passed to createTrainer as output_dir
OUTPUT_DIR = "../models/LLM_deberta_v3_tiny/trainer_output"
# Target directory for mlflow.pytorch.save_model
SAVE_PATH = "../models/LLM_deberta_v3_tiny/pytorch_model"

### Get data

In [None]:
# Project helper from basic_functions; it hands back four objects, unpacked
# here in order: both datasets, the training labels and the label encoder.
prepared = get_encode_tokenize_data(DATA_PATH, MODEL_PATH)
train_dataset, test_dataset, y_train, le = prepared

### Zero Shot Inference 

In [None]:
# # disable upper limit for memory
# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# # Allows up to 100% of available memory
# torch.mps.set_per_process_memory_fraction(1.0)  

# torch.mps.empty_cache()  # Clears unused GPU memory

In [None]:
# # Load fresh copy of base model (not train on our data)
# num_classes = len(df["logical_fallacies"].unique())
# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "microsoft/deberta-v3-small",
#     num_labels=num_classes,
#     problem_type="single_label_classification"
# )

In [None]:
# def predict(model, encodings, batch_size=8):
#     # Set the model to evaluation mode
#     model.eval()
    
#     # Use GPU
#     device = torch.device("mps")
#     model.to(device)
    
#     # Perform inference
#     probabilities = []
#     for i in range(0, len(encodings["input_ids"]), batch_size):
#         with torch.no_grad():
#             batch = {
#                 "input_ids": encodings["input_ids"][i:i+batch_size].to(device),
#                 "attention_mask": encodings["attention_mask"][i:i+batch_size].to(device)
#             }
#             outputs = model(**batch)
#             probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
#             probabilities.extend(probs)
            
#         # Clear GPU memory after each batch
#         torch.mps.empty_cache()
    
#     return np.array(probabilities)

In [None]:
# # Get predictions for test data
# base_probs = predict(base_model, test_encodings, batch_size=8)

In [None]:
# # Get highest probability indices
# predicted_indices = np.argmax(base_probs, axis=1)  

In [None]:
# from sklearn.metrics import classification_report

# # Generate classification report
# report = classification_report(y_test, predicted_indices, target_names=le.classes_)
# print(report)

Note: This DeBERTa model is not designed for zero-shot classification; it is meant for supervised learning, so fine-tuning on our data is necessary. There are zero-shot DeBERTa variants by MoritzLaurer (Hugging Face) that can be used without training on our data.
Another option is BART, e.g. the facebook/bart-large-mnli model.

**Zero-Shot Learning** is the idea that a model trained on enough unlabeled data (unsupervised learning) can generalize at inference time to inputs or classes it was never explicitly trained on. It can be applied in NLP, computer vision, etc.

### Model Initialization

I had to change the configuration of `accelerate`, because it may still be configured for fp16 (mixed precision), which does not work on an Apple M1 Pro:
- run `accelerate config` in bash
- compute environment: "This machine"
- distributed training: no
- "Do you want to run your training on CPU only?": answer No, because the Apple M1 Pro has a GPU
- "Do you wish to optimize your script with torch dynamo?": answer No when using an Apple M1 Pro with the MPS backend
- "Do you want to use mixed precision?": answer No

In [None]:
# Count the distinct label values present in the training targets
num_classes = np.unique(y_train).size
num_classes  # bare expression: shown as the cell output

In [None]:
# Classification-head settings: one output per class, single-label problem
head_kwargs = {
    "num_labels": num_classes,
    "problem_type": "single_label_classification",
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_TRAINING_PATH, **head_kwargs)

# Force the model to use gradient checkpointing to save memory
model.gradient_checkpointing_enable()

### Class imbalance

In [None]:
# epoch = 3
# learning_rate=2e-5 #standard for deberta; maybe try 6e-6
# weight_decay=0.01
# per_device_train_batch_size=4 #small to save memory
# per_device_eval_batch_size=8 

In [None]:
# Hyperparameters that are logged to MLflow for this run
params = {
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 1,
    "evaluation_strategy": "epoch",
    "train_batch_size": 4,
    "eval_batch_size": 8,
}

# Point MLflow at the tracking server and select the experiment
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# start_run() hands back the run that is now active
run = mlflow.start_run()
print(f"Active run_id: {run.info.run_id}")

# Attach the model name and the hyperparameters to the run
mlflow.set_tag("model_name", MODEL_NAME)
mlflow.log_params(params)

In [None]:
# Build the trainer from the same `params` dict that was logged to MLflow, so
# the tracked hyperparameters cannot drift from the ones actually used.
trainer = createTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    output_dir=OUTPUT_DIR,
    y_train=y_train,
    class_weight=True,
    epochs=params["num_train_epochs"],
    learning_rate=params["learning_rate"],
    weight_decay=params["weight_decay"],
    train_batch_size=params["train_batch_size"],
    eval_batch_size=params["eval_batch_size"],
)


### Execute Training

In [None]:
# Clears unused GPU memory. Guarded so the cell is a no-op instead of an error
# on machines without the MPS backend.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [None]:
# Disable the upper limit for memory.
# NOTE(review): PyTorch presumably reads this variable when the MPS allocator
# is initialised; if MPS was already used earlier in the session the setting
# may have no effect - consider exporting it before importing torch. Confirm.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Allows up to 100% of available memory. Guarded because the call is only
# valid when the MPS backend is available.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(1.0)

In [None]:
# Announce the start of training in the log, then run the training loop.
# The call is left as the last bare expression so its return value is shown
# as the cell output.
logger.info('training is running')
trainer.train()

### Evaluation

In [None]:
def log_metrics(cr, brier, split):
    """Log a Brier score and a classification report to MLflow.

    All values are collected first and sent with a single
    ``mlflow.log_metrics`` call instead of one request per metric.

    Args:
        cr: Mapping of report entries. An entry whose value is itself a
            mapping (e.g. per-class scores) is logged as
            ``{split}_{key}_{metric}``; any other value (e.g. "accuracy") is
            logged as ``{split}_{key}``. Presumably the dict form of sklearn's
            classification report - confirm against get_eval_metrics.
        brier: Brier score, logged as ``{split}_brier``.
        split: Prefix identifying the data split, e.g. "train" or "test".
    """
    from collections.abc import Mapping

    metrics = {f"{split}_brier": brier}

    for key, value in cr.items():
        if isinstance(value, Mapping):
            # Nested entry: one metric per inner score
            for metric, score in value.items():
                metrics[f"{split}_{key}_{metric}"] = score
        else:
            # Scalar entry such as overall accuracy
            metrics[f"{split}_{key}"] = value

    mlflow.log_metrics(metrics)

In [None]:
logger.info('predict on train_dataset')
train_output = trainer.predict(train_dataset)

# Keep the report under its own name: binding it to `classification_report`
# would shadow the sklearn function imported at the top of the notebook.
train_report, brier = get_eval_metrics(train_output, le)
log_metrics(train_report, brier, "train")

In [None]:
logger.info('predict on test_dataset')
test_output = trainer.predict(test_dataset)

# Keep the report under its own name: binding it to `classification_report`
# would shadow the sklearn function imported at the top of the notebook.
test_report, brier = get_eval_metrics(test_output, le)
log_metrics(test_report, brier, "test")

### Save model

In [None]:
# Save the model with MLflow's PyTorch flavour
mlflow.pytorch.save_model(model, path=SAVE_PATH)

# Parameters and metrics are logged and the model is stored: close the run
# opened with mlflow.start_run() so it is not left in the RUNNING state and a
# later start_run() does not fail because a run is still active.
mlflow.end_run()

### Load model

In [None]:
import mlflow.pytorch

# NOTE(review): this directory differs from SAVE_PATH used when saving above,
# so a different stored model is loaded here - confirm this is intended.
path_pt = "../models/LLM_deberta_v3_small_class_imbalance/pytorch_model"

# Restore the stored PyTorch model from the local MLflow model directory
model = mlflow.pytorch.load_model(model_uri=path_pt)

### Make predictions based on reloaded model

In [None]:
## Function for prediction

def predict(model, encodings, batch_size=8, device=None):
    """Return softmax class probabilities for tokenized inputs, batch by batch.

    Args:
        model: Model exposing ``eval()`` and ``to(device)``; calling it with
            the batch tensors as keyword arguments must return an object
            with a ``logits`` attribute.
        encodings: Mapping of input name to tensor. Must contain
            ``"input_ids"``; every value is sliced along its first dimension,
            so all tensors are assumed to have the same number of rows.
        batch_size: Number of rows per forward pass (must be >= 1).
        device: Device to run on (string or ``torch.device``). When omitted,
            MPS is used if available, then CUDA, otherwise the CPU.

    Returns:
        ``np.ndarray`` with one row of probabilities per input row, in input
        order (an empty array when there are no inputs).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Pick the best available device instead of assuming Apple's MPS backend
    if device is None:
        if torch.backends.mps.is_available():
            device = "mps"
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
    device = torch.device(device)

    # Set the model to evaluation mode and move it to the target device
    model.eval()
    model.to(device)

    probabilities = []
    n_rows = len(encodings["input_ids"])
    with torch.no_grad():  # inference only: no gradients needed
        for start in range(0, n_rows, batch_size):
            batch = {
                key: val[start:start + batch_size].to(device)
                for key, val in encodings.items()
            }
            outputs = model(**batch)
            probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
            probabilities.extend(probs)

            # Clear GPU memory after each batch (only applicable on MPS)
            if device.type == "mps":
                torch.mps.empty_cache()

    return np.array(probabilities)

In [None]:
#needed to reduce the batch size, otherwise I had an error
# NOTE(review): `test_encodings` is not defined in any visible cell (the data
# cell only yields train_dataset, test_dataset, y_train and le) - it must be
# created before this cell can run; confirm where it should come from.
# Get predictions for test data
base_probs = predict(model, test_encodings, batch_size=2)

In [None]:
# Index of the most probable class in every row
predicted_labels = base_probs.argmax(axis=1)

In [None]:
# Get second highest probability indices.
# The top prediction is masked out and argmax is taken again, so the second
# label always differs from the first one - even when probabilities are tied
# (taking position -2 of an argsort can return the argmax index again on ties).
# With a single class there is no alternative and index 0 is returned.
masked_probs = base_probs.copy()
masked_probs[np.arange(len(predicted_labels)), predicted_labels] = -np.inf
second_predicted_labels = np.argmax(masked_probs, axis=1)

In [None]:
# Probability assigned to the top predicted class of every row
top_columns = predicted_labels[:, np.newaxis]
predicted_label_probs = np.take_along_axis(base_probs, top_columns, axis=1)[:, 0]

In [None]:
# Second-largest probability of every row: partition each row so that the
# element at sorted position -2 is in place, then pick that column
second_predicted_label_probs = np.partition(base_probs, -2, axis=1)[:, -2]

In [None]:
# Bundle the top-2 predictions and their probabilities for the backend
result = {}
result["predicted_labels"] = predicted_labels
result["predicted_label_probs"] = predicted_label_probs
result["second_predicted_labels"] = second_predicted_labels
result["second_predicted_label_probs"] = second_predicted_label_probs

In [None]:
# Re-import so that `classification_report` refers to the sklearn function
# here even if the name was rebound earlier in the session.
from sklearn.metrics import classification_report

# NOTE(review): `y_test` is not defined in any visible cell (the data cell
# only yields train_dataset, test_dataset, y_train and le) - it must hold the
# true test labels, in the same order as the predictions; confirm its source.
# Generate classification report
report = classification_report(y_test, predicted_labels, target_names=le.classes_)
print(report)

# Generate confusion matrix
cm = confusion_matrix(y_test, predicted_labels)
print("Confusion Matrix:")
print(cm)

In [None]:
# from sklearn.preprocessing import LabelBinarizer

# # 1. One-hot encode the true labels (y_test)
# lb = LabelBinarizer()
# y_true_onehot = lb.fit_transform(y_test)  # Shape: (n_samples, n_classes)

# # 2. Compute Brier score for multiclass
# brier_score = np.mean(np.sum((base_probs - y_true_onehot) ** 2, axis=1))
# print("Multiclass Brier score:", brier_score)