### Model Paramters

In [None]:
# % pip install torch sklearn transformers bitsandbytes pandas

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import pandas as pd
import os

In [7]:
# No gpu
amp_scaler = False

# if gpu
if torch.cuda.is_available():
    amp_scaler = True
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

No GPU available. Training will run on CPU.


In [None]:
from huggingface_hub import login
login('')

  from .autonotebook import tqdm as notebook_tqdm


HTTPError: Invalid user token.

In [None]:
# Load pre-trained LLM model and tokenizer (for paraphrasing)
# Appropriate choice required for proper usage

from transformers import PreTrainedModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from peft import LoraConfig, get_peft_model

MY_DEVICE = torch.device("cuda" if amp_scaler else "cpu")
print(MY_DEVICE)

cpu


In [None]:
MY_ADAPTERS = {
    "Politeness": ({0: "Not-polite", 1: "Polite"}, {"Not-polite": 0, "Polite": 1}), 
    "Toxicity": ({0: "Non-Toxic", 1: "Toxic"}, {"Non-Toxic": 0, "Toxic": 1}), 
    "Fluency": ({0: "Not-Fluent", 1: "Fluent"}, {"Not-Fluent": 0, "Fluent": 1}), 
    "Factual": ({0: "fake-news", 1: "real-news"}, {"fake-news": 0, "real-news": 1})
}

CURRENT_ADAPTER = "Toxicity"

In [12]:
####################################
# General Parameter
ID2LABEL, LABEL2ID = MY_ADAPTERS[CURRENT_ADAPTER]

# Selected Hyperparameters
MAX_SEQUENCE_LENGTH = 1024
TRUNCATION = True
PADDING = "max_length"
RETURN_TENSORS = "pt"

D_TYPE = torch.bfloat16
MODEL_4BIT = True

# Save
ADAPTER_NAME = CURRENT_ADAPTER + "Adapter"
SAVE_DIR = f"model_{CURRENT_ADAPTER.lower()}_finetuned"
SAVE_tokenizer = False

print(ADAPTER_NAME, ID2LABEL)
####################################

model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer_id = model_id

####################################
if(not os.path.exists(SAVE_DIR)):
    os.mkdir(SAVE_DIR)

ToxicityAdapter {0: 'Non-Toxic', 1: 'Toxic'}


In [None]:
# Model
def initialise(model_id, tokenizer_id):
    global model, tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id, 
        torch_dtype= D_TYPE,
        max_length = MAX_SEQUENCE_LENGTH,

        load_in_4bits= MODEL_4BIT,

        # Model label map
        num_labels= 2,
        id2label= ID2LABEL,
        label2id= LABEL2ID,
        )

    model.config.use_cache= False

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_id,
        max_length = MAX_SEQUENCE_LENGTH,
        truncation=TRUNCATION,
        padding= PADDING,
        padding_side= "right")

In [14]:
initialise(model_id, tokenizer_id)
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
print(model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [15]:
# Access special tokens and their IDs
print(f"BOS token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(f"EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")
print(f"CLS token: {tokenizer.cls_token}, ID: {tokenizer.cls_token_id}")
print(f"SEP token: {tokenizer.sep_token}, ID: {tokenizer.sep_token_id}")
print(f"PAD token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
print(f"UNK token: {tokenizer.unk_token}, ID: {tokenizer.unk_token_id}")
print(f"MASK token: {tokenizer.mask_token}, ID: {tokenizer.mask_token_id}")

# Set the end of string tokens
if(tokenizer.pad_token is None):
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    
if(model.config.pad_token_id is None):
  model.config.pad_token_id = tokenizer.eos_token_id

NameError: name 'tokenizer' is not defined

In [None]:
def check_model_params(model):
    for name, param in model.named_parameters():
        # param.requires_grad = False
        print(f'{name} requires grad = {param.requires_grad}')
# check_model_params(model)

In [None]:
# LoRA configuration
lora_config = LoraConfig(
    task_type="SEQ_CLS", 

    r= 64,                  # The low-rank dimension
    lora_alpha= 128,         # Scaling factor for LoRA layers
    lora_dropout=0.1,      # Dropout probability


    target_modules=["q_proj", "v_proj"],  # Target layers
    bias="none"                         # Don't apply bias
)

# set_lora
model = get_peft_model(model, lora_config, adapter_name= ADAPTER_NAME)
model.print_trainable_parameters()

In [None]:
def check_lora_params(model):
    for name, param in model.named_parameters():
        if param.requires_grad is not None:
            print(f"Gradients for {name} exist. and requires grad = {param.requires_grad}")
        else:
            print(f"No gradients for {name}.")

# check_lora_params(model)

In [None]:
####################################
# Training Parameters
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.001

BATCH_SIZE = 8  # Increased for stability
NUMBER_OF_EPOCHS = 20

ACCUMULATION_STEPS = 1
GRAD_CLIP = 1.0  # Added for stability

####################################
TRAIN_DATA_PATH = 'en_train.csv'
RAND_SHUFFLE = False
NUM_ROWS = None

In [None]:
Extra_prompts = {
    "Politeness": "Check for words like Thank you, Please, Welcome, very much, and other words of gratitude, courtesy and politeness.", 
    "Toxicity": "Check for presence of any Toxic, Hateful, Targeted, or any form of speech or profanity aimed to hurt sentiments.", 
    "Fluency": "Check for bad grammar, spelling mistakes, unintelligible or incomprehensible words and sentences. Keep check for brevity, clarity and unity between the statements.", 
    "Factual": "Give more attention to named entity, dates and numbers, general facts, consistent statements."
}

def prompt_template(sentence: str):
    return f'''  
    Classification: 0 for {ID2LABEL[0]} and 1 for {ID2LABEL[1]}.
    {Extra_prompts[CURRENT_ADAPTER]}
    Sentence => {sentence}
    '''

In [9]:
def tokenize_prompt(sentence, tokenizer):
    # Create the prompt
    tokenized_inputs = tokenizer(
        prompt_template(sentence),
        add_special_tokens= False,

        max_length= MAX_SEQUENCE_LENGTH,
        truncation= TRUNCATION,
        padding= PADDING,                   # Ensure proper padding
        return_tensors= RETURN_TENSORS      # Return tensors for PyTorch
    )

    # Tokenize inputs
    return tokenized_inputs

# Tokenize the prompt
tokenized_inputs = tokenize_prompt("hello, you?", tokenizer)
print(tokenized_inputs['input_ids'].shape)

def detokenizer(tokenized_inputs_ids, tokenizer):
    for sample in tokenized_inputs_ids:
        for tokens in sample:
            # print(tokens.shape)
            decoded_text = tokenizer.decode(tokens, skip_special_tokens= False)
            print(decoded_text)

detokenizer(tokenized_inputs['input_ids'].unsqueeze(0), tokenizer)

NameError: name 'tokenizer' is not defined

In [None]:
####################################

# Customise for each Adapter's Training dataset
def torch_scores(score):
    # negative scores
    if(score <= 0):
        return torch.Tensor([0.0, 1.0])
    # positive scores
    else:
        return torch.Tensor([1.0, 0.0])

for score in [0, 1,-1]:
    print(torch_scores(score))
####

# Custom Function for training data
def data_to_torch(data_series: pd.Series):

    # Customise for each Adapter's Training dataset
    tokenized_inputs = tokenize_prompt(data_series['sentence'], tokenizer)
    ####

    # Tokenized input sentence
    input_ids = tokenized_inputs['input_ids']            
    attention_mask = tokenized_inputs['attention_mask']

    # score
    labels = torch_scores(data_series['score'])
    
    # All tensors
    return (
        input_ids,
        attention_mask,
        labels
    )

####################################

In [None]:
####################################

# dataset class for the News
class SentenceDataset(Dataset):
    def __init__(self, data_frame_addrs, num_rows = None):
        self.data_frame = pd.read_csv(data_frame_addrs)
        if(num_rows is not None and num_rows > 0):
            self.data_frame = pd.read_csv(data_frame_addrs, nrows= num_rows)

        # Total samples
        print(self.data_frame.shape) 
        self.length = self.data_frame.shape[0]

    def print_data(self):
        print(self.data_frame)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):        
        # Returns Series
        return data_to_torch(self.data_frame.iloc[idx])

NameError: name 'NUM_ROWS' is not defined

In [None]:
# Define compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch", #evaluate at the end of each epoch
    save_strategy="epoch", #save at the end of each epoch
    learning_rate=2e-5,
    push_to_hub=False, #set to true, to push to huggingface hub.
    report_to="none", #use "tensorboard" or "wandb" for logging.
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= SentenceDataset(TRAIN_DATA_PATH, num_rows= NUM_ROWS),
    # eval_dataset=  # for evaluation
    compute_metrics=compute_metrics,
)

# Training
trainer.train()

In [None]:
####################################
sentence_loader = DataLoader(
            SentenceDataset(TRAIN_DATA_PATH, num_rows= NUM_ROWS), 
            batch_size= BATCH_SIZE, shuffle= RAND_SHUFFLE,

            # Access data from CPU faster
            pin_memory= True
            )


for (input_ids, attention_mask, labels) in sentence_loader:
    print(input_ids.shape)
    print(attention_mask.shape)
    print(labels.shape)
    break

####################################
# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr= LEARNING_RATE, weight_decay= WEIGHT_DECAY)

In [None]:
#############################################################

# Training

# Training function
def train_model(model: PreTrainedModel, dataloader, num_epochs):
    model.to(MY_DEVICE)
    model.train()  # Set model to "train" mode

    least_loss = torch.inf
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (input_ids, attention_mask, labels) in enumerate(dataloader):

            # Send to Device
            input_ids = input_ids.squeeze(1).to(MY_DEVICE)
            attention_mask = attention_mask.squeeze(1).to(MY_DEVICE)
            labels = labels.to(MY_DEVICE)

            # detokenizer(input_ids, tokenizer)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            
            # print(outputs, type(outputs))
            # print(labels, type(labels))
            # print(outputs.shape, labels.shape)

            # Loss
            loss = criterion(outputs, labels)
            # print(loss)

            loss.backward()

            # Apply gradient clipping before optimizer step
            clip_grad_norm_(model.parameters(), GRAD_CLIP)

            # Optimizer
            if (batch_idx + 1) % ACCUMULATION_STEPS == 0:

                # Backward pass and optimization
                optimizer.step()  # Update weights
                optimizer.zero_grad()  # Reset gradients
                torch.cuda.empty_cache() #clear cache.

            running_loss += loss.item() * ACCUMULATION_STEPS

        # Print loss at the end of epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader)}")

        # Save model at the end of each epoch (best only)
        if(running_loss <= least_loss):
            least_loss = running_loss
            model.save_pretrained(SAVE_DIR)
            print(f"Model saved after epoch {epoch+1}")

    print("Training Done!")
    if(SAVE_tokenizer):
        tokenizer.save_pretrained(SAVE_DIR)

# Train the model
# train_model(model, sentence_loader, num_epochs= NUMBER_OF_EPOCHS)

#### google colab
*distil-bert-uncased*\
*15 epoch, 3853 sample, 8 batch, 2e-4 lr, 0.0001 weight decay*

- Epoch [1/15], Loss: 0.55607004278181
Model saved after epoch 1
- Epoch [2/15], Loss: 0.45818951559017307
Model saved after epoch 2
- Epoch [3/15], Loss: 0.4123563087270962
Model saved after epoch 3
- Epoch [4/15], Loss: 0.3484177553863941
Model saved after epoch 4
- Epoch [5/15], Loss: 0.2855674719043787
Model saved after epoch 5
- Epoch [6/15], Loss: 0.22420200263439868
Model saved after epoch 6
- Epoch [7/15], Loss: 0.1738988224651734
Model saved after epoch 7
- Epoch [8/15], Loss: 0.1348895297262091
Model saved after epoch 8
- Epoch [9/15], Loss: 0.09098394702408942
Model saved after epoch 9
- Epoch [10/15], Loss: 0.09189997084778868
Model saved after epoch 10
- Epoch [11/15], Loss: 0.06747804714668218
Model saved after epoch 11
- Epoch [12/15], Loss: 0.05844199156612776
Model saved after epoch 12
- Epoch [13/15], Loss: 0.05736454590938645
Model saved after epoch 13
- Epoch [14/15], Loss: 0.03944359332732702
Model saved after epoch 14
- Epoch [15/15], Loss: 0.05101929647080925
Model saved after epoch 15\
Training Done!

In [None]:
# from google.colab import files
# files.download("/content/file.zip")

In [None]:
# !zip -r /content/file.zip /content/{folder name}

### Model test

In [None]:
initialise(model_id, (model_id if(SAVE_tokenizer) else SAVE_DIR))
# check_model_params(model)

model.load_adapter(SAVE_DIR + ADAPTER_NAME)
# check_lora_params(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def test_model(model, dataloader):
    model.eval()
    sigmoid_f = nn.Sigmoid()

    running_loss = 0.0
    for (input_ids, attention_mask, labels) in dataloader:
        # Send to Device
        input_ids = input_ids.squeeze(1).to(MY_DEVICE)
        attention_mask = attention_mask.squeeze(1).to(MY_DEVICE)
        labels = labels.unsqueeze(0).to(MY_DEVICE)

        # detokenizer(input_ids, tokenizer)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        
        # print(sigmoid_f(outputs), type(outputs))
        # print(labels, type(labels))
        # print(outputs.shape, labels.shape)

        # Loss
        loss = criterion(outputs, labels)

        running_loss += loss.item()
    # Print loss at the end of epoch
    print(f"Loss: {running_loss/len(dataloader)}")

test_loader = SentenceDataset('en_test.csv', num_rows= 120)
test_model(model, test_loader)

(120, 2)
Loss: 1.440581500530243
