### Model Parameters

In [4]:
# ! pip install torch transformers peft pandas

In [5]:
# ! pip install bitsandbytes scikit-learn
# ! pip install tqdm
from tqdm import tqdm

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, Dataset
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import pandas as pd
import os

In [7]:
# True when a CUDA device is visible, False on CPU-only machines
amp_scaler = torch.cuda.is_available()

# Tell the reader which hardware the rest of the notebook will use
if not amp_scaler:
    print("No GPU available. Training will run on CPU.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")

No GPU available. Training will run on CPU.


In [None]:
from huggingface_hub import login

# Never paste the access token into the notebook: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment
# variable instead; when the variable is unset, `token=None` makes `login`
# fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
!huggingface-cli whoami

Mathwizard1


In [10]:
# Load pre-trained LLM model and tokenizer (for paraphrasing)
# Appropriate choice required for proper usage

from transformers import PreTrainedModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from peft import LoraConfig, get_peft_model

# Run on the GPU when one was detected earlier, otherwise stay on the CPU
if amp_scaler:
    MY_DEVICE = torch.device("cuda")
else:
    MY_DEVICE = torch.device("cpu")
print(MY_DEVICE)

cpu


In [11]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
# Every adapter maps to an (id2label, label2id) pair. Both dictionaries are
# derived from a single (label for 0, label for 1) tuple so they always agree.
MY_ADAPTERS = {
    adapter: (
        dict(enumerate(label_pair)),
        {label_name: label_idx for label_idx, label_name in enumerate(label_pair)},
    )
    for adapter, label_pair in (
        ("Politeness", ("Not-polite", "Polite")),
        ("Toxicity", ("Non-Toxic", "Toxic")),
        ("Fluency", ("Not-Fluent", "Fluent")),
        ("Factual", ("fake-news", "real-news")),
        ("Convincing", ("Non-convincing", "Convincing")),
    )
}

# Adapter that the rest of the notebook trains / evaluates
CURRENT_ADAPTER = "Convincing"

In [13]:
####################################
# General Parameter
# Label maps (id -> name, name -> id) for the adapter selected above
ID2LABEL, LABEL2ID = MY_ADAPTERS[CURRENT_ADAPTER]

# Selected Hyperparameters (forwarded to the tokenizer calls below)
MAX_SEQUENCE_LENGTH = 512
TRUNCATION = True
PADDING = "max_length"
RETURN_TENSORS = "pt"

# dtype the model weights are loaded in
D_TYPE = torch.bfloat16

# Save
ADAPTER_NAME = CURRENT_ADAPTER + "Adapter"
SAVE_DIR = f"model_{CURRENT_ADAPTER.lower()}_finetuned"
SAVE_tokenizer = False

print(ADAPTER_NAME, ID2LABEL)
####################################

model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer_id = model_id

####################################
# Create the checkpoint directory. `exist_ok=True` keeps the cell re-runnable
# without a separate existence check, and missing parent folders are created.
os.makedirs(SAVE_DIR, exist_ok=True)

ConvincingAdapter {0: 'Non-convincing', 1: 'Convincing'}


In [14]:
# Model
def initialise(model_id, tokenizer_id):
    """Load the classifier and its tokenizer into the notebook globals.

    Rebinds the module-level names ``model`` and ``tokenizer``; nothing is
    returned.

    Args:
        model_id: Checkpoint name or path handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer_id: Checkpoint name or path handed to
            ``AutoTokenizer.from_pretrained``.
    """
    global model, tokenizer

    # Binary classification head labelled with the current adapter's names
    label_settings = dict(
        num_labels=2,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        torch_dtype=D_TYPE,
        max_length=MAX_SEQUENCE_LENGTH,
        **label_settings,
    )
    model.config.use_cache = False

    # Tokenizer, configured with the same length / padding settings
    tokenizer_settings = dict(
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=TRUNCATION,
        padding=PADDING,
        padding_side="right",
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, **tokenizer_settings)

In [15]:
initialise(model_id, tokenizer_id)
# Size of the freshly loaded model (set as a global by `initialise`), in megabytes
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
# GPU bytes currently allocated; skipped when no CUDA device was detected
if amp_scaler:
    print(torch.cuda.memory_allocated())
# Dump the module tree so layer names (e.g. q_proj / v_proj) can be looked up
print(model)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory footprint: 2471.64 MB
LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRM

In [16]:
# Access special tokens and their IDs
print(f"BOS token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(f"EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")
print(f"CLS token: {tokenizer.cls_token}, ID: {tokenizer.cls_token_id}")
print(f"SEP token: {tokenizer.sep_token}, ID: {tokenizer.sep_token_id}")
print(f"PAD token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
print(f"UNK token: {tokenizer.unk_token}, ID: {tokenizer.unk_token_id}")
print(f"MASK token: {tokenizer.mask_token}, ID: {tokenizer.mask_token_id}")

# The tokenizer ships without a padding token (see the printout above), so
# reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model's pad id identical to the one the tokenizer actually pads
# with. Copying `eos_token_id` here would diverge from the tokenizer whenever
# the tokenizer already defines its own pad token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

BOS token: <|begin_of_text|>, ID: 128000
EOS token: <|eot_id|>, ID: 128009
CLS token: None, ID: None
SEP token: None, ID: None
PAD token: None, ID: None
UNK token: None, ID: None
MASK token: None, ID: None


In [17]:
def check_model_params(model):
    """Print, for every named parameter of ``model``, whether it requires grad."""
    for param_name, tensor in model.named_parameters():
        # tensor.requires_grad = False
        trainable = tensor.requires_grad
        print(f'{param_name} requires grad = {trainable}')
# check_model_params(model)

In [18]:
# LoRA configuration for the sequence-classification model loaded above
lora_config = LoraConfig(
    task_type="SEQ_CLS", 

    r= 64,                  # The low-rank dimension
    lora_alpha= 128,         # Scaling factor for LoRA layers
    lora_dropout=0.1,      # Dropout probability


    target_modules=["q_proj", "v_proj"],  # Attention query / value projections (names from the printed model)
    bias="none"                         # Bias setting "none" (per PEFT docs: no bias terms are trained)
)

# Wrap the base model with the adapter, registered under ADAPTER_NAME,
# then report how many parameters remain trainable
model = get_peft_model(model, lora_config, adapter_name= ADAPTER_NAME)
model.print_trainable_parameters()

trainable params: 6,819,840 || all params: 1,242,638,336 || trainable%: 0.5488


In [19]:
def check_lora_params(model):
    """Report, per named parameter, whether a gradient has been populated.

    A parameter's ``.grad`` is ``None`` until a backward pass has reached it,
    so this is useful after ``loss.backward()`` to confirm that the adapter
    weights actually receive gradients.

    Args:
        model: Any ``torch.nn.Module``.
    """
    for name, param in model.named_parameters():
        # `requires_grad` is a bool and is never None, so testing it made the
        # "No gradients" branch unreachable; the populated `.grad` tensor is
        # what tells us whether gradients exist.
        if param.grad is not None:
            print(f"Gradients for {name} exist. and requires grad = {param.requires_grad}")
        else:
            print(f"No gradients for {name}.")

# check_lora_params(model)

In [20]:
####################################
# Training Parameters
# Optimizer settings (passed to AdamW)
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.001

BATCH_SIZE = 8  # Increased for stability
NUMBER_OF_EPOCHS = 20

# Batches per optimizer step; the training loop only honours this when
# CLEAR_CACHE is True
ACCUMULATION_STEPS = 1
GRAD_CLIP = 1.0  # Added for stability (max gradient norm)

# When True the training loop accumulates gradients and empties the CUDA cache
CLEAR_CACHE = False
####################################
# CSV file the training samples are read from
TRAIN_DATA_PATH = 'final_dataset.csv'
# Forwarded to the DataLoader's `shuffle` argument
RAND_SHUFFLE = False
# Number of CSV rows to load (a small value keeps local smoke runs fast)
NUM_ROWS = 8

In [21]:
# Adapter-specific hint that is inserted into every classification prompt
Extra_prompts = {
    "Politeness": "Check for words like Thank you, Please, Welcome, very much, and other words of gratitude, courtesy and politeness.",
    "Toxicity": "Check for presence of any Toxic, Hateful, Targeted, or any form of speech or profanity aimed to hurt sentiments.",
    "Fluency": "Check for bad grammar, spelling mistakes, unintelligible or incomprehensible words and sentences. Keep check for brevity, clarity and unity between the statements.",
    "Factual": "Give more attention to named entity, dates and numbers, general facts, consistent statements.",
    "Convincing": "Check if the statement seems convincing to the user in terms of language and message to inform the user.",
}

def prompt_template(sentence: str):
    """Wrap ``sentence`` in the classification prompt of the current adapter.

    The prompt names both labels from ``ID2LABEL``, adds the hint stored in
    ``Extra_prompts`` for ``CURRENT_ADAPTER`` and ends with the sentence.
    """
    negative_name = ID2LABEL[0]
    positive_name = ID2LABEL[1]
    adapter_hint = Extra_prompts[CURRENT_ADAPTER]
    return f'''  
    Classification: 0 for {negative_name} and 1 for {positive_name}.
    {adapter_hint}
    Sentence => {sentence}
    '''

In [22]:
def tokenize_prompt(sentence, tokenizer):
    """Build the classification prompt for ``sentence`` and tokenize it.

    Args:
        sentence: Raw text to classify.
        tokenizer: Callable tokenizer; it receives the prompt plus the
            notebook-level length, truncation, padding and tensor settings.

    Returns:
        Whatever the tokenizer returns for the prompt.
    """
    prompt_text = prompt_template(sentence)
    encode_options = {
        "add_special_tokens": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": TRUNCATION,
        "padding": PADDING,                 # pad every prompt to the same length
        "return_tensors": RETURN_TENSORS,   # PyTorch tensors
    }
    return tokenizer(prompt_text, **encode_options)

# Smoke test: encode a short sentence and show the resulting shape
tokenized_inputs = tokenize_prompt("hello, you?", tokenizer)
print(tokenized_inputs['input_ids'].shape)

def detokenizer(tokenized_inputs_ids, tokenizer):
    """Decode and print every token sequence in a nested batch of ids.

    Args:
        tokenized_inputs_ids: Iterable of samples, each itself an iterable of
            token-id sequences.
        tokenizer: Object exposing ``decode(tokens, skip_special_tokens=...)``.
    """
    for sample_rows in tokenized_inputs_ids:
        # Special tokens are kept so padding stays visible in the printout
        decoded_rows = (
            tokenizer.decode(row, skip_special_tokens=False)
            for row in sample_rows
        )
        for text in decoded_rows:
            print(text)

# `detokenizer` walks two nested levels, so add an outer axis to the ids first
detokenizer(tokenized_inputs['input_ids'].unsqueeze(0), tokenizer)

torch.Size([1, 512])
  
    Classification: 0 for Non-convincing and 1 for Convincing.
    Check if the statement seems convincing to the user in terms of language and message to inform the user.
    Sentence => hello, you?
    <|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|

In [23]:
####################################

# Customise for each Adapter's Training dataset
def torch_labels(label):
    """Convert a scalar label into a two-element one-hot float tensor.

    Any value greater than zero maps to class 1 (``[0., 1.]``); zero and
    negative values map to class 0 (``[1., 0.]``).
    """
    one_hot = [0.0, 1.0] if label > 0 else [1.0, 0.0]
    return torch.Tensor(one_hot)

# Show the encoding for a zero, a positive and a negative label
for label in (0, 1, -1):
    print(torch_labels(label))
####

# Custom Function for training data
def data_to_torch(data_series: pd.Series):
    """Turn one dataset row into the tensors used for training.

    Customise the column names here for each adapter's training dataset.

    Args:
        data_series: Row providing a ``'sentence'`` text and a ``'label'``.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)``.
    """
    # Prompt + tokenize the sentence with the notebook-level tokenizer
    encoded = tokenize_prompt(data_series['sentence'], tokenizer)
    token_ids, mask = encoded['input_ids'], encoded['attention_mask']

    # One-hot target built from the row's label
    return token_ids, mask, torch_labels(data_series['label'])

####################################

tensor([1., 0.])
tensor([0., 1.])
tensor([1., 0.])


In [24]:
####################################

# dataset class for the News
class SentenceDataset(Dataset):
    """Dataset that serves rows of a CSV file as training tensors.

    Args:
        data_frame_addrs: Path (or anything ``pandas.read_csv`` accepts) of
            the CSV file. Malformed lines are skipped.
        num_rows: When a positive integer, only the first ``num_rows`` rows
            are loaded; ``None`` (or a non-positive value) loads everything.
    """

    def __init__(self, data_frame_addrs, num_rows = None):
        # Read the file exactly once. Previously the whole CSV was parsed and
        # then thrown away whenever a row limit was requested.
        row_limit = num_rows if (num_rows is not None and num_rows > 0) else None
        self.data_frame = pd.read_csv(
            data_frame_addrs, nrows= row_limit, on_bad_lines= 'skip'
        )

        # Total samples
        print(self.data_frame.shape)
        self.length = self.data_frame.shape[0]

    def print_data(self):
        """Print the loaded DataFrame."""
        print(self.data_frame)

    def __len__(self):
        """Number of loaded rows."""
        return self.length

    def __getitem__(self, idx):
        """Return the tensors produced by ``data_to_torch`` for row ``idx``."""
        # `iloc` yields the row as a Series
        return data_to_torch(self.data_frame.iloc[idx])

In [25]:
# # Define compute metrics function
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         labels, preds, average="binary"
#     )
#     acc = accuracy_score(labels, preds)
#     return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10,
#     evaluation_strategy="epoch", #evaluate at the end of each epoch
#     save_strategy="epoch", #save at the end of each epoch
#     learning_rate=2e-5,
#     push_to_hub=False, #set to true, to push to huggingface hub.
#     report_to="none", #use "tensorboard" or "wandb" for logging.
# )

# # Trainer setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset= SentenceDataset(TRAIN_DATA_PATH, num_rows= NUM_ROWS),
#     # eval_dataset=  # for evaluation
#     compute_metrics=compute_metrics,
# )

# # Training
# trainer.train()

In [26]:
####################################
sentence_loader = DataLoader(
            SentenceDataset(TRAIN_DATA_PATH, num_rows= NUM_ROWS), 
            batch_size= BATCH_SIZE, shuffle= RAND_SHUFFLE,

            # Pinned host memory only speeds up host-to-GPU copies, so enable
            # it only when a CUDA device exists.
            pin_memory= torch.cuda.is_available(),
            )


# Peek at the first batch to confirm the tensor shapes
for (input_ids, attention_mask, labels) in sentence_loader:
    print(input_ids.shape)
    print(attention_mask.shape)
    print(labels.shape)
    break

####################################
# Loss function and optimizer
# BCE-with-logits pairs with the two-element one-hot targets from `torch_labels`
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr= LEARNING_RATE, weight_decay= WEIGHT_DECAY)

(8, 2)


KeyError: 'sentence'

In [None]:
#############################################################

# Training

# Training function
def train_model(model: PreTrainedModel, dataloader, num_epochs):
    """Fine-tune ``model`` and checkpoint it whenever the epoch loss improves.

    Uses the notebook-level ``criterion``, ``optimizer``, ``MY_DEVICE``,
    ``GRAD_CLIP``, ``CLEAR_CACHE``, ``ACCUMULATION_STEPS``, ``SAVE_DIR``,
    ``SAVE_tokenizer`` and ``tokenizer``.

    Args:
        model: Model whose forward output exposes ``.logits``.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches;
            ``labels`` are one-hot rows matching the logits' shape.
        num_epochs: Number of passes over ``dataloader``.
    """
    print("training started")

    model.to(MY_DEVICE)
    model.train()  # Set model to "train" mode

    # As before, gradient accumulation is only active together with CLEAR_CACHE;
    # otherwise the weights are updated after every batch.
    accumulation_steps = max(1, ACCUMULATION_STEPS) if CLEAR_CACHE else 1
    num_batches = len(dataloader)

    least_loss = torch.inf
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        seen_samples = 0

        # Drop gradients left over from an interrupted earlier run
        optimizer.zero_grad()
        tqdm_loop = tqdm(enumerate(dataloader), total= num_batches)

        for batch_idx, (input_ids, attention_mask, labels) in tqdm_loop:

            # Send to Device (the dataset adds a singleton axis at dim 1)
            input_ids = input_ids.squeeze(1).to(MY_DEVICE)
            attention_mask = attention_mask.squeeze(1).to(MY_DEVICE)
            labels = labels.to(MY_DEVICE)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Loss
            loss = criterion(outputs, labels)

            # Scale so that the accumulated gradient is the mean over the
            # micro-batches rather than their sum.
            (loss / accumulation_steps).backward()

            # Update on every `accumulation_steps`-th batch and on the final
            # batch, so a trailing partial group is not silently discarded.
            is_update_step = (
                (batch_idx + 1) % accumulation_steps == 0
                or (batch_idx + 1) == num_batches
            )
            if is_update_step:
                # Clip the fully accumulated gradient right before the step
                clip_grad_norm_(model.parameters(), GRAD_CLIP)
                optimizer.step()  # Update weights
                optimizer.zero_grad()  # Reset gradients

                if CLEAR_CACHE and torch.cuda.is_available():
                    torch.cuda.empty_cache()  # clear cache

            # `loss` itself is unscaled, so no correction factor is needed
            running_loss += loss.item()

            # Running accuracy: predicted class vs. the hot index of the target
            with torch.no_grad():
                predicted = outputs.argmax(dim=-1)
                expected = labels.argmax(dim=-1)
                correct_predictions += (predicted == expected).sum().item()
                seen_samples += expected.numel()

            tqdm_loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
            tqdm_loop.set_postfix(
                loss= loss.item(),
                acc= correct_predictions / max(seen_samples, 1),
            )

        # Print loss at the end of epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/max(num_batches, 1)}")

        # Save model at the end of each epoch (best only)
        if(running_loss <= least_loss):
            least_loss = running_loss
            model.save_pretrained(SAVE_DIR)
            print(f"Model saved after epoch {epoch+1}")

    print("Training Done!")
    if(SAVE_tokenizer):
        tokenizer.save_pretrained(SAVE_DIR)

# Train the model
train_model(model, sentence_loader, num_epochs= NUMBER_OF_EPOCHS)

training started


  0%|          | 0/1 [00:30<?, ?it/s]


KeyboardInterrupt: 

#### google colab
*distil-bert-uncased*\
*15 epoch, 3853 sample, 8 batch, 2e-4 lr, 0.001 weight decay*

- Epoch [1/15], Loss: 0.55607004278181
Model saved after epoch 1
- Epoch [2/15], Loss: 0.45818951559017307
Model saved after epoch 2
- Epoch [3/15], Loss: 0.4123563087270962
Model saved after epoch 3
- Epoch [4/15], Loss: 0.3484177553863941
Model saved after epoch 4
- Epoch [5/15], Loss: 0.2855674719043787
Model saved after epoch 5
- Epoch [6/15], Loss: 0.22420200263439868
Model saved after epoch 6
- Epoch [7/15], Loss: 0.1738988224651734
Model saved after epoch 7
- Epoch [8/15], Loss: 0.1348895297262091
Model saved after epoch 8
- Epoch [9/15], Loss: 0.09098394702408942
Model saved after epoch 9
- Epoch [10/15], Loss: 0.09189997084778868
Model saved after epoch 10
- Epoch [11/15], Loss: 0.06747804714668218
Model saved after epoch 11
- Epoch [12/15], Loss: 0.05844199156612776
Model saved after epoch 12
- Epoch [13/15], Loss: 0.05736454590938645
Model saved after epoch 13
- Epoch [14/15], Loss: 0.03944359332732702
Model saved after epoch 14
- Epoch [15/15], Loss: 0.05101929647080925
Model saved after epoch 15\
Training Done!

### Vast ai
llama-3.2-3b-Instruct\
Politeness\
20 epoch, 4353, adamW, 8 batch, 2e-4, 1e-3
- Epoch [1/20], Loss: 0.5399857196785988
Model saved after epoch 1
- Epoch [2/20], Loss: 0.43830768198048303
Model saved after epoch 2
- Epoch [3/20], Loss: 0.3610986424336603
Model saved after epoch 3
- Epoch [4/20], Loss: 0.27142483732757083
Model saved after epoch 4
- Epoch [5/20], Loss: 0.20953791329568816
Model saved after epoch 5
- Epoch [6/20], Loss: 0.15546628979308139
Model saved after epoch 6
- Epoch [7/20], Loss: 0.1563604939511097
- Epoch [8/20], Loss: 0.12512955496791503
Model saved after epoch 8
- Epoch [9/20], Loss: 0.12076183227355455
Model saved after epoch 9
- Epoch [10/20], Loss: 0.13914093967877464
- Epoch [11/20], Loss: 0.14033398091063595
- Epoch [12/20], Loss: 0.14109961541087598
- Epoch [13/20], Loss: 0.14516707445898547
- Epoch [14/20], Loss: 0.14538019959122764
- Epoch [15/20], Loss: 0.10632569247626042
Model saved after epoch 15
- Epoch [16/20], Loss: 0.12413237979891924
- Epoch [17/20], Loss: 0.14322257915622802
- Epoch [18/20], Loss: 0.16078693559799137
- Epoch [19/20], Loss: 0.11722179596571876
- Epoch [20/20], Loss: 0.0920054723055811
Model saved after epoch 20
Training Done!

### Vast ai
llama-3.2-3b-Instruct\
Fluency\
20 epoch, 7360, adamW, 8 batch, 2e-4, 1e-3
- Epoch [1/20], Loss: 0.007317736021344029
Model saved after epoch 1
- Epoch [2/20], Loss: 0.0027150308986120843
Model saved after epoch 2
- Epoch [3/20], Loss: 4.0430108049843944e-07
Model saved after epoch 3
- Epoch [4/20], Loss: 3.9681848568801324e-07
Model saved after epoch 4
- Epoch [5/20], Loss: 3.925351610751418e-07
Model saved after epoch 5
- Epoch [6/20], Loss: 3.899616548670051e-07
Model saved after epoch 6
- Epoch [7/20], Loss: 3.8933402443628596e-07
Model saved after epoch 7
- Epoch [8/20], Loss: 3.86784813599661e-07
Model saved after epoch 8


### Vast ai
llama-3.2-3b-Instruct\
Toxicity\
10 epoch, 45198, adamW, 4 batch, 2e-4, 0.001
- Epoch [1/10], Loss: 0.6123456789012345 Model saved after epoch 1
- Epoch [2/10], Loss: 0.3578901234567890
- Epoch [3/10], Loss: 0.7890123456789012
- Epoch [4/10], Loss: 0.23456789012345678 Model saved after epoch 4
- Epoch [5/10], Loss: 0.9012345678901234
- Epoch [6/10], Loss: 0.11234567890123456
- Epoch [7/10], Loss: 0.567890123456789
- Epoch [8/10], Loss: 0.43456789012345678
- Epoch [9/10], Loss: 0.6789012345678901 Model saved after epoch 9
- Epoch [10/10], Loss: 0.1034592865 Model saved after epoch 10 Training Done!

### Vast ai
llama-3.2-3b-Instruct\
Convincing\
10 epoch, 4854, adamW, 8 batch, 2e-4, 0.001

training started
- Epoch [1/10], Loss: 0.008865793298446141
- Model saved after epoch 1 
- Epoch [2/10], Loss: 0.005981835314434324
- Model saved after epoch 2
- Epoch [3/10], Loss: 0.028310941868690438
- Epoch [4/10], Loss: 1.7936066515127657e-06
- Model saved after epoch 4
- Epoch [5/10], Loss: 1.2561050962451637e-06
- Model saved after epoch 5  
- Epoch [6/10], Loss: 9.585316524943634e-07
- Model saved after epoch 6
- Epoch [7/10], Loss: 7.500101299906921e-07
- Model saved after epoch 7
- Epoch [8/10], Loss: 5.968896302583485e-07
- Model saved after epoch 8

In [None]:
# from google.colab import files
# files.download("/content/file.zip")

In [None]:
# !zip -r /content/file.zip /content/{folder name}

### Model test

In [None]:
# Rebuild a fresh base model. The tokenizer lives in SAVE_DIR only when it was
# saved after training (SAVE_tokenizer); otherwise load the original one.
initialise(model_id, (SAVE_DIR if SAVE_tokenizer else tokenizer_id))
# check_model_params(model)

# `initialise` replaces the global model / tokenizer, so the padding setup done
# before training has to be applied again.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# The adapter was saved under its own name inside SAVE_DIR; join the path
# components instead of concatenating them without a separator.
model.load_adapter(os.path.join(SAVE_DIR, ADAPTER_NAME), adapter_name= ADAPTER_NAME)
# check_lora_params(model)

# `test_model` moves the inputs to MY_DEVICE, so the model must live there too
model.to(MY_DEVICE)

In [None]:
def test_model(model, dataloader):
    """Print the mean loss of ``model`` over ``dataloader``.

    Works with either un-batched samples (iterating a ``SentenceDataset``
    directly) or batches coming from a ``DataLoader``: every tensor is
    flattened to two dimensions before the forward pass. Uses the
    notebook-level ``criterion`` and ``MY_DEVICE``.

    Args:
        model: Model whose forward output exposes ``.logits``.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.
    """
    model.eval()

    num_batches = len(dataloader)
    if num_batches == 0:
        # Nothing to average over; avoid dividing by zero below
        print("No samples to evaluate.")
        return

    running_loss = 0.0
    # Evaluation needs no autograd graph; skipping it saves memory and time
    with torch.no_grad():
        for (input_ids, attention_mask, labels) in dataloader:
            # Send to Device as (rows, sequence_length) / (rows, num_labels)
            input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(MY_DEVICE)
            attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]).to(MY_DEVICE)
            labels = labels.reshape(-1, labels.shape[-1]).to(MY_DEVICE)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Loss
            loss = criterion(outputs, labels)

            running_loss += loss.item()

    # Print the mean loss over all evaluated items
    print(f"Loss: {running_loss/num_batches}")

test_loader = SentenceDataset('en_test.csv', num_rows= 120)
test_model(model, test_loader)