### Model Paramters

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import os

In [4]:
# Load pre-trained LLM model and tokenizer (for paraphrasing)
# Appropriate choice required for proper usage

from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

from peft import LoraConfig, get_peft_model
from peft import PeftModel

model_id = "distilbert-base-uncased"
MY_DEVICE = "cpu"

In [5]:
####################################
# General Parameter
ID2LABEL = {0: "Polite", 1: "Impolite"}
LABEL2ID = {"Polite": 0, "Impolite": 1}

# Selected Hyperparameters
MAX_SEQUENCE_LENGTH = 512
TRUNCATION = True
PADDING = "max_length"
RETURN_TENSORS = "pt"

D_TYPE = torch.bfloat16

# Save
ADAPTER_NAME = "PolitenessAdapter"
SAVE_DIR = "model_politeness_finetuned\\"
SAVE_tokenizer = True

if(not os.path.exists(SAVE_DIR)):
    os.mkdir(SAVE_DIR)

In [6]:
# Model
model = AutoModelForSequenceClassification.from_pretrained(             #AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype= D_TYPE,
    max_length = MAX_SEQUENCE_LENGTH,

    # Model label map
    num_labels= 2,
    id2label= ID2LABEL,
    label2id= LABEL2ID
    )

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    max_length = MAX_SEQUENCE_LENGTH,
    truncation=TRUNCATION,
    padding= PADDING,
    padding_side= "right")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Access special tokens and their IDs
print(f"BOS token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(f"EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")
print(f"CLS token: {tokenizer.cls_token}, ID: {tokenizer.cls_token_id}")
print(f"SEP token: {tokenizer.sep_token}, ID: {tokenizer.sep_token_id}")
print(f"PAD token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
print(f"UNK token: {tokenizer.unk_token}, ID: {tokenizer.unk_token_id}")
print(f"MASK token: {tokenizer.mask_token}, ID: {tokenizer.mask_token_id}")

# Set the end of string tokens
if(tokenizer.pad_token is None):
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

BOS token: None, ID: None
EOS token: None, ID: None
CLS token: [CLS], ID: 101
SEP token: [SEP], ID: 102
PAD token: [PAD], ID: 0
UNK token: [UNK], ID: 100
MASK token: [MASK], ID: 103


In [8]:
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
print(model)

Memory footprint: 133.91 MB
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dr

In [9]:
for name, param in model.named_parameters():
    # param.requires_grad = False
    print(f'{name} requires grad = {param.requires_grad}')

distilbert.embeddings.word_embeddings.weight requires grad = True
distilbert.embeddings.position_embeddings.weight requires grad = True
distilbert.embeddings.LayerNorm.weight requires grad = True
distilbert.embeddings.LayerNorm.bias requires grad = True
distilbert.transformer.layer.0.attention.q_lin.weight requires grad = True
distilbert.transformer.layer.0.attention.q_lin.bias requires grad = True
distilbert.transformer.layer.0.attention.k_lin.weight requires grad = True
distilbert.transformer.layer.0.attention.k_lin.bias requires grad = True
distilbert.transformer.layer.0.attention.v_lin.weight requires grad = True
distilbert.transformer.layer.0.attention.v_lin.bias requires grad = True
distilbert.transformer.layer.0.attention.out_lin.weight requires grad = True
distilbert.transformer.layer.0.attention.out_lin.bias requires grad = True
distilbert.transformer.layer.0.sa_layer_norm.weight requires grad = True
distilbert.transformer.layer.0.sa_layer_norm.bias requires grad = True
distil

In [10]:
# LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM", 

    r=8,                # The low-rank dimension
    lora_alpha=32,      # Scaling factor for LoRA layers
    lora_dropout=0.05,  # Dropout probability


    target_modules=["q_lin", "v_lin"],  # Target layers
    bias="none"                         # Don't apply bias
)

# set_lora
model = get_peft_model(model, lora_config, adapter_name= ADAPTER_NAME)
model.print_trainable_parameters()

trainable params: 147,456 || all params: 67,102,466 || trainable%: 0.2197


In [11]:
for name, param in model.named_parameters():
    if param.requires_grad is not None:
        print(f"Gradients for {name} exist. and requires grad = {param.requires_grad}")
    else:
        print(f"No gradients for {name}.")

Gradients for base_model.model.distilbert.embeddings.word_embeddings.weight exist. and requires grad = False
Gradients for base_model.model.distilbert.embeddings.position_embeddings.weight exist. and requires grad = False
Gradients for base_model.model.distilbert.embeddings.LayerNorm.weight exist. and requires grad = False
Gradients for base_model.model.distilbert.embeddings.LayerNorm.bias exist. and requires grad = False
Gradients for base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.weight exist. and requires grad = False
Gradients for base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.bias exist. and requires grad = False
Gradients for base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_A.PolitenessAdapter.weight exist. and requires grad = True
Gradients for base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_B.PolitenessAdapter.weight exist. and requires grad = True
Gradients for base_model.model.dis

In [12]:
####################################
# Training Parameters
LEARNING_RATE = 1e-3
BATCH_SIZE = 1  # Increased for stability
NUMBER_OF_EPOCHS = 2

GRAD_CLIP = 1.0  # Added for stability


NUM_ROWS = 80

In [13]:
####################################

# dataset class for the News
class SentenceDataset():
    def __init__(self, data_frame_addrs, cols_args= None, batch_size= 1, num_rows = None):
        self.data_frame = pd.read_csv(data_frame_addrs)
        if(num_rows is not None and num_rows > 0):
            self.data_frame = pd.read_csv(data_frame_addrs, nrows= num_rows)
        self.batch_size = batch_size

        # Total samples
        print(self.data_frame.shape) 
        self.length = self.data_frame.shape[0]
        self.col_args = self.data_frame.columns

        if(cols_args is not None):
            self.col_args = cols_args

    def print_data(self):
        print(self.data_frame)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if(idx + self.batch_size <= self.length):
            data = self.data_frame.iloc[idx: idx + self.batch_size]
        elif(idx < self.length):
            data = pd.concat([self.data_frame.iloc[idx: self.length], self.data_frame.iloc[0: idx + self.batch_size - self.length]])
        else:
            raise IndexError
        # print(type(data))
        
        # Returns col names per col argument name
        return tuple([data[col] for col in self.col_args])
            
####################################
sentence_loader = SentenceDataset('en_train.csv', batch_size= BATCH_SIZE, num_rows= NUM_ROWS)
# for i, (sentences, scores) in enumerate(sentence_loader):
#     print(i, scores.to_list())

(80, 2)


In [14]:
def prompt_template(sentence: str):
    return f'''  
    [CLS]
    Classification: 0 for polite and 1 for impolite.
    Sentence => {sentence}
    '''

In [15]:
def torch_scores(scores_list, batch_size = 1):
    labels = []
    for score in scores_list:
        # negative Impolite scores
        if(score < 0):
            labels.append([0.0, 1.0])
        # positive scores
        else:
            labels.append([1.0, 0.0])
    batch_scores = torch.Tensor(labels).to(D_TYPE)
    return batch_scores

torch_scores([0, 1,-1])
# [Polite, Impolite]

tensor([[1., 0.],
        [1., 0.],
        [0., 1.]], dtype=torch.bfloat16)

In [19]:
def tokenize_prompt(sentences, tokenizer):
    # Create the prompts
    prompt_inputs = [prompt_template(sentence) for sentence in sentences]

    # Tokenize inputs
    tokenized_inputs = tokenizer(
        prompt_inputs,
        add_special_tokens= False,

        max_length= MAX_SEQUENCE_LENGTH,
        truncation= TRUNCATION,
        padding= PADDING,                   # Ensure proper padding
        return_tensors= RETURN_TENSORS      # Return tensors for PyTorch
    )

    return tokenized_inputs

# Tokenize the prompt
tokenized_inputs = tokenize_prompt(["hello, you?", "how?"], tokenizer)
# print(tokenized_inputs)

def detokenizer(tokenized_inputs, tokenizer):
    for tokens in tokenized_inputs['input_ids']:
        # print(tokens.shape)
        decoded_text = tokenizer.decode(tokens, skip_special_tokens= False)
        print(decoded_text)

detokenizer(tokenized_inputs, tokenizer)

[CLS] classification : 0 for polite and 1 for impolite. sentence = > hello, you? [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [17]:
# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr= LEARNING_RATE)

In [None]:
#############################################################

# Training

# Training function
def train_model(model, dataloader, num_epochs):
    model.train()  # Set model to "train" mode

    for epoch in range(num_epochs):
        running_loss = 0.0
        for (sentences, scores) in dataloader:

            # print(sentences, scores)
            tokenized_inputs = tokenize_prompt(sentences.to_list(), tokenizer)
            labels = torch_scores(scores.to_list(), BATCH_SIZE)

            # print(tokenized_inputs['input_ids'].shape)
            # detokenizer(tokenized_inputs, tokenizer)
            # print(labels)

            # Tokenized input sentence
            input_ids = tokenized_inputs['input_ids'].to(MY_DEVICE)            
            attention_mask = tokenized_inputs['attention_mask'].to(MY_DEVICE)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            
            # print(outputs, type(outputs))
            # print(labels, type(labels))
            # print(outputs.shape, labels.shape)

            # Loss
            loss = criterion(outputs, labels)
            # print(loss)

            # break

            # Zero the gradients
            optimizer.zero_grad()

            # Apply gradient clipping before optimizer step
            nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Print loss at the end of epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader)}")

        # Save model at the end of each epoch
        model.save_pretrained(SAVE_DIR)
        # print(f"Model saved after epoch {epoch+1}")

    print("Training Done!")
    if(SAVE_tokenizer):
        tokenizer.save_pretrained(SAVE_DIR)

# Train the model
train_model(model, sentence_loader, num_epochs= NUMBER_OF_EPOCHS)

[CLS] classification : 0 for polite and 1 for impolite. sentence = > @ smjg, thanks. but why did you also remove the categories i added? [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [26]:
def test_model(model: AutoModelForCausalLM, output_folder, dataloader, adapter_name = "default"):
    model.load_adapter(output_folder + adapter_name, adapter_name)

    if(SAVE_tokenizer):
        tokenizer = AutoTokenizer.from_pretrained(
            output_folder,
            max_length = MAX_SEQUENCE_LENGTH,
            truncation=TRUNCATION,
            padding= PADDING,
            padding_side= "right")

    model.eval()

    criterion = nn.BCEWithLogitsLoss()
    running_loss = 0.0
    for sentence, score in dataloader:
        # print(sentences, scores)
        tokenized_inputs = tokenize_prompt(sentence.to_list(), tokenizer)
        labels = torch_scores(score.to_list(), BATCH_SIZE)

        # Tokenized input sentence
        input_ids = tokenized_inputs['input_ids'].to(MY_DEVICE)            
        attention_mask = tokenized_inputs['attention_mask'].to(MY_DEVICE)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        pred = outputs

        # loss
        loss = criterion(pred, labels)
        running_loss += loss.item()
    print(f"Loss: {running_loss/len(dataloader)}")

test_loader = SentenceDataset("en_test.csv", num_rows= 5)
test_model(model, SAVE_DIR, test_loader, ADAPTER_NAME)

(5, 2)
Loss: 0.69453125
