In [70]:
%load_ext autoreload
%autoreload 2

# Import all required library
import os
import time
import math

import numpy as np
import pandas as pd

import tqdm

import warnings
warnings.filterwarnings("ignore")

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW, 
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments,
    AutoModelForCausalLM,
    set_seed
)

from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType,
    PeftModel
)

import datasets 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
seed = 19


def _seed(seed):
    """Seed the NumPy / PyTorch RNGs and pin the determinism-related settings.

    Args:
        seed: Integer seed passed to every seeding call below and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    # Environment variables are independent of the RNG calls, so set them up front.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    })

    # Seed NumPy first, then PyTorch on CPU, the current CUDA device and all CUDA devices.
    for seeder in (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

    # Ask cuDNN for deterministic kernels and turn off its autotuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Finally let transformers apply the same seed through its own helper.
    set_seed(seed)


_seed(seed)

In [72]:
# Define a function that can print the trainable parameters 
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite the name, nothing is printed here: the report is returned so the
    caller can ``print`` it.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and ``requires_grad``.

    Returns:
        str: Three-line report with the trainable count, the total count and
        the trainable percentage (two decimals). A model without any
        parameters reports ``0.00%`` instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        n_elements = param.numel()
        all_model_params += n_elements
        if param.requires_grad:
            trainable_model_params += n_elements

    # Guard the division so a parameter-free module still produces a report.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    # Built from adjacent f-strings rather than a nested `{"\n"}` expression:
    # reusing the outer quote character inside an f-string only parses on
    # Python 3.12+.
    return (
        f"trainable model parameters: {trainable_model_params} \n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

In [73]:
# Read the essay corpus from the CSV that sits next to the notebook
full_data = pd.read_csv("7-prompts.csv")
print(f"We have {full_data.shape[0]} samples")  # Row count of the loaded frame

# Peek at five rows, sampled with a fixed random_state so the preview is repeatable
print(full_data.sample(n=5, random_state=19))

# Uncomment to work on a small subset while testing the code:
# full_data = full_data.head(1000)

We have 17251 samples
                                                    text  label
8511   The value of using this technology to read stu...      0
5837   the author supports his ideas very well becaus...      0
17230  Dear Senator,\n\nI am writing to you today to ...      1
12936  Dear Mr./Mrs. Senator,\n\nThe Electoral Colleg...      0
15253  "The Challenge of Exploring Venus" by James Tr...      1


In [74]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split of the essays and their labels
# (to be redone once the augmented data is ready)
X_train, X_val, y_train, y_val = train_test_split(
    full_data["text"],
    full_data["label"],
    test_size=0.3,
    stratify=full_data["label"],
    random_state=42,
)

print(f"We have {len(X_train)} training samples")
print(f"We have {len(X_val)} validation samples")
print("----------------------------")

# Class balance over the whole corpus, looked up by label value
count = full_data["label"].value_counts()
print(f"Number of Essays written by Human: {count[0]}")
print(f"Number of Essays generated by LLM: {count[1]}")

# Replace the shuffled original row labels with a fresh 0..n-1 index on every split
X_train, y_train, X_val, y_val = (
    split.reset_index(drop=True) for split in (X_train, y_train, X_val, y_val)
)

We have 12075 training samples
We have 5176 validation samples
----------------------------
Number of Essays written by Human: 14247
Number of Essays generated by LLM: 3004


In [75]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and two-label classifier, both from the uncased DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    return_dict=True,
)

# Report the parameter counts of the freshly loaded model
print(print_number_of_trainable_model_parameters(model))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable model parameters: 66955010 
all model parameters: 66955010
percentage of trainable model parameters: 100.00%


In [76]:
# Last expression of the cell: shows the repr (module tree) of the loaded classifier
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [77]:
# LoRA adapter settings for the sequence-classification model
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    # Adapt the q/k/v linear layers of each attention block (out_lin is not listed)
    target_modules=["q_lin", "k_lin", "v_lin"],
    r=8,  # Rank of the low-rank update
    lora_alpha=32,  # Alpha (scaling factor)
    lora_dropout=0.05,  # Dropout probability on the LoRA branch
    bias='none',
)


In [78]:
# Wrap the base classifier with the adapters described by lora_config
peft_model = get_peft_model(model, lora_config)

# Parameter report for the wrapped model (the trainable share should now be small)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 923906 
all model parameters: 67878916
percentage of trainable model parameters: 1.36%


In [79]:
# Last expression of the cell: shows the repr of the LoRA-wrapped model
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [80]:
# Tokenize function
def tokenize_func(data):
    """Tokenize a batch of essays with the module-level ``tokenizer``.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        data: Mapping whose ``'texts'`` entry holds the raw strings to encode.

    Returns:
        The tokenizer's output for the batch. Sequences are truncated to at
        most 512 tokens and are NOT padded here.
    """
    return tokenizer(
        data['texts'],
        max_length=512,
        truncation=True,
        # No padding at map time: padding every essay to 512 tokens up front
        # would leave the batch-time DataCollatorWithPadding(padding="longest")
        # nothing to trim, so every batch would run at full length. The
        # collator pads each batch to its own longest sequence instead.
        padding=False,
        return_attention_mask=True,
    )

In [81]:
# Turn the training split into a Hugging Face Dataset and encode it in batches;
# the raw "texts" column is dropped once it has been tokenized.
train_dataset = datasets.Dataset.from_pandas(
    pd.DataFrame({"texts": X_train, "labels": y_train})
).map(tokenize_func, batched=True, remove_columns=["texts"])
train_dataset

Map: 100%|██████████| 12075/12075 [00:02<00:00, 4293.53 examples/s]


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 12075
})

In [82]:
# Turn the validation split into a Hugging Face Dataset and encode it in batches;
# the raw "texts" column is dropped once it has been tokenized.
val_dataset = datasets.Dataset.from_pandas(
    pd.DataFrame({"texts": X_val, "labels": y_val})
).map(tokenize_func, batched=True, remove_columns=["texts"])

val_dataset

Map: 100%|██████████| 5176/5176 [00:01<00:00, 4336.27 examples/s]


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5176
})

In [83]:
# Define Eval Metric
def metrics(eval_prediction):
    """Compute the validation ROC-AUC for the two-class classifier.

    Args:
        eval_prediction: Pair ``(logits, labels)``. ``logits`` is indexed as a
            2-D array with one column per class (column 1 = positive class);
            ``labels`` holds the 0/1 targets.

    Returns:
        dict: ``{"Val-AUC": <roc_auc_score result>}``.
    """
    logits, labels = eval_prediction
    logits = np.asarray(logits)
    # ROC-AUC is a ranking metric and needs a continuous score per sample.
    # Feeding it argmax labels collapses the curve to a single operating point.
    # For two classes the logit margin orders samples exactly like the softmax
    # probability of class 1, so it yields the same AUC.
    positive_score = logits[:, 1] - logits[:, 0]
    auc_score = roc_auc_score(labels, positive_score)
    return {"Val-AUC": auc_score}

train_batch_size = 32
eval_batch_size = 32

# Define training args
peft_training_args = TrainingArguments(
    output_dir='./result-distilbert-lora',
    logging_dir='./logs-distilbert-lora',
#     auto_find_batch_size=True,
    learning_rate=1e-4,
    # Per-device batch sizes: lower them if you hit "out of memory" on your GPU.
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=2,
    logging_steps=10,
    # NOTE(review): with eval_steps=10 a full pass over val_dataset runs every
    # 10 optimizer steps; raise eval_steps if evaluation dominates the runtime.
    evaluation_strategy='steps',
    eval_steps=10,
    weight_decay=0.01,
    seed=42,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only machines
    # instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to='none'
)

# Define optimizer.
# Because an optimizer is handed to the Trainer explicitly, the Trainer does not
# build its own, so learning_rate / weight_decay from the arguments above would
# otherwise be ignored. Read them from the arguments to keep one source of truth.
optimizer = AdamW(peft_model.parameters(),
                  lr=peft_training_args.learning_rate,
                  weight_decay=peft_training_args.weight_decay,
                  no_deprecation_warning=True)

# Define scheduler.
# The schedule must span the number of optimizer steps the Trainer will take.
# `train_batch_size` on the arguments object is the per-device batch size times
# the number of visible GPUs (at least 1), so nothing is hard-coded to a
# particular machine.
n_epochs = peft_training_args.num_train_epochs
batches_per_epoch = math.ceil(len(train_dataset) / peft_training_args.train_batch_size)
updates_per_epoch = max(
    batches_per_epoch // peft_training_args.gradient_accumulation_steps, 1
)
total_steps = math.ceil(n_epochs * updates_per_epoch)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

# Data collator: pads each batch to the longest sequence in that batch
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)


# Define Trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset, # Training data
    eval_dataset=val_dataset, # Evaluation data
    tokenizer=tokenizer,
    compute_metrics=metrics,
    optimizers=(optimizer,lr_scheduler),
    data_collator=collator
)

print(f"Total Steps: {total_steps}")

# Path to save the fine-tuned model
peft_model_path="/kaggle/working/peft-distilbert-lora-local"

# Train the model
peft_trainer.train()

# peft_trainer.model.save_pretrained(peft_model_path) # Save the fine-tuned model
# tokenizer.save_pretrained(peft_model_path) # Save the tokenizer

Total Steps: 378
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora


Step,Training Loss,Validation Loss


not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using dora
not using 

KeyboardInterrupt: 