## installation

In [6]:
!pip install peft pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.3.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pytorch_lightning-2.3.0-py3-none-any.whl (812 kB)
   ---------------------------------------- 0.0/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   - -------------------------------------- 30.7/812.2 kB ? eta -:--:--
   ----- -------------------------------- 122.9/812.2 kB 343.4 kB/

## libraries

In [7]:
import os 
import gc
from tqdm.auto import tqdm

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 

import plotly.express as px 
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import plotly.figure_factory as ff 

from transformers import Trainer, AutoModelForSequenceClassification,get_polynomial_decay_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding, DataCollatorForTokenClassification

from text_unidecode import unidecode 
from typing import Dict,List,Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk,Dataset

from sklearn.metrics import log_loss

from transformers import AutoModel, AutoTokenizer, AdamW

import torch 
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
import re 
import pickle 
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import time
import random
from datasets import concatenate_datasets,load_dataset,load_from_disk,Dataset
from sklearn.preprocessing import LabelEncoder
from peft import get_peft_config, get_peft_model, PeftModel, PeftConfig, LoraConfig, TaskType
import zipfile

## config class

In [8]:
with zipfile.ZipFile("data.zip",'r') as zip_ref:
    zip_ref.extractall()

In [9]:
ds_train_path="train.csv"
ds_test_path = "test.csv"
sample_submission_path= "sample_submission.csv "

class config:
    base_dir = "/working/"
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 69
    
    # dataset path
    ds_train_path=ds_train_path
    ds_test_path= ds_test_path
    sample_submission_path= sample_submission_path
    model_path= "/kaggle/input/deberta-v3-base/deberta-v3-base/"
    saved_model_path= "/kaggle/working/fine_tuned_model"
    
    truncation= True
    padding= False
    max_length= 512
                                                   
    model_name= 'microsoft/deberta-v3-large'
    target_cols= 'score'
    load_from_disk=None
    
    learning_rate= .1
    batch_size= 5
    epochs= 5 
    NFOLDS= 5
seed_everything(config.seed)

Seed set to 69


69

## load dataframe

In [4]:
train_df= pd.read_csv(config.ds_train_path)
df_test= pd.read_csv(config.ds_test_path)
train_df.tail()

Unnamed: 0,essay_id,full_text,score
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1
17306,fffed3e,Venus is worthy place to study but dangerous. ...,2


### Histogram of full_text length

In [5]:
fig = px.histogram(train_df, x= train_df['full_text'].str.len(),marginal= 'box', title= 'Histogram of full_text text length', 
                  color_discrete_sequence= ['#FFA200'])
fig.update_layout(bargap=.2)
fig.update_layout(template= 'plotly_dark',font=dict(family='PT Sans',size=19,color='#C4FEFF'))
fig.show()


### scorehistogram

In [6]:
colors = [['#00E600']]
for count, x in enumerate (['score']):
    fig= px.histogram(train_df,x=x,marginal='violin',title=f"{x}histogram",color_discrete_sequence= colors[count])
    
    fig.update_layout(bargap=.2)
    fig.update_layout(template='plotly_dark',font=dict(family='PT Sans', size=19,color= '#C4FEFF'))
    fig.show()

## Encoding, tonkenize, load, split 

In [7]:
## Encode the labels to integers
label_encoder = LabelEncoder()
train_df[config.target_cols] = label_encoder.fit_transform(train_df[config.target_cols])

## Load Tokenizer
tokenizer=AutoTokenizer.from_pretrained(config.model_path)

## Define tokenize Function
def tokenize(df):
    text= df['full_text']
    
    tokenized= tokenizer(text,
                        padding=config.padding,
                        truncation= True,
                        max_length= config.max_length,
                        add_special_tokens= True)
    tokenized['length']= len(tokenized['input_ids'])
    
    
    tokenized['labels'] = df[config.target_cols]

    return tokenized

## Convert DataFrame into DataSet    
if config.load_from_disk is None:
    
    ds = Dataset.from_pandas(train_df)
    
    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=3,
        desc='Tokenizing'
    )
dataset=ds.remove_columns(['essay_id','full_text','score','length'])
dataset



The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Tokenizing (num_proc=3):   0%|          | 0/17307 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 17307
})

In [8]:
# Step 1: Split 90% for train+test and 10% for validation
train_test_val_split = dataset.train_test_split(test_size=0.1)
train_test_dataset = train_test_val_split['train']
val_dataset = train_test_val_split['test']

# Step 2: Split the remaining 90% into 50% and 40%
split = train_test_dataset.train_test_split(test_size=4/9)
fifty_percent_dataset = split['train']
forty_percent_dataset = split['test']

# Step 3: Split the 50% part into 80% (train1) and 20% (test1)
train1_test1_split = fifty_percent_dataset.train_test_split(test_size=0.2)
model1_train_dataset = train1_test1_split['train']
model1_test_dataset = train1_test1_split['test']

# Step 4: Split the 40% part into 80% (train2) and 20% (test2)
train2_test2_split = forty_percent_dataset.train_test_split(test_size=0.2)
model2_train_dataset = train2_test2_split['train']
model2_test_dataset = train2_test2_split['test']

print(f"Model 1 Train dataset: {model1_train_dataset}")
print(f"Model 1 Test dataset: {model1_test_dataset}")
print(f"Model 2 Train dataset: {model2_train_dataset}")
print(f"Model 2 Test dataset: {model2_test_dataset}")
print(f"Validation dataset: {val_dataset}")


Model 1 Train dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6922
})
Model 1 Test dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1731
})
Model 2 Train dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 5538
})
Model 2 Test dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1385
})
Validation dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1731
})


### usefull function

In [9]:
## Define the compute matrix
def compute_metrics(T):
    preds,labels= T
    score= cohen_kappa_score(labels,preds.argmax(-1),weights='quadratic')
    return {'qwk':score}

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

## Model1

In [10]:
model= AutoModelForSequenceClassification.from_pretrained(config.model_path,num_labels=6).to(config.device)
model.resize_token_embeddings(len(tokenizer))
print(print_trainable_parameters(model))


TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base/ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 184350726 || all params: 184350726 || trainable%: 100.00
None


In [11]:
output_dir = f'{config.base_dir} + {str(int(time.time()))}'
# Define the train arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,
    eval_steps=200,
    save_steps=600,
    save_strategy="steps",
    evaluation_strategy="steps",
    lr_scheduler_type='linear',
    learning_rate=2e-5,
    warmup_ratio=0.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    weight_decay=0.01,  # This is within the recommended range
    report_to='none',
    metric_for_best_model='qwk',
    optim="adamw_torch",
    save_total_limit=1,
    fp16=True,
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Define the trianer 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=model1_train_dataset,
    eval_dataset=model1_test_dataset,
    data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer, pad_to_multiple_of = 16, padding = 'longest'
),
    compute_metrics= compute_metrics,
    
)
trainer.train()


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Step,Training Loss,Validation Loss,Qwk
200,1.0532,0.976043,0.679647
400,0.8807,0.983508,0.740529
600,0.8168,0.866218,0.792421
800,0.7226,0.891363,0.786256



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



TrainOutput(global_step=864, training_loss=0.8945478642428363, metrics={'train_runtime': 2427.1848, 'train_samples_per_second': 11.407, 'train_steps_per_second': 0.356, 'total_flos': 7268982305026176.0, 'train_loss': 0.8945478642428363, 'epoch': 3.9907621247113165})

In [12]:
import torch
from tqdm import tqdm
import numpy as np
from sklearn.metrics import cohen_kappa_score
# Define the predict function
def predict(data_loader, model, device):
    model.to(device)
    model.eval()
    predictions = []
    true_labels = []

    for batch in tqdm(data_loader):
        with torch.no_grad():
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits

        predictions.extend(logits.argmax(dim=-1).detach().cpu().numpy())
        true_labels.extend(labels.detach().cpu().numpy())
    
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    
    return predictions, true_labels

# Define the compute_metrics function for QWK
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    qwk_score = cohen_kappa_score(labels, preds, weights='quadratic')
    return {'qwk': qwk_score}

val_loader = trainer.get_eval_dataloader(val_dataset)

# Predict on the validation dataset
predictions, true_labels = predict(val_loader, model, training_args.device)

# Calculate QWK score
qwk_score = cohen_kappa_score(true_labels, predictions, weights='quadratic')
print(f"Quadratic Weighted Kappa (QWK) score on validation dataset: {qwk_score}")
print(print_trainable_parameters(model))


100%|██████████| 55/55 [01:26<00:00,  1.57s/it]

Quadratic Weighted Kappa (QWK) score on validation dataset: 0.7797807798873061
trainable params: 184350726 || all params: 184350726 || trainable%: 100.00
None





In [13]:
trainer.save_model(config.saved_model_path)
tokenizer.save_pretrained(config.saved_model_path)

('/kaggle/working/fine_tuned_model/tokenizer_config.json',
 '/kaggle/working/fine_tuned_model/special_tokens_map.json',
 '/kaggle/working/fine_tuned_model/spm.model',
 '/kaggle/working/fine_tuned_model/added_tokens.json',
 '/kaggle/working/fine_tuned_model/tokenizer.json')

## peft_model

In [14]:
output_dir = f'{config.base_dir} + {str(int(time.time()))}'
peft_model_config = AutoConfig.from_pretrained(config.saved_model_path,num_labels=6)
peft_model = AutoModelForSequenceClassification.from_pretrained(
    config.saved_model_path,
    config=peft_model_config)

## Adding Lora configs to peft model
peft_config = LoraConfig(
    target_modules=['query_proj', 'value_proj', 'key_proj','Dense'],
    r=16,
    lora_alpha=16,
)

peft_model = get_peft_model(peft_model, peft_config)
print(print_trainable_parameters(peft_model))

trainable params: 884736 || all params: 185235462 || trainable%: 0.48
None


In [15]:
Adapter_path= "./kaggle/workin/adapter"
peft_model.save_pretrained(Adapter_path)
tokenizer.save_pretrained(Adapter_path)


Could not find a config file in /kaggle/working/fine_tuned_model - will assume that the vocabulary was not modified.



('./kaggle/workin/adapter/tokenizer_config.json',
 './kaggle/workin/adapter/special_tokens_map.json',
 './kaggle/workin/adapter/spm.model',
 './kaggle/workin/adapter/added_tokens.json',
 './kaggle/workin/adapter/tokenizer.json')

In [16]:
config2= PeftConfig.from_pretrained(Adapter_path)
config2.num_labels=6
base_model_config = AutoConfig.from_pretrained(config2.base_model_name_or_path)
base_model_config.num_labels = 6
model2 = AutoModelForSequenceClassification.from_pretrained(
    config2.base_model_name_or_path,
    config=base_model_config,
    
)

model2 = PeftModel.from_pretrained(
    model2,
    Adapter_path
)
model2.to(config.device)
print(print_trainable_parameters(model2))
model2.config.num_labels

trainable params: 0 || all params: 185235462 || trainable%: 0.00
None


6

In [17]:
for param in model2.parameters():
    param.requires_grad = True

In [18]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Example compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    qwk_score = cohen_kappa_score(labels, preds, weights='quadratic')
    return {'qwk': qwk_score}

# Example training arguments
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,              # Log training metrics every 30 steps
    save_steps=500,                # Save checkpoint every 30 steps
    eval_steps=100,                # Evaluate every 30 steps
    learning_rate=2e-5,            # Increased learning rate
    per_device_train_batch_size=8, # Increased batch size
    per_device_eval_batch_size=16,  # Batch size per device during evaluation
    gradient_accumulation_steps=2,  # Reduced gradient accumulation steps
    num_train_epochs=3,             # Number of training epochs
    weight_decay=0.01,              # Weight decay
    report_to='none',               # Report metrics to TensorBoard
    optim="adamw_torch",
    save_total_limit=1,
    greater_is_better=True,
    
    fp16=True,                      # Use mixed precision training if available
    load_best_model_at_end=False,    # Load the best model found during training at the end
)

# Initialize Trainer
peft_trainer = Trainer(
    model=model2,
    args=peft_training_args,
    train_dataset=model2_train_dataset,
    eval_dataset=model2_test_dataset,
    data_collator=DataCollatorWithPadding(
        tokenizer=tokenizer, pad_to_multiple_of=16, padding='longest'
    ),
    compute_metrics=compute_metrics,
)

# Train the model
train_result = peft_trainer.train()

# Evaluate on the validation dataset
evaluation_results = peft_trainer.evaluate()

print(f"Training results: {train_result}")
print(f"Evaluation results: {evaluation_results}")



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Step,Training Loss,Validation Loss
100,0.897,No log
200,0.8647,No log
300,0.7759,No log
400,0.7361,No log
500,0.7278,No log



Could not find a config file in /kaggle/working/fine_tuned_model - will assume that the vocabulary was not modified.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Training results: TrainOutput(global_step=519, training_loss=0.8170828203704775, metrics={'train_runtime': 1593.998, 'train_samples_per_second': 10.423, 'train_steps_per_second': 0.326, 'total_flos': 4406665608514944.0, 'train_loss': 0.8170828203704775, 'epoch': 2.9913544668587897})
Evaluation results: {'eval_runtime': 39.7362, 'eval_samples_per_second': 34.855, 'eval_steps_per_second': 1.107, 'epoch': 2.9913544668587897}


In [19]:
# Define the predict function
def predict(data_loader, model2, device):
    model2.to(device)
    model2.eval()
    predictions = []
    true_labels = []

    for batch in tqdm(data_loader):
        with torch.no_grad():
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model2(**inputs)
            logits = outputs.logits

        predictions.extend(logits.argmax(dim=-1).detach().cpu().numpy())
        true_labels.extend(labels.detach().cpu().numpy())
    
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    
    return predictions, true_labels

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    qwk_score = cohen_kappa_score(labels, preds, weights='quadratic')
    return {'qwk': qwk_score}

val_loader = peft_trainer.get_eval_dataloader(val_dataset)

predictions, true_labels = predict(val_loader, model2, peft_training_args.device)

# Calculate QWK score
qwk_score = cohen_kappa_score(true_labels, predictions, weights='quadratic')
print(f"Quadratic Weighted Kappa (QWK) score on validation dataset: {qwk_score}")
print(print_trainable_parameters(model2))


100%|██████████| 55/55 [01:30<00:00,  1.64s/it]

Quadratic Weighted Kappa (QWK) score on validation dataset: 0.7941052568976028
trainable params: 185235462 || all params: 185235462 || trainable%: 100.00
None





In [20]:
saved_model_path2="/kaggle/working/fine_tuned_model_peft_model"
trainer.save_model(saved_model_path2)
tokenizer.save_pretrained(saved_model_path2)

('/kaggle/working/fine_tuned_model_peft_model/tokenizer_config.json',
 '/kaggle/working/fine_tuned_model_peft_model/special_tokens_map.json',
 '/kaggle/working/fine_tuned_model_peft_model/spm.model',
 '/kaggle/working/fine_tuned_model_peft_model/added_tokens.json',
 '/kaggle/working/fine_tuned_model_peft_model/tokenizer.json')

In [21]:
def process_and_predict(df):

    def tokenize(examples):
        text = examples['full_text']
        tokenized = tokenizer(
            text,
            padding='max_length', 
            truncation=True, 
            max_length=config.max_length
        )
        return tokenized

    # Convert DataFrame to Dataset and tokenize
    ds = Dataset.from_pandas(df)
    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=3,
        desc='Tokenizing'
    )

    # Remove unnecessary columns
    ds_test = ds.remove_columns(['essay_id', 'full_text'])

    # Perform predictions
    test_preds = trainer.predict(ds_test).predictions.clip(0, 5)

    # Ensure test_preds is one-dimensional
    if test_preds.ndim > 1:
        test_preds = test_preds.argmax(axis=1)  # Adjust as needed based on your task
    thresholds=[0.5, 1.5, 2.5, 3.5, 4.5]
    # Process predictions
    test_preds_pp = pd.cut(test_preds, bins=[-np.inf] + thresholds + [np.inf], 
                           labels=[0, 1, 2, 3, 4, 5]).astype('int32')

    # Add predictions as 'score' column
    df['score'] = test_preds_pp + 1  # Adjust as needed

    # Save results to CSV
    df[['essay_id', 'score']].to_csv('submission.csv', index=False)
    
    return df

df_with_scores = process_and_predict(df_test)
df_with_scores.head()



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Tokenizing (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,2
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,5
