In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments,EarlyStoppingCallback
import pandas as pd
from datasets import Dataset
import torch
import numpy as np
from safetensors.torch import load_file

In [2]:
# Read the competition's training and test tables from the Kaggle input mount.
df = pd.read_csv("/kaggle/input/isec-sdc-2025/train.csv")
test_df = pd.read_csv("/kaggle/input/isec-sdc-2025/test.csv")

In [3]:
# Debug shortcut: uncomment the next line to run the whole notebook on a small subset.
# df,test_df=df[:1000],test_df[:100]

In [4]:
# Replace missing descriptions with a fixed placeholder in both tables so the
# text that is later concatenated never contains NaN.
for _frame in (df, test_df):
    _frame["Description"] = _frame["Description"].fillna("Not provided")

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation; stratifying on Priority
# keeps the class proportions the same in both splits.
train_df, val_df = train_test_split(
    df,
    test_size=0.25,
    random_state=7,
    stratify=df["Priority"],
)

In [6]:
# Sanity-check the (rows, columns) of each split, in train / val / test order.
for _frame in (train_df, val_df, test_df):
    print(_frame.shape)

(39745, 7)
(13249, 7)
(1328, 6)


In [7]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are inversely proportional to class frequency, so rare
# priorities contribute more to the loss. They are computed over the full
# labelled table (train + validation rows).
priority_labels = df["Priority"]
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(priority_labels),
    y=priority_labels,
)

print(class_weights)

[1.1851504  1.12609435 0.34020672 4.63436817 8.84707846]


In [8]:
# Tokenizer and a DistilBERT classifier with a freshly initialised 5-way head.
# `trust_remote_code` is intentionally not enabled: distilbert-base-cased is a
# built-in architecture, so there is no reason to allow hub-supplied code to run.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=5,
    output_hidden_states=True,
)

# Overlay the weights stored in the separately saved checkpoint. `load_file`
# comes from the import cell at the top of the notebook.
state_dict = load_file("/kaggle/input/bert-mlm/saved_model/model.safetensors")

# strict=False is needed because the checkpoint has no classification head (and
# carries extra heads this model does not have). It would, however, also hide a
# checkpoint that failed to provide the encoder weights, so verify that the
# only keys left uninitialised belong to the classification head.
load_result = model.load_state_dict(state_dict, strict=False)
_head_prefixes = ("pre_classifier.", "classifier.")
_missing_encoder_keys = [
    key for key in load_result.missing_keys if not key.startswith(_head_prefixes)
]
if _missing_encoder_keys:
    raise RuntimeError(
        "Checkpoint did not provide these non-head weights: "
        f"{_missing_encoder_keys}"
    )

# Display the missing/unexpected key report as the cell output.
load_result

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


_IncompatibleKeys(missing_keys=['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias'], unexpected_keys=['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight'])

In [9]:
def filter_data(df, components, titles, descriptions, max_length=512):
    """Keep only the rows whose combined text fits within ``max_length`` tokens.

    Each row's text is built as ``"{component} [SEP] {title} [SEP] {description}"``
    and tokenized with the module-level ``tokenizer`` (special tokens included).

    Args:
        df: DataFrame to filter. It is NOT modified; a filtered copy is returned.
        components, titles, descriptions: Iterables aligned row-for-row with
            ``df`` that supply the three text fields.
        max_length: Maximum allowed token count (inclusive) for a row to be kept.

    Returns:
        A new DataFrame with the rows of ``df`` whose token count is
        ``<= max_length``, plus a ``token_count`` column holding that count.

    Raises:
        ValueError: If the text fields do not yield exactly one entry per row
            of ``df`` (raised by pandas on column assignment).
    """
    combined_texts = [
        f"{component} [SEP] {title} [SEP] {description}"
        for component, title, description in zip(components, titles, descriptions)
    ]
    token_counts = [
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in combined_texts
    ]

    # Work on a copy so the caller's frame (possibly a slice of a larger frame)
    # is never mutated.
    annotated = df.copy()
    annotated["token_count"] = token_counts

    # Honour the caller-supplied limit instead of a hard-coded 512.
    return annotated[annotated["token_count"] <= max_length].copy()
# Drop training rows whose combined text exceeds the default token limit.
train_df = filter_data(
    train_df,
    train_df["Component"],
    train_df["Title"],
    train_df["Description"],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1589 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Put the model and the per-class loss weights on that same device.
model.to(device)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [11]:
def tokenize_data(components, titles, descriptions, max_length=512):
    """Tokenize the three text fields of every row as one sequence.

    Each row becomes ``"{component} [SEP] {title} [SEP] {description}"`` and is
    passed to the module-level ``tokenizer``. Sequences are truncated to
    ``max_length`` tokens and padded to the longest sequence in the call.

    Returns:
        The tokenizer output with PyTorch tensors (``return_tensors="pt"``).
    """
    merged = []
    for component, title, description in zip(components, titles, descriptions):
        merged.append(f"{component} [SEP] {title} [SEP] {description}")

    return tokenizer(
        merged,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    )

In [12]:
def create_dataset(inputs, labels):
    """Bundle tokenizer output and labels into a ``datasets.Dataset``.

    Args:
        inputs: Mapping that provides ``input_ids`` and ``attention_mask``.
        labels: One label per row, stored under the ``labels`` column.

    Returns:
        A ``Dataset`` with the columns ``input_ids``, ``attention_mask`` and
        ``labels``.
    """
    columns = {field: inputs[field] for field in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)

In [13]:
# Tokenize each split the same way: Component, Title and Description per row.
train_inputs, val_inputs, test_inputs = (
    tokenize_data(frame["Component"], frame["Title"], frame["Description"])
    for frame in (train_df, val_df, test_df)
)

In [14]:
# Pair each split's encodings with its Priority labels.
train_data, val_data = (
    create_dataset(encoded, frame["Priority"])
    for encoded, frame in ((train_inputs, train_df), (val_inputs, val_df))
)

In [15]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy scaled per sample by class weight.

    The weights come from the module-level ``class_weights_tensor``, indexed by
    each sample's label.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the mean class-weighted cross-entropy for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass so the
        model returns raw logits rather than computing its own loss.
        ``num_items_in_batch`` is accepted for interface compatibility and is
        not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        model_output = model(**inputs)

        # reduction="none" keeps one loss value per sample so that each can be
        # scaled by the weight of its own class before averaging.
        criterion = torch.nn.CrossEntropyLoss(reduction="none")
        per_sample_loss = criterion(model_output.logits, targets)
        per_sample_weight = class_weights_tensor[targets]
        batch_loss = (per_sample_loss * per_sample_weight).mean()

        if return_outputs:
            return (batch_loss, model_output)
        return batch_loss

In [16]:
def fine_tune_model(train_data,val_data,epochs=3):
    """Fine-tune the module-level ``model`` with class-weighted loss.

    Builds a ``WeightedTrainer`` around the module-level ``model`` and
    ``tokenizer`` and runs training. The model is updated in place; nothing is
    returned.

    Args:
        train_data: Dataset used for training.
        val_data: Dataset evaluated every 1000 steps; these evaluations drive
            early stopping and best-checkpoint selection.
        epochs: Maximum number of training epochs.
    """
    from transformers import DataCollatorWithPadding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    training_args = TrainingArguments(
        output_dir='./results',
        overwrite_output_dir=True,
        logging_dir='./logs',
        logging_strategy="steps",
        logging_steps=1000,
        warmup_steps=600,
        # Evaluate and checkpoint on the same schedule so that the best
        # checkpoint can be restored at the end of training.
        eval_steps=1000,
        save_steps=1000,
        num_train_epochs=epochs,
        per_device_eval_batch_size=64,
        per_device_train_batch_size=8,
        # gradient_accumulation_steps=2,  # Gradient accumulation step size
        seed=7,
        # Mixed precision needs a GPU; enable it only when CUDA is present so
        # the CPU fallback chosen when selecting `device` still works.
        fp16=torch.cuda.is_available(),
        report_to=[],
        optim="adafactor",
        weight_decay=0.01,
        eval_strategy="steps", 
        save_strategy="steps",
        load_best_model_at_end=True,
        save_total_limit=2,
        disable_tqdm=True,
        # learning_rate=2e-05
    )

    # Initialize Trainer
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        # Stop as soon as one evaluation fails to improve on the best so far.
        callbacks = [EarlyStoppingCallback(early_stopping_patience=1)],
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()


In [17]:
# Fine-tune on the training split, evaluating on the validation split
# (uses the function's default of 3 epochs).
fine_tune_model(train_data,val_data)

{'loss': 1.5975, 'grad_norm': 815598.1875, 'learning_rate': 4.6725605762933864e-05, 'epoch': 0.4472271914132379}
{'eval_loss': 1.5295149087905884, 'eval_runtime': 113.5198, 'eval_samples_per_second': 116.711, 'eval_steps_per_second': 0.916, 'epoch': 0.4472271914132379}
{'loss': 1.4622, 'grad_norm': 330091.21875, 'learning_rate': 3.8539620170268504e-05, 'epoch': 0.8944543828264758}
{'eval_loss': 1.521209478378296, 'eval_runtime': 113.5129, 'eval_samples_per_second': 116.718, 'eval_steps_per_second': 0.916, 'epoch': 0.8944543828264758}
{'loss': 1.3967, 'grad_norm': 714534.3125, 'learning_rate': 3.0353634577603147e-05, 'epoch': 1.341681574239714}
{'eval_loss': 1.4055826663970947, 'eval_runtime': 113.5663, 'eval_samples_per_second': 116.663, 'eval_steps_per_second': 0.916, 'epoch': 1.341681574239714}
{'loss': 1.3113, 'grad_norm': 400739.78125, 'learning_rate': 2.2167648984937787e-05, 'epoch': 1.7889087656529516}
{'eval_loss': 1.4973418712615967, 'eval_runtime': 113.5459, 'eval_samples_per_

In [18]:
# Number of training examples that remain after the token-length filter.
len(train_data)

35767

In [19]:
def get_embeddings(inputs,batch_size=64):
    """Extract one embedding vector per row from the module-level ``model``.

    The model is put in eval mode and run without gradients over ``inputs`` in
    batches. For every row, the last hidden state at sequence position 0 is
    taken as the embedding (the original code labels this the CLS token).

    Args:
        inputs: Mapping with ``input_ids`` and ``attention_mask`` tensors whose
            first dimension indexes rows.
        batch_size: Number of rows per forward pass.

    Returns:
        A DataFrame with one row per input row and one column per hidden
        dimension.
    """
    model.eval()
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    num_rows = len(input_ids)
    # Ceiling division: the previous `(n // batch_size) + 1` reported one batch
    # too many whenever n was an exact multiple of batch_size.
    total_batches = -(-num_rows // batch_size)

    all_embeddings = []
    with torch.no_grad():  # Inference only; no gradients needed
        for batch_number, start in enumerate(range(0, num_rows, batch_size), start=1):
            print(f"Processing batch {batch_number}/{total_batches}")

            # Slice the batch and move it to the model's device
            stop = start + batch_size
            batch_input_ids = input_ids[start:stop].to(model.device)
            batch_attention_mask = attention_mask[start:stop].to(model.device)

            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                output_hidden_states=True,
            )

            # Last layer, sequence position 0; moved to CPU as a NumPy array
            cls_embeddings = outputs.hidden_states[-1][:, 0, :].cpu().numpy()
            all_embeddings.extend(cls_embeddings)

    # One DataFrame row per embedding vector
    return pd.DataFrame(all_embeddings)

In [20]:
# Embed every split with the fine-tuned model, in train / val / test order.
train_embedded, val_embedded, test_embedded = (
    get_embeddings(encoded)
    for encoded in (train_inputs, val_inputs, test_inputs)
)

Processing batch 1/559
Processing batch 2/559
Processing batch 3/559
Processing batch 4/559
Processing batch 5/559
Processing batch 6/559
Processing batch 7/559
Processing batch 8/559
Processing batch 9/559
Processing batch 10/559
Processing batch 11/559
Processing batch 12/559
Processing batch 13/559
Processing batch 14/559
Processing batch 15/559
Processing batch 16/559
Processing batch 17/559
Processing batch 18/559
Processing batch 19/559
Processing batch 20/559
Processing batch 21/559
Processing batch 22/559
Processing batch 23/559
Processing batch 24/559
Processing batch 25/559
Processing batch 26/559
Processing batch 27/559
Processing batch 28/559
Processing batch 29/559
Processing batch 30/559
Processing batch 31/559
Processing batch 32/559
Processing batch 33/559
Processing batch 34/559
Processing batch 35/559
Processing batch 36/559
Processing batch 37/559
Processing batch 38/559
Processing batch 39/559
Processing batch 40/559
Processing batch 41/559
Processing batch 42/559
P

In [21]:
# Give every split a fresh 0..n-1 index so that the column-wise concat with the
# (0..n-1 indexed) embedding frames lines up row for row.
train_df, val_df, test_df = (
    frame.reset_index(drop=True)
    for frame in (train_df, val_df, test_df)
)

In [22]:
# Append the embedding columns to the right of each split's original columns.
train_embedded_df, val_embedded_df, test_embedded_df = (
    pd.concat([frame, embedded], axis=1)
    for frame, embedded in (
        (train_df, train_embedded),
        (val_df, val_embedded),
        (test_df, test_embedded),
    )
)

In [23]:
# Write each embedded split to the Kaggle working directory, without the index.
for _frame, _path in (
    (train_embedded_df, "/kaggle/working/train_embedded_df.csv"),
    (val_embedded_df, "/kaggle/working/val_embedded_df.csv"),
    (test_embedded_df, "/kaggle/working/test_embedded_df.csv"),
):
    _frame.to_csv(_path, index=False)

In [24]:
# Target directory (relative to the current working directory) for the
# fine-tuned model and its tokenizer.
save_directory = "./saved_model"

# Save the model and the tokenizer into the same directory; the tokenizer call
# is the cell's last expression, so the file paths it returns are displayed.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')