In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from safetensors.torch import load_file
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments,EarlyStoppingCallback, DataCollatorForLanguageModeling

In [2]:
# Checkpoint identifier shared by the tokenizer and the model, so both are
# loaded from the same pretrained "distilbert-base-cased" files.
model_name = "distilbert-base-cased"
# Tokenizer for that checkpoint; tokenize_data() reads it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The same checkpoint loaded through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [3]:
# Single place to change when the input files live somewhere other than the
# Kaggle input directory (previously this path was repeated in every read).
DATA_DIR = "/kaggle/input/isec-sdc-2025"

# Training table (split into train/validation later) and the test table.
df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [4]:
# Debug shortcut: uncomment to run the notebook on a small subset of rows.
# df,test_df=df[:1000],test_df[:100]

In [5]:
# Give rows without a description an explicit placeholder, in both tables,
# so the text assembled for the tokenizer never contains a missing value here.
for frame in (df, test_df):
    frame["Description"] = frame["Description"].fillna("Not provided")

In [6]:
# Hold out 25% of the rows for validation. Stratifying on 'Priority' keeps its
# value distribution aligned between the two splits, and the fixed random_state
# makes the split repeatable across runs.
# (train_test_split is imported with the other dependencies at the top of the notebook.)
train_df, val_df = train_test_split(df, test_size=0.25, random_state=7,stratify=df['Priority'])

In [7]:
# Use the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)  # Move model to device (last expression: the cell echoes the module tree)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [8]:
def tokenize_data(components, titles, descriptions,statuss,resolutions,max_length=512):
    """Join the five text fields of each record and tokenize the result.

    Every record becomes one string of the form
    ``status [SEP] resolution [SEP] component [SEP] title [SEP] description``.
    Missing field values (``None`` / NaN) are rendered as ``"Not provided"`` --
    the placeholder this notebook already uses for empty descriptions -- so the
    literal text ``"nan"`` or ``"None"`` never reaches the model input.

    Args:
        components, titles, descriptions, statuss, resolutions: Iterables of
            per-record field values, consumed in parallel (``zip`` stops at the
            shortest one).
        max_length: Upper bound on tokens per record; longer texts are truncated.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the combined texts,
        requested as PyTorch tensors with padding and truncation enabled.
    """
    def _text(value):
        # pd.isna is True for None, float NaN and pd.NA; real strings pass through.
        return "Not provided" if pd.isna(value) else value

    combined_texts = [
        " [SEP] ".join(f"{_text(field)}" for field in record)
        for record in zip(statuss, resolutions, components, titles, descriptions)
    ]

    # Tokenize the combined texts
    inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)

    return inputs

In [9]:
def create_dataset(inputs):
    """Build a ``Dataset`` from tokenizer output.

    The dataset has three columns: ``input_ids``, ``attention_mask`` and
    ``labels``, where ``labels`` is the same object as ``input_ids``.

    NOTE(review): when batches are built by a masked-LM collator the labels are
    presumably regenerated per batch, which would make this column redundant --
    confirm before relying on it.
    """
    token_ids = inputs['input_ids']
    columns = {
        'input_ids': token_ids,
        'attention_mask': inputs['attention_mask'],
        'labels': token_ids,
    }
    return Dataset.from_dict(columns)

In [10]:
def _tokenize_frame(frame):
    """Tokenize one table from its Component/Title/Description/Status/Resolution columns."""
    return tokenize_data(
        frame["Component"],
        frame["Title"],
        frame["Description"],
        frame["Status"],
        frame["Resolution"],
    )

# Same column selection for all three splits, in the original order.
train_inputs = _tokenize_frame(train_df)
val_inputs = _tokenize_frame(val_df)
test_inputs = _tokenize_frame(test_df)

In [11]:
# Wrap the tokenized train and validation encodings as datasets (train first, then validation).
train_data, val_data = map(create_dataset, (train_inputs, val_inputs))

In [12]:
# Batch builder for masked-language modeling, configured with a 0.15 masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [13]:
def fine_tune_model(train_data,epochs=10):
    """Fine-tune the module-level ``model`` on ``train_data`` with the HF ``Trainer``.

    Evaluation runs on the module-level ``val_data`` and batches are assembled by
    the module-level ``data_collator``. Checkpoints are written under
    ``./results``. With ``load_best_model_at_end=True`` the Trainer reloads the
    best checkpoint when training finishes, so the global ``model`` is the
    object callers should save afterwards.

    Args:
        train_data: Training dataset passed to the Trainer.
        epochs: Maximum number of training epochs; early stopping may end the
            run sooner.

    Returns:
        None. The function is used for its side effects on ``model`` and on disk.
    """
    training_args = TrainingArguments(
        output_dir='./results',
        overwrite_output_dir=True,
        logging_dir='./logs',
        # Log, evaluate and checkpoint on the same 1500-step cadence so every
        # evaluation has a matching checkpoint for best-model selection.
        logging_strategy="steps",
        eval_strategy="steps",
        save_strategy="steps",
        logging_steps=1500,
        warmup_steps=2500,
        eval_steps=1500,
        save_steps=1500,
        num_train_epochs=epochs,
        per_device_eval_batch_size=64,
        per_device_train_batch_size=16,
        # gradient_accumulation_steps=2,  # Gradient accumulation step size
        seed=7,
        # Mixed precision needs a CUDA device; the notebook explicitly supports a
        # CPU fallback, so only enable fp16 when a GPU is actually present.
        fp16=torch.cuda.is_available(),
        report_to=[],
        optim="adafactor",
        weight_decay=0.01,
        load_best_model_at_end=True,
        save_total_limit=3,
        disable_tqdm=True,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        data_collator=data_collator,
        # patience=1: stop at the first evaluation that fails to improve.
        callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
    )

    # Train the model
    trainer.train()


In [14]:
# Launch fine-tuning with the function's default epoch budget (epochs=10).
fine_tune_model(train_data)

{'loss': 2.5854, 'grad_norm': 6.6048808097839355, 'learning_rate': 2.9959999999999998e-05, 'epoch': 0.6036217303822937}
{'eval_loss': 1.9829914569854736, 'eval_runtime': 167.6454, 'eval_samples_per_second': 79.03, 'eval_steps_per_second': 1.241, 'epoch': 0.6036217303822937}
{'loss': 1.9345, 'grad_norm': 5.934638500213623, 'learning_rate': 4.888814317673378e-05, 'epoch': 1.2072434607645874}
{'eval_loss': 1.6571338176727295, 'eval_runtime': 167.5606, 'eval_samples_per_second': 79.07, 'eval_steps_per_second': 1.241, 'epoch': 1.2072434607645874}
{'loss': 1.6873, 'grad_norm': 5.523426055908203, 'learning_rate': 4.5532438478747205e-05, 'epoch': 1.8108651911468812}
{'eval_loss': 1.5138893127441406, 'eval_runtime': 167.3887, 'eval_samples_per_second': 79.151, 'eval_steps_per_second': 1.243, 'epoch': 1.8108651911468812}
{'loss': 1.5596, 'grad_norm': 5.8619160652160645, 'learning_rate': 4.217897091722595e-05, 'epoch': 2.414486921529175}
{'eval_loss': 1.4545198678970337, 'eval_runtime': 167.5105,

There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


{'train_runtime': 15228.6093, 'train_samples_per_second': 26.099, 'train_steps_per_second': 1.632, 'train_loss': 1.5023322347005208, 'epoch': 8.450704225352112}


In [15]:
# Directory that receives the model weights/config and the tokenizer files.
save_directory = "./saved_model"

# Save model and tokenizer
model.save_pretrained(save_directory)
# Kept as the cell's last expression: its return value (the tokenizer file
# paths that were written) is what the notebook displays for this cell.
tokenizer.save_pretrained(save_directory)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')