#Installing Dependencies



In [1]:
!pip install -q transformers datasets peft accelerate scikit-learn torch


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Python313\python.exe -m pip install --upgrade pip


#Importing Libraries

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


#Loading  Dataset(Already preprocessed) & Splitting

In [None]:
# Load dataset (ArXiv classification)
dataset = load_dataset("ccdv/arxiv-classification", split="train[:20000]")
dataset = dataset.train_test_split(test_size=0.1)

# Map short codes to full names for better readability
label_map = {
    "cs": "Computer Science",
    "cs.AI": "Artificial Intelligence",
    "cs.AR": "Hardware Architecture",
    "cs.CC": "Computational Complexity",
    "cs.CE": "Computational Engineering, Finance, and Science",
    "cs.CG": "Computational Geometry",
    "cs.CL": "Computation and Language",
    "cs.CR": "Cryptography and Security",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.CY": "Computers and Society",
    "cs.DB": "Databases",
    "cs.DC": "Distributed, Parallel, and Cluster Computing",
    "cs.DL": "Digital Libraries",
    "cs.DM": "Discrete Mathematics",
    "cs.DS": "Data Structures and Algorithms",
    "cs.ET": "Emerging Technologies",
    "cs.FL": "Formal Languages and Automata Theory",
    "cs.GL": "General Literature",
    "cs.GR": "Graphics",
    "cs.GT": "Game Theory",
    "cs.HC": "Human-Computer Interaction",
    "cs.IR": "Information Retrieval",
    "cs.IT": "Information Theory",
    "cs.LG": "Machine Learning",
    "cs.LO": "Logic in Computer Science",
    "cs.MA": "Multiagent Systems",
    "cs.MM": "Multimedia",
    "cs.MS": "Mathematical Software",
    "cs.NA": "Numerical Analysis",
    "cs.NE": "Neural and Evolutionary Computing",
    "cs.NI": "Networking and Internet Architecture",
    "cs.OH": "Other Computer Science",
    "cs.OS": "Operating Systems",
    "cs.PF": "Performance",
    "cs.PL": "Programming Languages",
    "cs.RO": "Robotics",
    "cs.SD": "Sound",
    "cs.SE": "Software Engineering",
    "cs.SI": "Social and Information Networks",
    "cs.SY": "Systems and Control",

    "math": "Mathematics",
    "math.AC": "Commutative Algebra",
    "math.AG": "Algebraic Geometry",
    "math.AP": "Analysis of PDEs",
    "math.AT": "Algebraic Topology",
    "math.CA": "Classical Analysis and ODEs",
    "math.CO": "Combinatorics",
    "math.CT": "Category Theory",
    "math.CV": "Complex Variables",
    "math.DG": "Differential Geometry",
    "math.DS": "Dynamical Systems",
    "math.FA": "Functional Analysis",
    "math.GM": "General Mathematics",
    "math.GN": "General Topology",
    "math.GR": "Group Theory",
    "math.GT": "Geometric Topology",
    "math.HO": "History and Overview",
    "math.IT": "Information Theory",
    "math.KT": "K-Theory and Homology",
    "math.LO": "Logic",
    "math.MG": "Metric Geometry",
    "math.MP": "Mathematical Physics",
    "math.NA": "Numerical Analysis",
    "math.NT": "Number Theory",
    "math.OA": "Operator Algebras",
    "math.OC": "Optimization and Control",
    "math.PR": "Probability",
    "math.QA": "Quantum Algebra",
    "math.RA": "Rings and Algebras",
    "math.RT": "Representation Theory",
    "math.SG": "Symplectic Geometry",
    "math.SP": "Spectral Theory",
    "math.ST": "Statistics Theory",

    "physics": "Physics",
    "astro-ph": "Astrophysics",
    "cond-mat": "Condensed Matter",
    "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
    "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
    "cond-mat.mtrl-sci": "Materials Science",
    "cond-mat.other": "Other Condensed Matter",
    "cond-mat.quant-gas": "Quantum Gases",
    "cond-mat.soft": "Soft Condensed Matter",
    "cond-mat.stat-mech": "Statistical Mechanics",
    "cond-mat.str-el": "Strongly Correlated Electrons",
    "cond-mat.supr-con": "Superconductivity",

    "gr-qc": "General Relativity and Quantum Cosmology",
    "hep-ex": "High Energy Physics - Experiment",
    "hep-lat": "High Energy Physics - Lattice",
    "hep-ph": "High Energy Physics - Phenomenology",
    "hep-th": "High Energy Physics - Theory",
    "math-ph": "Mathematical Physics",
    "nucl-ex": "Nuclear Experiment",
    "nucl-th": "Nuclear Theory",
    "quant-ph": "Quantum Physics",

    "q-bio": "Quantitative Biology",
    "q-bio.BM": "Biomolecules",
    "q-bio.CB": "Cell Behavior",
    "q-bio.GN": "Genomics",
    "q-bio.MN": "Molecular Networks",
    "q-bio.NC": "Neurons and Cognition",
    "q-bio.OT": "Other Quantitative Biology",
    "q-bio.PE": "Populations and Evolution",
    "q-bio.QM": "Quantitative Methods",
    "q-bio.SC": "Subcellular Processes",
    "q-bio.TO": "Tissues and Organs",

    "q-fin": "Quantitative Finance",
    "q-fin.CP": "Computational Finance",
    "q-fin.EC": "Economics",
    "q-fin.GN": "General Finance",
    "q-fin.MF": "Mathematical Finance",
    "q-fin.PM": "Portfolio Management",
    "q-fin.PR": "Pricing of Securities",
    "q-fin.RM": "Risk Management",
    "q-fin.ST": "Statistical Finance",
    "q-fin.TR": "Trading and Market Microstructure",

    "stat": "Statistics",
    "stat.AP": "Applications",
    "stat.CO": "Computation",
    "stat.ME": "Methodology",
    "stat.ML": "Machine Learning",
    "stat.OT": "Other Statistics",
    "stat.TH": "Statistics Theory",

    "eess": "Electrical Engineering and Systems Science",
    "eess.AS": "Audio and Speech Processing",
    "eess.IV": "Image and Video Processing",
    "eess.SP": "Signal Processing",
    "eess.SY": "Systems and Control",

    "econ": "Economics"
}


# Get labels from dataset features
labels = dataset['train'].features['label'].names
num_labels = len(labels)
id2label = {i: label_map.get(label, label) for i, label in enumerate(labels)}
label2id = {label_map.get(label, label): i for i, label in enumerate(labels)}

print(f"Classes: {list(id2label.values())}")

Classes: ['Commutative Algebra', 'Computer Vision and Pattern Recognition', 'Artificial Intelligence', 'Systems and Control', 'Group Theory', 'Computational Engineering, Finance, and Science', 'Programming Languages', 'Information Theory', 'Data Structures and Algorithms', 'Neural and Evolutionary Computing', 'Statistics Theory']


#Loading Toknizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_ds = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 18000/18000 [08:57<00:00, 33.47 examples/s]
Map: 100%|██████████| 2000/2000 [00:59<00:00, 33.40 examples/s]


In [5]:
tokenized_ds.save_to_disk("tokenized_deberta_ds")



Saving the dataset (0/3 shards):   0%|          | 0/18000 [00:00<?, ? examples/s]

Saving the dataset (3/3 shards): 100%|██████████| 18000/18000 [00:01<00:00, 14842.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2000/2000 [00:00<00:00, 5613.40 examples/s]


#Intializing Model and Peft Lora Adapters

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    target_modules=["query_proj", "key_proj", "value_proj", "dense"]

)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 684,299 || all params: 142,587,670 || trainable%: 0.4799


#Fine Tuning Model

In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    acc = accuracy_score(labels, np.argmax(predictions, axis=1))
    return {"accuracy": acc}


trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",
        learning_rate=2e-4,
        per_device_train_batch_size=16,
        num_train_epochs=10,
        optim="adamw_torch",
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        eval_strategy="epoch",
        save_strategy="epoch",
        fp16=torch.cuda.is_available(),
        logging_steps=50
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
[34m[1mwandb[0m: Currently logged in as: [33mshaik_mohammhd[0m ([33msrmap[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0165,0.886007,0.7365
2,0.6824,0.6451,0.802
3,0.5836,0.609305,0.8225


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0165,0.886007,0.7365
2,0.6824,0.6451,0.802
3,0.5836,0.609305,0.8225
4,0.537,0.533721,0.8395
5,0.5039,0.514107,0.853
6,0.4256,0.549172,0.8475
7,0.3694,0.517977,0.86
8,0.2883,0.524599,0.858
9,0.291,0.5315,0.8595
10,0.2411,0.529682,0.861


TrainOutput(global_step=11250, training_loss=0.5673944529215494, metrics={'train_runtime': 5872.3798, 'train_samples_per_second': 30.652, 'train_steps_per_second': 1.916, 'total_flos': 2.422719811584e+16, 'train_loss': 0.5673944529215494, 'epoch': 10.0})

#Saving the model

In [8]:
save_path = "./final_deberta_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"\nModel saved successfully at: {save_path}")


Model saved successfully at: ./final_deberta_model


#Evaluation

In [11]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from peft import PeftModel
from torch.utils.data import DataLoader
from tqdm import tqdm

# 1. Load Tokenizer and Model
base_model_name = "microsoft/deberta-v3-small"
adapter_path = "./final_deberta_model"

print(f"Loading tokenizer from {base_model_name}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print(f"Loading base model {base_model_name}...")
num_labels = 11
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name, 
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)

print(f"Loading adapter from {adapter_path}...")
model = PeftModel.from_pretrained(base_model, adapter_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# 2. Prepare Test Data
# Remove the 'text' column so the collator doesn't crash on strings
test_dataset = tokenized_ds["test"].remove_columns(["text"])

# Use DataCollatorWithPadding which handles padding automatically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

test_loader = DataLoader(
    test_dataset, 
    batch_size=16, 
    collate_fn=data_collator
)

# 3. Run Inference
print("Running inference on test set...")
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        logits = outputs.logits
        
        # Get predictions
        preds = torch.argmax(logits, dim=1)
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

# 4. Calculate Metrics
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

# 5. Print Results
print("\n====== FINAL EVALUATION ======")
print(f"Accuracy:        {accuracy:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall:    {macro_recall:.4f}")
print(f"Macro F1-Score:  {macro_f1:.4f}")
print("-" * 30)
print(f"Weighted Precision: {precision:.4f}")
print(f"Weighted Recall:    {recall:.4f}")
print(f"Weighted F1-Score:  {f1:.4f}")
print("==============================")

Loading tokenizer from microsoft/deberta-v3-small...




Loading base model microsoft/deberta-v3-small...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading adapter from ./final_deberta_model...
Running inference on test set...


100%|██████████| 125/125 [12:20<00:00,  5.92s/it]


Accuracy:        0.8825
Macro Precision: 0.8789
Macro Recall:    0.8837
Macro F1-Score:  0.8777
------------------------------
Weighted Precision: 0.8872
Weighted Recall:    0.8825
Weighted F1-Score:  0.8819





#Inference

In [12]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return id2label[logits.argmax().item()]

# Test
abstract = "We present a new neural network architecture for image classification."
print(f"Prediction: {predict(abstract)}")

Prediction: Neural and Evolutionary Computing
