In [1]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_set = pd.read_csv("spam.tsv",sep='\t')

In [3]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [4]:
data_set

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [5]:
label_map = {"ham": 0, "spam": 1}
data_set["label"] = data_set["label"].map(label_map)

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_set["message"], data_set["label"], test_size=0.2, random_state=42
)

In [7]:
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

In [9]:
train_dataset = HFDataset.from_dict({"input_ids": train_encodings["input_ids"], 
                                     "attention_mask": train_encodings["attention_mask"], 
                                     "labels": list(train_labels)})

val_dataset = HFDataset.from_dict({"input_ids": val_encodings["input_ids"], 
                                   "attention_mask": val_encodings["attention_mask"], 
                                   "labels": list(val_labels)})

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

  1%|          | 10/1674 [00:13<16:50,  1.65it/s] 

{'loss': 0.3824, 'grad_norm': 7.111941814422607, 'learning_rate': 1.9880525686977302e-05, 'epoch': 0.02}


  1%|          | 20/1674 [00:18<13:53,  1.98it/s]

{'loss': 0.1707, 'grad_norm': 4.712263107299805, 'learning_rate': 1.97610513739546e-05, 'epoch': 0.04}


  2%|▏         | 30/1674 [00:23<13:41,  2.00it/s]

{'loss': 0.0538, 'grad_norm': 0.07498916983604431, 'learning_rate': 1.9641577060931903e-05, 'epoch': 0.05}


  2%|▏         | 40/1674 [00:28<13:44,  1.98it/s]

{'loss': 0.0001, 'grad_norm': 0.003858265932649374, 'learning_rate': 1.95221027479092e-05, 'epoch': 0.07}


  3%|▎         | 50/1674 [00:33<13:44,  1.97it/s]

{'loss': 0.1186, 'grad_norm': 58.22944259643555, 'learning_rate': 1.94026284348865e-05, 'epoch': 0.09}


  4%|▎         | 60/1674 [00:38<13:40,  1.97it/s]

{'loss': 0.0802, 'grad_norm': 0.0021050695795565844, 'learning_rate': 1.9283154121863802e-05, 'epoch': 0.11}


  4%|▍         | 70/1674 [00:43<13:40,  1.95it/s]

{'loss': 0.0085, 'grad_norm': 0.000203177536604926, 'learning_rate': 1.91636798088411e-05, 'epoch': 0.13}


  5%|▍         | 80/1674 [00:48<13:25,  1.98it/s]

{'loss': 0.2218, 'grad_norm': 6.452883826568723e-05, 'learning_rate': 1.90442054958184e-05, 'epoch': 0.14}


  5%|▌         | 90/1674 [00:53<13:18,  1.98it/s]

{'loss': 0.1615, 'grad_norm': 41.78705596923828, 'learning_rate': 1.89247311827957e-05, 'epoch': 0.16}


  6%|▌         | 100/1674 [00:58<13:15,  1.98it/s]

{'loss': 0.0263, 'grad_norm': 0.027956189587712288, 'learning_rate': 1.8805256869773e-05, 'epoch': 0.18}


  7%|▋         | 110/1674 [01:03<13:00,  2.00it/s]

{'loss': 0.119, 'grad_norm': 0.004377957433462143, 'learning_rate': 1.86857825567503e-05, 'epoch': 0.2}


  7%|▋         | 120/1674 [01:08<12:56,  2.00it/s]

{'loss': 0.0661, 'grad_norm': 3.79374098777771, 'learning_rate': 1.85663082437276e-05, 'epoch': 0.22}


  8%|▊         | 130/1674 [01:13<12:55,  1.99it/s]

{'loss': 0.2384, 'grad_norm': 0.026394352316856384, 'learning_rate': 1.84468339307049e-05, 'epoch': 0.23}


  8%|▊         | 140/1674 [01:18<12:54,  1.98it/s]

{'loss': 0.1053, 'grad_norm': 19.37306785583496, 'learning_rate': 1.83273596176822e-05, 'epoch': 0.25}


  9%|▉         | 150/1674 [01:24<12:59,  1.95it/s]

{'loss': 0.0019, 'grad_norm': 0.023187125101685524, 'learning_rate': 1.82078853046595e-05, 'epoch': 0.27}


 10%|▉         | 160/1674 [01:29<12:43,  1.98it/s]

{'loss': 0.0467, 'grad_norm': 22.926443099975586, 'learning_rate': 1.80884109916368e-05, 'epoch': 0.29}


 10%|█         | 170/1674 [01:34<12:29,  2.01it/s]

{'loss': 0.1642, 'grad_norm': 0.07571879774332047, 'learning_rate': 1.79689366786141e-05, 'epoch': 0.3}


 11%|█         | 180/1674 [01:39<12:25,  2.00it/s]

{'loss': 0.093, 'grad_norm': 0.019595792517066002, 'learning_rate': 1.78494623655914e-05, 'epoch': 0.32}


 11%|█▏        | 190/1674 [01:44<12:35,  1.97it/s]

{'loss': 0.1041, 'grad_norm': 0.17543618381023407, 'learning_rate': 1.77299880525687e-05, 'epoch': 0.34}


 12%|█▏        | 200/1674 [01:49<12:27,  1.97it/s]

{'loss': 0.0725, 'grad_norm': 1.3774805068969727, 'learning_rate': 1.7610513739545997e-05, 'epoch': 0.36}


 13%|█▎        | 210/1674 [01:54<12:26,  1.96it/s]

{'loss': 0.0505, 'grad_norm': 27.63570785522461, 'learning_rate': 1.74910394265233e-05, 'epoch': 0.38}


 13%|█▎        | 220/1674 [01:59<12:27,  1.94it/s]

{'loss': 0.1598, 'grad_norm': 4.959136962890625, 'learning_rate': 1.7371565113500598e-05, 'epoch': 0.39}


 14%|█▎        | 230/1674 [02:03<08:47,  2.74it/s]

{'loss': 0.0318, 'grad_norm': 0.00039368783473037183, 'learning_rate': 1.72520908004779e-05, 'epoch': 0.41}


 14%|█▍        | 240/1674 [02:07<08:12,  2.91it/s]

{'loss': 0.2136, 'grad_norm': 16.112030029296875, 'learning_rate': 1.71326164874552e-05, 'epoch': 0.43}


 15%|█▍        | 250/1674 [02:10<08:08,  2.92it/s]

{'loss': 0.0912, 'grad_norm': 0.006452098023146391, 'learning_rate': 1.7013142174432496e-05, 'epoch': 0.45}


 16%|█▌        | 260/1674 [02:14<08:04,  2.92it/s]

{'loss': 0.205, 'grad_norm': 8.877364158630371, 'learning_rate': 1.68936678614098e-05, 'epoch': 0.47}


 16%|█▌        | 270/1674 [02:17<08:03,  2.91it/s]

{'loss': 0.1234, 'grad_norm': 0.009734982624650002, 'learning_rate': 1.6774193548387098e-05, 'epoch': 0.48}


 17%|█▋        | 280/1674 [02:21<08:02,  2.89it/s]

{'loss': 0.0117, 'grad_norm': 0.001331597683019936, 'learning_rate': 1.6654719235364398e-05, 'epoch': 0.5}


 17%|█▋        | 290/1674 [02:24<07:56,  2.91it/s]

{'loss': 0.051, 'grad_norm': 0.018027611076831818, 'learning_rate': 1.65352449223417e-05, 'epoch': 0.52}


 18%|█▊        | 300/1674 [02:27<07:59,  2.86it/s]

{'loss': 0.0002, 'grad_norm': 0.011195923201739788, 'learning_rate': 1.6415770609318996e-05, 'epoch': 0.54}


 19%|█▊        | 310/1674 [02:31<07:56,  2.86it/s]

{'loss': 0.0093, 'grad_norm': 0.018727920949459076, 'learning_rate': 1.6296296296296297e-05, 'epoch': 0.56}


 19%|█▉        | 320/1674 [02:34<07:54,  2.85it/s]

{'loss': 0.1909, 'grad_norm': 0.031552694737911224, 'learning_rate': 1.6176821983273597e-05, 'epoch': 0.57}


 20%|█▉        | 330/1674 [02:38<07:48,  2.87it/s]

{'loss': 0.0073, 'grad_norm': 0.007679588161408901, 'learning_rate': 1.6057347670250898e-05, 'epoch': 0.59}


 20%|██        | 340/1674 [02:41<07:47,  2.85it/s]

{'loss': 0.0774, 'grad_norm': 0.004063277039676905, 'learning_rate': 1.59378733572282e-05, 'epoch': 0.61}


 21%|██        | 350/1674 [02:45<07:41,  2.87it/s]

{'loss': 0.0016, 'grad_norm': 0.02911902777850628, 'learning_rate': 1.5818399044205496e-05, 'epoch': 0.63}


 22%|██▏       | 360/1674 [02:48<07:41,  2.85it/s]

{'loss': 0.0, 'grad_norm': 0.0008776908507570624, 'learning_rate': 1.5698924731182796e-05, 'epoch': 0.65}


 22%|██▏       | 370/1674 [02:52<07:32,  2.88it/s]

{'loss': 0.0, 'grad_norm': 4.1404425132895994e-07, 'learning_rate': 1.5579450418160097e-05, 'epoch': 0.66}


 23%|██▎       | 380/1674 [02:55<07:35,  2.84it/s]

{'loss': 0.1142, 'grad_norm': 0.0009599930490367115, 'learning_rate': 1.5459976105137397e-05, 'epoch': 0.68}


 23%|██▎       | 390/1674 [02:59<07:32,  2.84it/s]

{'loss': 0.0016, 'grad_norm': 2.577824670879636e-05, 'learning_rate': 1.5340501792114698e-05, 'epoch': 0.7}


 24%|██▍       | 400/1674 [03:03<07:33,  2.81it/s]

{'loss': 0.0052, 'grad_norm': 0.00010912959987763315, 'learning_rate': 1.5221027479091997e-05, 'epoch': 0.72}


 24%|██▍       | 410/1674 [03:06<07:30,  2.80it/s]

{'loss': 0.0337, 'grad_norm': 0.0004975621704943478, 'learning_rate': 1.5101553166069296e-05, 'epoch': 0.73}


 25%|██▌       | 420/1674 [03:10<07:22,  2.84it/s]

{'loss': 0.0, 'grad_norm': 4.2255920561729e-05, 'learning_rate': 1.4982078853046595e-05, 'epoch': 0.75}


 26%|██▌       | 430/1674 [03:13<07:20,  2.82it/s]

{'loss': 0.0, 'grad_norm': 0.010781770572066307, 'learning_rate': 1.4862604540023897e-05, 'epoch': 0.77}


 26%|██▋       | 440/1674 [03:17<07:19,  2.81it/s]

{'loss': 0.0642, 'grad_norm': 0.0019210216123610735, 'learning_rate': 1.4743130227001196e-05, 'epoch': 0.79}


 27%|██▋       | 450/1674 [03:20<07:16,  2.81it/s]

{'loss': 0.0012, 'grad_norm': 0.03814120218157768, 'learning_rate': 1.4623655913978497e-05, 'epoch': 0.81}


 27%|██▋       | 460/1674 [03:24<07:14,  2.79it/s]

{'loss': 0.1671, 'grad_norm': 0.0022646463476121426, 'learning_rate': 1.4504181600955796e-05, 'epoch': 0.82}


 28%|██▊       | 470/1674 [03:28<07:08,  2.81it/s]

{'loss': 0.125, 'grad_norm': 0.03169231116771698, 'learning_rate': 1.4384707287933094e-05, 'epoch': 0.84}


 29%|██▊       | 480/1674 [03:31<07:11,  2.77it/s]

{'loss': 0.0607, 'grad_norm': 22.3619327545166, 'learning_rate': 1.4265232974910395e-05, 'epoch': 0.86}


 29%|██▉       | 490/1674 [03:35<07:08,  2.76it/s]

{'loss': 0.0062, 'grad_norm': 7.294907093048096, 'learning_rate': 1.4145758661887696e-05, 'epoch': 0.88}


 30%|██▉       | 500/1674 [03:38<07:00,  2.79it/s]

{'loss': 0.0525, 'grad_norm': 0.003617681795731187, 'learning_rate': 1.4026284348864996e-05, 'epoch': 0.9}


 30%|███       | 510/1674 [03:42<07:32,  2.57it/s]

{'loss': 0.0938, 'grad_norm': 155.3232879638672, 'learning_rate': 1.3906810035842295e-05, 'epoch': 0.91}


 31%|███       | 520/1674 [03:46<06:57,  2.76it/s]

{'loss': 0.0948, 'grad_norm': 0.26205873489379883, 'learning_rate': 1.3787335722819594e-05, 'epoch': 0.93}


 32%|███▏      | 530/1674 [03:49<06:51,  2.78it/s]

{'loss': 0.0344, 'grad_norm': 0.016451505944132805, 'learning_rate': 1.3667861409796895e-05, 'epoch': 0.95}


 32%|███▏      | 540/1674 [03:53<07:05,  2.66it/s]

{'loss': 0.0044, 'grad_norm': 0.15407557785511017, 'learning_rate': 1.3548387096774194e-05, 'epoch': 0.97}


 33%|███▎      | 550/1674 [03:57<06:49,  2.74it/s]

{'loss': 0.0015, 'grad_norm': 0.004246697761118412, 'learning_rate': 1.3428912783751496e-05, 'epoch': 0.99}


                                                  
 33%|███▎      | 558/1674 [04:11<17:07,  1.09it/s]

{'eval_loss': 0.039749789983034134, 'eval_runtime': 9.4674, 'eval_samples_per_second': 117.773, 'eval_steps_per_second': 14.788, 'epoch': 1.0}


 33%|███▎      | 560/1674 [04:14<55:43,  3.00s/it]  

{'loss': 0.1501, 'grad_norm': 0.00020501927065197378, 'learning_rate': 1.3309438470728795e-05, 'epoch': 1.0}


 34%|███▍      | 570/1674 [04:18<07:59,  2.30it/s]

{'loss': 0.0001, 'grad_norm': 0.4997100532054901, 'learning_rate': 1.3189964157706094e-05, 'epoch': 1.02}


 35%|███▍      | 580/1674 [04:21<06:38,  2.75it/s]

{'loss': 0.0013, 'grad_norm': 3.161699351039715e-05, 'learning_rate': 1.3070489844683394e-05, 'epoch': 1.04}


 35%|███▌      | 590/1674 [04:25<06:43,  2.69it/s]

{'loss': 0.0, 'grad_norm': 8.755543240113184e-05, 'learning_rate': 1.2951015531660693e-05, 'epoch': 1.06}


 36%|███▌      | 600/1674 [04:29<06:32,  2.74it/s]

{'loss': 0.0, 'grad_norm': 8.022554538911209e-06, 'learning_rate': 1.2831541218637992e-05, 'epoch': 1.08}


 36%|███▋      | 610/1674 [04:32<06:27,  2.75it/s]

{'loss': 0.0, 'grad_norm': 9.190373020828702e-06, 'learning_rate': 1.2712066905615294e-05, 'epoch': 1.09}


 37%|███▋      | 620/1674 [04:36<06:32,  2.69it/s]

{'loss': 0.0, 'grad_norm': 9.005468746181577e-05, 'learning_rate': 1.2592592592592593e-05, 'epoch': 1.11}


 38%|███▊      | 630/1674 [04:40<06:27,  2.69it/s]

{'loss': 0.0, 'grad_norm': 2.9262422685860656e-05, 'learning_rate': 1.2473118279569894e-05, 'epoch': 1.13}


 38%|███▊      | 640/1674 [04:43<06:29,  2.66it/s]

{'loss': 0.0943, 'grad_norm': 4.252920552971773e-05, 'learning_rate': 1.2353643966547193e-05, 'epoch': 1.15}


 39%|███▉      | 650/1674 [04:47<06:29,  2.63it/s]

{'loss': 0.0003, 'grad_norm': 0.004399218130856752, 'learning_rate': 1.2234169653524492e-05, 'epoch': 1.16}


 39%|███▉      | 660/1674 [04:51<06:09,  2.74it/s]

{'loss': 0.0015, 'grad_norm': 0.0007806801004335284, 'learning_rate': 1.2114695340501794e-05, 'epoch': 1.18}


 40%|████      | 670/1674 [04:54<06:05,  2.74it/s]

{'loss': 0.0, 'grad_norm': 0.007345889694988728, 'learning_rate': 1.1995221027479093e-05, 'epoch': 1.2}


 41%|████      | 680/1674 [04:58<06:06,  2.71it/s]

{'loss': 0.0002, 'grad_norm': 0.004841512069106102, 'learning_rate': 1.1875746714456394e-05, 'epoch': 1.22}


 41%|████      | 690/1674 [05:02<06:00,  2.73it/s]

{'loss': 0.0, 'grad_norm': 0.00023818698537070304, 'learning_rate': 1.1756272401433692e-05, 'epoch': 1.24}


 42%|████▏     | 700/1674 [05:06<06:01,  2.70it/s]

{'loss': 0.0, 'grad_norm': 0.00023102744307834655, 'learning_rate': 1.1636798088410991e-05, 'epoch': 1.25}


 42%|████▏     | 710/1674 [05:09<05:58,  2.69it/s]

{'loss': 0.0, 'grad_norm': 0.0008088746108114719, 'learning_rate': 1.1517323775388292e-05, 'epoch': 1.27}


 43%|████▎     | 720/1674 [05:13<05:52,  2.71it/s]

{'loss': 0.0, 'grad_norm': 0.00091821129899472, 'learning_rate': 1.1397849462365593e-05, 'epoch': 1.29}


 44%|████▎     | 730/1674 [05:17<05:44,  2.74it/s]

{'loss': 0.0024, 'grad_norm': 0.00011825154797406867, 'learning_rate': 1.1278375149342893e-05, 'epoch': 1.31}


 44%|████▍     | 740/1674 [05:20<05:44,  2.71it/s]

{'loss': 0.0546, 'grad_norm': 2.5040004402399063e-05, 'learning_rate': 1.1158900836320192e-05, 'epoch': 1.33}


 45%|████▍     | 750/1674 [05:25<06:05,  2.53it/s]

{'loss': 0.0, 'grad_norm': 0.005887520499527454, 'learning_rate': 1.1039426523297491e-05, 'epoch': 1.34}


 45%|████▌     | 760/1674 [05:29<05:34,  2.73it/s]

{'loss': 0.0002, 'grad_norm': 0.004753299057483673, 'learning_rate': 1.0919952210274792e-05, 'epoch': 1.36}


 46%|████▌     | 770/1674 [05:32<05:34,  2.71it/s]

{'loss': 0.0001, 'grad_norm': 0.01187546830624342, 'learning_rate': 1.080047789725209e-05, 'epoch': 1.38}


 47%|████▋     | 780/1674 [05:36<05:27,  2.73it/s]

{'loss': 0.0001, 'grad_norm': 0.0009540148312225938, 'learning_rate': 1.0681003584229393e-05, 'epoch': 1.4}


 47%|████▋     | 790/1674 [05:40<05:32,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.0005667447112500668, 'learning_rate': 1.0561529271206692e-05, 'epoch': 1.42}


 48%|████▊     | 800/1674 [05:45<06:02,  2.41it/s]

{'loss': 0.0002, 'grad_norm': 0.0001481493527535349, 'learning_rate': 1.044205495818399e-05, 'epoch': 1.43}


 48%|████▊     | 810/1674 [05:48<05:23,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.0003603070799726993, 'learning_rate': 1.0322580645161291e-05, 'epoch': 1.45}


 49%|████▉     | 820/1674 [05:53<08:19,  1.71it/s]

{'loss': 0.1969, 'grad_norm': 0.15163317322731018, 'learning_rate': 1.020310633213859e-05, 'epoch': 1.47}


 50%|████▉     | 830/1674 [05:57<05:22,  2.62it/s]

{'loss': 0.0008, 'grad_norm': 0.011220115236938, 'learning_rate': 1.0083632019115889e-05, 'epoch': 1.49}


 50%|█████     | 840/1674 [06:01<05:06,  2.72it/s]

{'loss': 0.0005, 'grad_norm': 0.31200695037841797, 'learning_rate': 9.96415770609319e-06, 'epoch': 1.51}


 51%|█████     | 850/1674 [06:05<05:29,  2.50it/s]

{'loss': 0.0, 'grad_norm': 0.03886282444000244, 'learning_rate': 9.84468339307049e-06, 'epoch': 1.52}


 51%|█████▏    | 860/1674 [06:09<05:00,  2.71it/s]

{'loss': 0.0, 'grad_norm': 9.196437167702243e-05, 'learning_rate': 9.725209080047791e-06, 'epoch': 1.54}


 52%|█████▏    | 870/1674 [06:13<05:03,  2.65it/s]

{'loss': 0.0198, 'grad_norm': 0.006033024750649929, 'learning_rate': 9.60573476702509e-06, 'epoch': 1.56}


 53%|█████▎    | 880/1674 [06:17<04:52,  2.72it/s]

{'loss': 0.0, 'grad_norm': 0.009630905464291573, 'learning_rate': 9.48626045400239e-06, 'epoch': 1.58}


 53%|█████▎    | 890/1674 [06:20<04:49,  2.70it/s]

{'loss': 0.0005, 'grad_norm': 0.010066213086247444, 'learning_rate': 9.36678614097969e-06, 'epoch': 1.59}


 54%|█████▍    | 900/1674 [06:24<04:50,  2.67it/s]

{'loss': 0.0001, 'grad_norm': 0.00012552276893984526, 'learning_rate': 9.24731182795699e-06, 'epoch': 1.61}


 54%|█████▍    | 910/1674 [06:28<04:40,  2.72it/s]

{'loss': 0.0, 'grad_norm': 0.009303387254476547, 'learning_rate': 9.12783751493429e-06, 'epoch': 1.63}


 55%|█████▍    | 920/1674 [06:32<04:45,  2.65it/s]

{'loss': 0.0004, 'grad_norm': 0.0001148601368186064, 'learning_rate': 9.00836320191159e-06, 'epoch': 1.65}


 56%|█████▌    | 930/1674 [06:35<04:43,  2.62it/s]

{'loss': 0.0, 'grad_norm': 0.00102780491579324, 'learning_rate': 8.888888888888888e-06, 'epoch': 1.67}


 56%|█████▌    | 940/1674 [06:39<04:34,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.0002336282195756212, 'learning_rate': 8.769414575866189e-06, 'epoch': 1.68}


 57%|█████▋    | 950/1674 [06:43<04:26,  2.71it/s]

{'loss': 0.0, 'grad_norm': 0.0006156394956633449, 'learning_rate': 8.64994026284349e-06, 'epoch': 1.7}


 57%|█████▋    | 960/1674 [06:47<04:24,  2.70it/s]

{'loss': 0.0147, 'grad_norm': 0.0006184762460179627, 'learning_rate': 8.530465949820788e-06, 'epoch': 1.72}


 58%|█████▊    | 970/1674 [06:51<04:26,  2.64it/s]

{'loss': 0.0004, 'grad_norm': 0.0003445052425377071, 'learning_rate': 8.410991636798089e-06, 'epoch': 1.74}


 59%|█████▊    | 980/1674 [06:54<04:26,  2.61it/s]

{'loss': 0.0, 'grad_norm': 0.010695275850594044, 'learning_rate': 8.291517323775388e-06, 'epoch': 1.76}


 59%|█████▉    | 990/1674 [06:58<04:14,  2.69it/s]

{'loss': 0.0, 'grad_norm': 6.6307773522567e-05, 'learning_rate': 8.172043010752689e-06, 'epoch': 1.77}


 60%|█████▉    | 1000/1674 [07:02<04:12,  2.67it/s]

{'loss': 0.0334, 'grad_norm': 21.67754554748535, 'learning_rate': 8.052568697729989e-06, 'epoch': 1.79}


 60%|██████    | 1010/1674 [07:06<04:13,  2.61it/s]

{'loss': 0.0, 'grad_norm': 2.9881291993660852e-05, 'learning_rate': 7.933094384707288e-06, 'epoch': 1.81}


 61%|██████    | 1020/1674 [07:09<04:03,  2.69it/s]

{'loss': 0.0813, 'grad_norm': 1.5753902196884155, 'learning_rate': 7.813620071684589e-06, 'epoch': 1.83}


 62%|██████▏   | 1030/1674 [07:13<04:05,  2.63it/s]

{'loss': 0.0001, 'grad_norm': 0.008107843808829784, 'learning_rate': 7.694145758661888e-06, 'epoch': 1.85}


 62%|██████▏   | 1040/1674 [07:17<03:55,  2.69it/s]

{'loss': 0.0001, 'grad_norm': 0.0017486633732914925, 'learning_rate': 7.574671445639188e-06, 'epoch': 1.86}


 63%|██████▎   | 1050/1674 [07:21<03:53,  2.68it/s]

{'loss': 0.0001, 'grad_norm': 0.001046981429681182, 'learning_rate': 7.455197132616489e-06, 'epoch': 1.88}


 63%|██████▎   | 1060/1674 [07:24<03:50,  2.66it/s]

{'loss': 0.0001, 'grad_norm': 0.0022439705207943916, 'learning_rate': 7.335722819593788e-06, 'epoch': 1.9}


 64%|██████▍   | 1070/1674 [07:28<04:01,  2.50it/s]

{'loss': 0.0001, 'grad_norm': 0.002407707739621401, 'learning_rate': 7.2162485065710874e-06, 'epoch': 1.92}


 65%|██████▍   | 1080/1674 [07:32<03:40,  2.70it/s]

{'loss': 0.0, 'grad_norm': 0.00016735192912165076, 'learning_rate': 7.096774193548388e-06, 'epoch': 1.94}


 65%|██████▌   | 1090/1674 [07:36<03:40,  2.65it/s]

{'loss': 0.0, 'grad_norm': 0.0009845243766903877, 'learning_rate': 6.977299880525688e-06, 'epoch': 1.95}


 66%|██████▌   | 1100/1674 [07:40<03:39,  2.62it/s]

{'loss': 0.0002, 'grad_norm': 0.00013687102182302624, 'learning_rate': 6.857825567502987e-06, 'epoch': 1.97}


 66%|██████▋   | 1110/1674 [07:44<05:08,  1.83it/s]

{'loss': 0.0003, 'grad_norm': 0.008251231163740158, 'learning_rate': 6.738351254480287e-06, 'epoch': 1.99}


                                                   
 67%|██████▋   | 1116/1674 [07:54<02:53,  3.21it/s]

{'eval_loss': 0.06369587779045105, 'eval_runtime': 7.8456, 'eval_samples_per_second': 142.118, 'eval_steps_per_second': 17.844, 'epoch': 2.0}


 67%|██████▋   | 1120/1674 [07:57<12:09,  1.32s/it]

{'loss': 0.0, 'grad_norm': 9.519069135421887e-05, 'learning_rate': 6.618876941457587e-06, 'epoch': 2.01}


 68%|██████▊   | 1130/1674 [08:01<03:37,  2.51it/s]

{'loss': 0.0, 'grad_norm': 0.00012314715422689915, 'learning_rate': 6.499402628434887e-06, 'epoch': 2.03}


 68%|██████▊   | 1140/1674 [08:05<03:18,  2.69it/s]

{'loss': 0.0, 'grad_norm': 0.00035366040538065135, 'learning_rate': 6.379928315412187e-06, 'epoch': 2.04}


 69%|██████▊   | 1150/1674 [08:08<03:13,  2.71it/s]

{'loss': 0.0, 'grad_norm': 4.987248030374758e-05, 'learning_rate': 6.260454002389486e-06, 'epoch': 2.06}


 69%|██████▉   | 1160/1674 [08:12<03:10,  2.70it/s]

{'loss': 0.0, 'grad_norm': 0.0015357331139966846, 'learning_rate': 6.140979689366786e-06, 'epoch': 2.08}


 70%|██████▉   | 1170/1674 [08:16<03:21,  2.50it/s]

{'loss': 0.0, 'grad_norm': 1.198655809275806e-05, 'learning_rate': 6.021505376344087e-06, 'epoch': 2.1}


 70%|███████   | 1180/1674 [08:21<05:13,  1.58it/s]

{'loss': 0.0, 'grad_norm': 6.574573490070179e-05, 'learning_rate': 5.9020310633213864e-06, 'epoch': 2.11}


 71%|███████   | 1190/1674 [08:24<03:03,  2.63it/s]

{'loss': 0.0, 'grad_norm': 0.0032792401034384966, 'learning_rate': 5.782556750298687e-06, 'epoch': 2.13}


 72%|███████▏  | 1200/1674 [08:28<02:57,  2.67it/s]

{'loss': 0.0, 'grad_norm': 3.248230859753676e-05, 'learning_rate': 5.663082437275986e-06, 'epoch': 2.15}


 72%|███████▏  | 1210/1674 [08:32<02:52,  2.68it/s]

{'loss': 0.0, 'grad_norm': 0.0004579953965730965, 'learning_rate': 5.543608124253286e-06, 'epoch': 2.17}


 73%|███████▎  | 1220/1674 [08:36<02:51,  2.65it/s]

{'loss': 0.0, 'grad_norm': 3.1777130061527714e-05, 'learning_rate': 5.424133811230586e-06, 'epoch': 2.19}


 73%|███████▎  | 1230/1674 [08:39<02:53,  2.56it/s]

{'loss': 0.0, 'grad_norm': 0.0004493911110330373, 'learning_rate': 5.304659498207886e-06, 'epoch': 2.2}


 74%|███████▍  | 1240/1674 [08:43<02:42,  2.67it/s]

{'loss': 0.0001, 'grad_norm': 9.634915477363393e-05, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


 75%|███████▍  | 1250/1674 [08:47<02:40,  2.65it/s]

{'loss': 0.0, 'grad_norm': 0.0066226692870259285, 'learning_rate': 5.065710872162486e-06, 'epoch': 2.24}


 75%|███████▌  | 1260/1674 [08:51<02:34,  2.69it/s]

{'loss': 0.0, 'grad_norm': 0.0004271507787052542, 'learning_rate': 4.946236559139785e-06, 'epoch': 2.26}


 76%|███████▌  | 1270/1674 [08:55<02:31,  2.67it/s]

{'loss': 0.0, 'grad_norm': 1.3254585610411596e-05, 'learning_rate': 4.826762246117085e-06, 'epoch': 2.28}


 76%|███████▋  | 1280/1674 [08:58<02:27,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.0004307570634409785, 'learning_rate': 4.707287933094385e-06, 'epoch': 2.29}


 77%|███████▋  | 1290/1674 [09:02<02:23,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.006114112213253975, 'learning_rate': 4.587813620071685e-06, 'epoch': 2.31}


 78%|███████▊  | 1300/1674 [09:06<02:19,  2.68it/s]

{'loss': 0.0, 'grad_norm': 4.492760490393266e-05, 'learning_rate': 4.468339307048985e-06, 'epoch': 2.33}


 78%|███████▊  | 1310/1674 [09:10<02:17,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.00010743292659753934, 'learning_rate': 4.348864994026284e-06, 'epoch': 2.35}


 79%|███████▉  | 1320/1674 [09:13<02:17,  2.58it/s]

{'loss': 0.0, 'grad_norm': 9.238495294994209e-06, 'learning_rate': 4.229390681003585e-06, 'epoch': 2.37}


 79%|███████▉  | 1330/1674 [09:17<02:09,  2.66it/s]

{'loss': 0.0001, 'grad_norm': 0.0038026021793484688, 'learning_rate': 4.1099163679808845e-06, 'epoch': 2.38}


 80%|████████  | 1340/1674 [09:22<02:24,  2.31it/s]

{'loss': 0.0, 'grad_norm': 0.002320777392014861, 'learning_rate': 3.990442054958184e-06, 'epoch': 2.4}


 81%|████████  | 1350/1674 [09:26<02:00,  2.68it/s]

{'loss': 0.0, 'grad_norm': 3.1853873224463314e-05, 'learning_rate': 3.870967741935484e-06, 'epoch': 2.42}


 81%|████████  | 1360/1674 [09:29<01:57,  2.67it/s]

{'loss': 0.0, 'grad_norm': 1.8643617295310833e-05, 'learning_rate': 3.751493428912784e-06, 'epoch': 2.44}


 82%|████████▏ | 1370/1674 [09:33<01:53,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.0001953992759808898, 'learning_rate': 3.6320191158900844e-06, 'epoch': 2.46}


 82%|████████▏ | 1380/1674 [09:37<01:55,  2.54it/s]

{'loss': 0.0, 'grad_norm': 2.1793430278194137e-05, 'learning_rate': 3.5125448028673837e-06, 'epoch': 2.47}


 83%|████████▎ | 1390/1674 [09:41<01:45,  2.68it/s]

{'loss': 0.0, 'grad_norm': 5.500587576534599e-05, 'learning_rate': 3.393070489844684e-06, 'epoch': 2.49}


 84%|████████▎ | 1400/1674 [09:45<01:45,  2.61it/s]

{'loss': 0.0, 'grad_norm': 8.481036638841033e-05, 'learning_rate': 3.2735961768219836e-06, 'epoch': 2.51}


 84%|████████▍ | 1410/1674 [09:48<01:41,  2.61it/s]

{'loss': 0.0, 'grad_norm': 1.7148515325970948e-05, 'learning_rate': 3.1541218637992834e-06, 'epoch': 2.53}


 85%|████████▍ | 1420/1674 [09:52<01:35,  2.66it/s]

{'loss': 0.0, 'grad_norm': 3.5689910873770714e-05, 'learning_rate': 3.034647550776583e-06, 'epoch': 2.54}


 85%|████████▌ | 1430/1674 [09:56<01:31,  2.68it/s]

{'loss': 0.004, 'grad_norm': 0.0001907791884150356, 'learning_rate': 2.9151732377538833e-06, 'epoch': 2.56}


 86%|████████▌ | 1440/1674 [10:00<01:28,  2.65it/s]

{'loss': 0.0, 'grad_norm': 0.00015066840569488704, 'learning_rate': 2.7956989247311827e-06, 'epoch': 2.58}


 87%|████████▋ | 1450/1674 [10:04<01:27,  2.56it/s]

{'loss': 0.0, 'grad_norm': 0.001377701759338379, 'learning_rate': 2.676224611708483e-06, 'epoch': 2.6}


 87%|████████▋ | 1460/1674 [10:07<01:21,  2.64it/s]

{'loss': 0.0, 'grad_norm': 1.9892124328180216e-05, 'learning_rate': 2.556750298685783e-06, 'epoch': 2.62}


 88%|████████▊ | 1470/1674 [10:11<01:16,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.0009325690334662795, 'learning_rate': 2.4372759856630828e-06, 'epoch': 2.63}


 88%|████████▊ | 1480/1674 [10:15<01:12,  2.68it/s]

{'loss': 0.0, 'grad_norm': 5.350754508981481e-05, 'learning_rate': 2.3178016726403825e-06, 'epoch': 2.65}


 89%|████████▉ | 1490/1674 [10:19<01:08,  2.68it/s]

{'loss': 0.0, 'grad_norm': 0.0009114077547565103, 'learning_rate': 2.1983273596176823e-06, 'epoch': 2.67}


 90%|████████▉ | 1500/1674 [10:22<01:04,  2.68it/s]

{'loss': 0.0043, 'grad_norm': 0.00031011091778054833, 'learning_rate': 2.078853046594982e-06, 'epoch': 2.69}


 90%|█████████ | 1510/1674 [10:26<01:01,  2.65it/s]

{'loss': 0.0, 'grad_norm': 0.0031316441018134356, 'learning_rate': 1.9593787335722822e-06, 'epoch': 2.71}


 91%|█████████ | 1520/1674 [10:30<00:58,  2.64it/s]

{'loss': 0.0001, 'grad_norm': 0.14366506040096283, 'learning_rate': 1.839904420549582e-06, 'epoch': 2.72}


 91%|█████████▏| 1530/1674 [10:34<00:54,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.00014355884923134, 'learning_rate': 1.720430107526882e-06, 'epoch': 2.74}


 92%|█████████▏| 1540/1674 [10:38<00:49,  2.68it/s]

{'loss': 0.0, 'grad_norm': 0.0006118763121776283, 'learning_rate': 1.6009557945041817e-06, 'epoch': 2.76}


 93%|█████████▎| 1550/1674 [10:41<00:46,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.004838824737817049, 'learning_rate': 1.4814814814814815e-06, 'epoch': 2.78}


 93%|█████████▎| 1560/1674 [10:45<00:42,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.015481731854379177, 'learning_rate': 1.3620071684587816e-06, 'epoch': 2.8}


 94%|█████████▍| 1570/1674 [10:49<00:39,  2.64it/s]

{'loss': 0.0, 'grad_norm': 0.01838519424200058, 'learning_rate': 1.2425328554360814e-06, 'epoch': 2.81}


 94%|█████████▍| 1580/1674 [10:53<00:36,  2.54it/s]

{'loss': 0.0, 'grad_norm': 0.00044983133557252586, 'learning_rate': 1.1230585424133811e-06, 'epoch': 2.83}


 95%|█████████▍| 1590/1674 [10:56<00:31,  2.68it/s]

{'loss': 0.0, 'grad_norm': 0.001000594231300056, 'learning_rate': 1.0035842293906811e-06, 'epoch': 2.85}


 96%|█████████▌| 1600/1674 [11:00<00:28,  2.62it/s]

{'loss': 0.0, 'grad_norm': 0.0019461133051663637, 'learning_rate': 8.84109916367981e-07, 'epoch': 2.87}


 96%|█████████▌| 1610/1674 [11:06<00:53,  1.20it/s]

{'loss': 0.0, 'grad_norm': 0.0009965577628463507, 'learning_rate': 7.646356033452807e-07, 'epoch': 2.89}


 97%|█████████▋| 1620/1674 [11:10<00:20,  2.60it/s]

{'loss': 0.0, 'grad_norm': 0.00012694908946286887, 'learning_rate': 6.451612903225807e-07, 'epoch': 2.9}


 97%|█████████▋| 1630/1674 [11:14<00:16,  2.67it/s]

{'loss': 0.0, 'grad_norm': 0.010187671519815922, 'learning_rate': 5.256869772998806e-07, 'epoch': 2.92}


 98%|█████████▊| 1640/1674 [11:17<00:12,  2.66it/s]

{'loss': 0.0, 'grad_norm': 0.0010198992677032948, 'learning_rate': 4.062126642771804e-07, 'epoch': 2.94}


 99%|█████████▊| 1650/1674 [11:21<00:09,  2.62it/s]

{'loss': 0.0, 'grad_norm': 0.0005135126411914825, 'learning_rate': 2.867383512544803e-07, 'epoch': 2.96}


 99%|█████████▉| 1660/1674 [11:26<00:08,  1.58it/s]

{'loss': 0.0, 'grad_norm': 0.029224906116724014, 'learning_rate': 1.6726403823178018e-07, 'epoch': 2.97}


100%|█████████▉| 1670/1674 [11:30<00:01,  2.64it/s]

{'loss': 0.0, 'grad_norm': 0.0005227415822446346, 'learning_rate': 4.7789725209080046e-08, 'epoch': 2.99}


                                                   
100%|██████████| 1674/1674 [11:41<00:00,  3.35it/s]

{'eval_loss': 0.06016705557703972, 'eval_runtime': 7.7486, 'eval_samples_per_second': 143.897, 'eval_steps_per_second': 18.068, 'epoch': 3.0}


100%|██████████| 1674/1674 [11:43<00:00,  2.38it/s]

{'train_runtime': 703.3076, 'train_samples_per_second': 19.012, 'train_steps_per_second': 2.38, 'train_loss': 0.029778581343443626, 'epoch': 3.0}





TrainOutput(global_step=1674, training_loss=0.029778581343443626, metrics={'train_runtime': 703.3076, 'train_samples_per_second': 19.012, 'train_steps_per_second': 2.38, 'total_flos': 2340429376636764.0, 'train_loss': 0.029778581343443626, 'epoch': 3.0})

In [20]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should be > 0 if GPU is available
print(torch.cuda.current_device())  # Should return a valid device index
print(torch.cuda.get_device_name(0))  # Prints your GPU name


True
1
0
NVIDIA GeForce RTX 4060 Laptop GPU


In [12]:
model.save_pretrained("./fine_tuned_spam_model")
tokenizer.save_pretrained("./fine_tuned_spam_model")

print("Fine-tuning complete. Model saved.")

Fine-tuning complete. Model saved.


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Path to your fine-tuned model
fine_tuned_model_path = "./fine_tuned_spam_model"  # Change this to the correct path

# Load the tokenizer and fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path)

# Set the model to evaluation mode
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [6]:
def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()

    label_map = {0: "ham", 1: "spam"}
    return label_map[prediction]

sentence = """we are calling from union bank of india the credit card declined last week
we would help you to redeem the card and give you free expensive gifts aling with it so please give the OTP to redeem the gift"""
print(f"Predicted Label: {classify_text(sentence)}")



Predicted Label: spam
