In [2]:
!pip install transformers datasets



In [3]:
#Kaggle API Setup
# `files` is the Colab upload helper, so this cell only works inside Google Colab.
from google.colab import files
import pandas as pd

# Ask for the Kaggle API credentials file; the recorded output shows it being
# saved as kaggle.json in the current working directory.
print("Please upload the kaggle.json file")
files.upload()

# Copy the credentials into ~/.kaggle/ (presumably where the kaggle CLI used in the
# next cell looks for them) and restrict the file to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Please upload the kaggle.json file


Saving kaggle.json to kaggle.json


In [4]:
#Download the dataset directly from kaggle
# Fetches sms-spam-collection-dataset.zip for the uciml/sms-spam-collection-dataset
# dataset (the recorded output shows it landing in /content).
!kaggle datasets download  -d uciml/sms-spam-collection-dataset
# Extract quietly (-q); the next cell reads spam.csv from the extracted files.
!unzip -q sms-spam-collection-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 515MB/s]


In [5]:
#Read the raw SMS data into a pandas DataFrame and preview the first rows
# The file is decoded as latin-1, as passed explicitly below.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
#Keep only the two useful columns and give them descriptive names
# The three "Unnamed" columns are empty in the preview above, and v1/v2 hold the
# class (ham/spam) and the message text respectively.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#Convert labels from (ham, spam) to binary (0, 1)
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on labels that are already 0/1 would silently wipe the
# whole column. Only map while the column is not yet integer-typed, and fail
# loudly if any value is not one of the expected classes.
label_to_id = {'ham': 0, 'spam': 1}
if not pd.api.types.is_integer_dtype(df['label']):
  mapped_labels = df['label'].map(label_to_id)
  if mapped_labels.isna().any():
    raise ValueError("Unexpected values in 'label' column; expected only 'ham' or 'spam'")
  df['label'] = mapped_labels.astype('int64')
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Defensive cleanup: remove a stray 'labels' column if one is present.
# With the cells above, df only contains 'label' and 'text', so this normally
# changes nothing; errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=['labels'], errors='ignore')

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Structural overview: column names, non-null counts and dtypes
print("\nDataset Info")
df.info()

# Class balance: how many messages carry each label (0 = ham, 1 = spam)
label_counts = df['label'].value_counts()
print("\nLabel Distribution")
print(label_counts)


Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB

Label Distribution
label
0    4825
1     747
Name: count, dtype: int64


## First time trying and using a tokenizer
For this project I will be using the DistilBERT model and its tokenizer.


In [10]:
from transformers import DistilBertTokenizer

#Load the pretrained tokenizer; model_name is reused later to load the matching model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

#Use the first message in the dataframe as a worked example
sample_text = df['text'].iloc[0]
print(f"Original Text: {sample_text}")

#Tokenize the example; return_tensors="pt" asks for PyTorch tensors (see the tensor(...) output)
tokenized_output = tokenizer(
    sample_text,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(f"Tokenized Output: {tokenized_output}")

#The input ids are integers; they can be mapped back to the word-piece tokens they stand for
sample_input_ids = tokenized_output['input_ids']
print("\nInput IDs: ", sample_input_ids)
print("Tokens: ", tokenizer.convert_ids_to_tokens(sample_input_ids[0]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Original Text: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Tokenized Output: {'input_ids': tensor([[  101,  2175,  2127, 18414, 17583,  2391,  1010,  4689,  1012,  1012,
          2800,  2069,  1999, 11829,  2483,  1050,  2307,  2088,  2474,  1041,
         28305,  1012,  1012,  1012, 25022,  2638,  2045,  2288, 26297, 28194,
          1012,  1012,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Input IDs:  tensor([[  101,  2175,  2127, 18414, 17583,  2391,  1010,  4689,  1012,  1012,
          2800,  2069,  1999, 11829,  2483,  1050,  2307,  2088,  2474,  1041,
         28305,  1012,  1012,  1012, 25022,  2638,  2045,  2288, 26297, 28194,
          1012,  1012,  1012,   102]])
Tokens:  ['[CLS]', 'go', 'until', 'ju', '##rong', 'point', ',', 'crazy', '.', '.', 'available', 'only', 'in', 'bug', '##is', 'n', 'gre

Now that we have an idea of how to use the tokenizer, we will extend this approach to the entire dataset.


In [11]:
from datasets import Dataset

#First step would be to convert the pandas dataframe into a hugging face Dataset
# The Dataset object is what the batched .map() tokenization below operates on.
dataset = Dataset.from_pandas(df)

#Now defining the tokenization function
def tokenize_function(examples):
  """Tokenize the "text" field of a batch of dataset examples.

  Args:
    examples: A batch of rows from the dataset; only examples["text"] is read.

  Returns:
    The tokenizer's output for the batch. When used with Dataset.map below it
    adds the 'input_ids' and 'attention_mask' columns to the dataset.
  """
  batch_texts = examples["text"]
  # Pad to the maximum length and truncate anything longer, per the tokenizer options below.
  encoded_batch = tokenizer(batch_texts, padding="max_length", truncation=True)
  return encoded_batch



#Calling the function on the entire dataset
# batched=True hands tokenize_function a batch of rows at a time instead of one row per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


#Print the new result
# The summary should now list 'input_ids' and 'attention_mask' next to 'label' and 'text'.
print(tokenized_dataset)

Map:   0%|          | 0/5572 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 5572
})


Now that the dataset is tokenized, we will split it into training and test sets.


In [12]:
# Hold out 20% of the rows as the "test" split; seed=42 is fixed so the split can be reproduced.
# NOTE(review): the split is not stratified, and only 747 of 5572 messages are spam —
# consider stratifying on the label so both splits keep the same class ratio.
final_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Show the resulting DatasetDict with its "train" and "test" splits.
print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 4457
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1115
    })
})


In [13]:
# Check which GPU the runtime exposes before training (the recorded run shows a Tesla T4).
!nvidia-smi


Wed Aug 27 14:50:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Now we can import the model and start fine-tuning it.


In [15]:
#Loading the pretrained model

from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score


#Loading the model
# num_labels=2 tells the model that our target variable is binary (0 = ham, 1 = spam).
# The recorded output warns that the classifier weights are newly initialized, i.e. the
# classification head starts untrained and is learned during the fine-tuning below.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)


#Next, we will define a function to compute metrics

def compute_metrics(pred):
  """Compute the accuracy reported by the Trainer at each evaluation.

  Args:
    pred: Prediction object exposing `predictions` (the model's raw output,
      a pair of logits per message) and `label_ids` (the true labels).

  Returns:
    A dict with a single "accuracy" entry; the Trainer expects metrics to be
    returned in a dictionary.
  """
  # Each row of raw output is a pair of logits such as [-1.2, 2.5]. argmax over
  # the last axis returns the index of the larger logit, which is the predicted
  # class: for [-1.2, 2.5] that is index 1, i.e. spam.
  predicted_classes = np.argmax(pred.predictions, axis=-1)
  true_labels = pred.label_ids
  return {"accuracy": accuracy_score(true_labels, predicted_classes)}


#Next we need to define training arguments

training_args = TrainingArguments(
    output_dir="./results",  #directory passed to the Trainer for its outputs
    num_train_epochs=2,  #two full passes over the training split
    per_device_train_batch_size=16,  #num of training examples to use in a single batch
    eval_strategy="epoch",  #evaluate once per epoch (one row per epoch in the results table)
    save_strategy="epoch",  #save on the same schedule as evaluation
    # In the recorded run the final evaluation loss equals the epoch-1 validation loss,
    # the lower of the two epochs, so the epoch-1 weights appear to be the ones kept.
    load_best_model_at_end=True,
)



#Now we will create the trainer
# The "test" split doubles as the evaluation set, so the per-epoch validation
# numbers and the final evaluation are both computed on it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    compute_metrics=compute_metrics,
)

#Finally start the training
# The recorded run took about 442 seconds (train_runtime) for 558 steps on the Tesla T4.

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.033241,0.991928
2,0.044800,0.03895,0.993722


TrainOutput(global_step=558, training_loss=0.04231592275763071, metrics={'train_runtime': 442.4631, 'train_samples_per_second': 20.146, 'train_steps_per_second': 1.261, 'total_flos': 1180814391619584.0, 'train_loss': 0.04231592275763071, 'epoch': 2.0})

Now that the model has been fine-tuned on our dataset,
we will continue with evaluation and prediction.


In [16]:
from transformers import pipeline

#First we have to get the final evaluation metrics
# trainer.evaluate() scores the Trainer's eval_dataset, which was set to the
# held-out "test" split, so these are test-set metrics (not training-set ones).
print("Evaluating the model on the test set")
eval_results = trainer.evaluate()
print(eval_results)


#Now we will use the fine tuned model for predictions
print("Using the model for predictions")

#What we will do is create a text_classification pipeline using the fine-tuned model and the tokenizer
spam_classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=model.device,  #run on the device the model already lives on (GPU when available, otherwise CPU)
)

#Testing it with custom sentences
custom_texts = [
    "Congratulations! You've won a free ticket to the Bahamas. Click here to claim.",
    "Hey, are we still on for lunch tomorrow?",
    "URGENT: Your account has been compromised. Please verify your details by clicking this link.",
    "Wassup bro! How have you been?",
    "YURRRRR WASS GOODY MY BOI",
    "We have approved your loan of 45000$, Call us to cash it out"
]

predictions = spam_classifier(custom_texts)


#Printing the results
# Each prediction is a dict with a 'label' and a 'score'. The model was loaded
# without custom label names, so class 1 (spam) is reported as 'LABEL_1'.
for text, prediction in zip(custom_texts, predictions):
    label = "Spam" if prediction['label'] == 'LABEL_1' else "Ham"
    score = prediction['score']
    print(f"\nText: '{text}'")
    print(f"Predicted Label: {label} (Confidence: {score:.4f})")


Evaluating the model on the training set


Device set to use cuda:0


{'eval_loss': 0.033241163939237595, 'eval_accuracy': 0.9919282511210762, 'eval_runtime': 14.8679, 'eval_samples_per_second': 74.994, 'eval_steps_per_second': 9.416, 'epoch': 2.0}
Using the model for predictions

Text: 'Congratulations! You've won a free ticket to the Bahamas. Click here to claim.'
Predicted Label: Spam (Confidence: 0.8790)

Text: 'Hey, are we still on for lunch tomorrow?'
Predicted Label: Ham (Confidence: 0.9963)

Text: 'URGENT: Your account has been compromised. Please verify your details by clicking this link.'
Predicted Label: Spam (Confidence: 0.5784)

Text: 'Wassup bro! How have you been?'
Predicted Label: Ham (Confidence: 0.9983)

Text: 'YURRRRR WASS GOODY MY BOI'
Predicted Label: Ham (Confidence: 0.9988)

Text: 'We have approved your loan of 45000$, Call us to cash it out'
Predicted Label: Spam (Confidence: 0.6563)
