In [2]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import re  # regular expressions for text cleaning
import os  # operating system utilities (paths, env, etc.)

In [3]:
# Read the raw CSV (decoded as latin-1) and, in the same expression, give the
# generically named v1/v2 columns descriptive names
df = pd.read_csv("../data/spam.csv", sep=",", encoding="latin-1").rename(
    columns={"v1": "label", "v2": "message"}
)

# Column names, non-null counts, and dtypes at a glance
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# Peek at the first five rows (the head() default) to confirm the file parsed as expected
df.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Count missing values per column (the three "Unnamed" columns are almost entirely empty)
df.isnull().sum()

label            0
message          0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

## Identify and Remove Duplicates

In [6]:
# Two views of duplication:
# - `repeat_mask` flags only the later copies (exactly the rows that get dropped)
# - `dup_group_mask` flags every member of a duplicated group, first copy included
repeat_mask = df.duplicated()
dup_group_mask = df.duplicated(keep=False)

print("Number of duplicate rows:", repeat_mask.sum())
print("Duplicate rows (if any):")
print(df[dup_group_mask])

# Drop the repeats (first occurrence is kept) by rebinding rather than mutating in place
df_before = len(df)
df = df.drop_duplicates()
df_after = len(df)

print(f"\nRows removed: {df_before - df_after}")
print(f"Dataset shape after removing duplicates: {df.shape}")

Number of duplicate rows: 403
Duplicate rows (if any):
     label                                            message Unnamed: 2  \
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
7      ham  As per your request 'Melle Melle (Oru Minnamin...        NaN   
8     spam  WINNER!! As a valued network customer you have...        NaN   
9     spam  Had your mobile 11 months or more? U R entitle...        NaN   
11    spam  SIX chances to win CASH! From 100 to 20,000 po...        NaN   
...    ...                                                ...        ...   
5524  spam  You are awarded a SiPix Digital Camera! call 0...        NaN   
5535   ham  I know you are thinkin malaria. But relax, chi...        NaN   
5539   ham                         Just sleeping..and surfing        NaN   
5553   ham                        Hahaha..use your brain dear        NaN   
5558   ham                             Sorry, I'll call later        NaN   

     Unnamed: 3 Unnamed: 4  
2  

## Data Type Conversion

In [9]:
# Report the dtypes as loaded
print("Current Data Types:")
print(df.dtypes)

# Cast both columns in one call:
#   label   -> categorical (spam/ham)
#   message -> str
df = df.astype({"label": "category", "message": str})

# Drop the auto-named "Unnamed: N" columns; they carry no header in the CSV
# and are not used as features
columns_to_drop = [name for name in df.columns if name.startswith("Unnamed")]
if columns_to_drop:
    df = df.drop(columns=columns_to_drop)
    print(f"Dropped columns: {columns_to_drop}")

print("\nUpdated Data Types:")
print(df.dtypes)
print("\nLabel value counts:")
print(df["label"].value_counts())

Current Data Types:
label         object
message       object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object
Dropped columns: ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

Updated Data Types:
label      category
message      object
dtype: object

Label value counts:
label
ham     4516
spam     653
Name: count, dtype: int64


In [10]:
# Preview the frame after the dtype conversion and column drop
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Text Data Cleaning

In [11]:
# Function to clean text
def clean_text(
    text,
    lowercase=False,
    remove_urls=False,
    remove_html=False,
    remove_emails=False,
    remove_punct=False,
) -> str:
    """
    Normalize a message for downstream feature extraction.

    By default the ONLY transformation is whitespace normalization: every run
    of whitespace (spaces, tabs, newlines) collapses to a single space and
    leading/trailing whitespace is stripped. Case and punctuation are kept,
    so features derived from the cleaned text (e.g. a punctuation count)
    remain meaningful.

    The remaining steps are opt-in and, when enabled, run in this order
    before the whitespace pass:

    Args:
        text: Value to clean. Coerced with ``str()``, so non-string input
            does not raise.
        lowercase: Convert the text to lowercase.
        remove_urls: Remove tokens starting with ``http`` or ``www``.
        remove_html: Remove ``<...>`` tags.
        remove_emails: Remove whitespace-delimited tokens containing ``@``.
        remove_punct: Remove every character that is neither a word
            character nor whitespace (underscores are word characters and
            are therefore kept).

    Returns:
        The cleaned string.
    """
    text = str(text)

    if lowercase:
        text = text.lower()

    # URLs and e-mail addresses must go before punctuation stripping,
    # otherwise their "://", "." and "@" markers are already gone
    if remove_urls:
        text = re.sub(r"http\S+|www\S+", "", text)

    if remove_html:
        text = re.sub(r"<[^>]+>", "", text)

    if remove_emails:
        text = re.sub(r"\S+@\S+", "", text)

    if remove_punct:
        text = re.sub(r"[^\w\s]", "", text)

    # Always last: earlier removals can leave doubled or trailing spaces behind
    return re.sub(r"\s+", " ", text).strip()

In [12]:
# Run clean_text over every message and keep the result next to the original
# text in a new "message_cleaned" column
df = df.assign(message_cleaned=df["message"].map(clean_text))

# Show the first rows with the new column
df.head()

Unnamed: 0,label,message,message_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Feature: number of characters in each cleaned message
df = df.assign(length=df["message_cleaned"].str.len())

# Show the first rows with the new column
df.head()

Unnamed: 0,label,message,message_cleaned,length
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",61


In [14]:
# Feature: punctuation count, i.e. how many characters in the cleaned message
# are neither word characters nor whitespace
df = df.assign(punct=df["message_cleaned"].str.count(r"[^\w\s]"))

# Show the first rows with the new column
df.head()

Unnamed: 0,label,message,message_cleaned,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",61,2


In [15]:
# Row count of the working frame (after duplicate removal)
len(df)

5169

In [16]:
# List the distinct label values to confirm the target is binary (ham / spam)
df["label"].unique()

['ham', 'spam']
Categories (2, object): ['ham', 'spam']

In [17]:
# Class balance: rows per label (ham heavily outnumbers spam in this dataset)
df["label"].value_counts()

label
ham     4516
spam     653
Name: count, dtype: int64

In [18]:
# Utility for splitting data into train/test sets
from sklearn.model_selection import train_test_split

### The Target (y)
`label`: This is what you want to predict (Spam vs. Ham).

### The Features (X)
You have two types of features available: 
- Text
- Numerical Metadata

#### Option A: Text-Based (Recommended for NLP)
The most powerful predictor is the content of the message itself.

- Column: message_cleaned
- Note: You cannot feed this column directly into a machine learning model. You must first convert it to numbers using a **vectorizer** (like `TfidfVectorizer` or `CountVectorizer`) after splitting your data.

#### Option B: Metadata-Based (Weak alone, strong when combined)
- Columns: length, punct
- Note: These are already numbers, so they can be used directly, but they usually carry less information than the actual words in the message.

In [19]:
# Feature: the cleaned message text. Target: the spam/ham label.
X, y = df["message_cleaned"], df["label"]

# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible
TEST_SIZE = 0.33
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Sanity-check the partition sizes
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (3463,)
Testing Data Shape: (1706,)


### Vectorization

Machine learning models cannot understand raw text strings like `"Call me back later"`. You must convert these text messages into numerical vectors.

Common techniques include:
1. **CountVectorizer (Bag of Words)**: simply counts word frequencies.
2. **TfidfVectorizer (TF-IDF)**: weighs words by importance (lowers the weight of common words like "the", while distinctive words get a higher weight).
    - **TF (Term Frequency)**: How often a word appears in a specific document. If a word appears many times in a document, it is likely important to that document.
    - **IDF (Inverse Document Frequency)**: How unique a word is across all documents. Words that appear in almost every document (like "the", "is", "and") are not very useful for distinguishing between documents. IDF downweights these common words and upweights rare words.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Vocabulary and IDF weights are learned from the training split alone; the
# test split is only transformed, so nothing about it leaks into the fit
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# (number of training messages, vocabulary size)
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")

X_train_tfidf shape: (3463, 6889)


**Note**
1. `fit_transform(X_train)`:
   - **Fits:** Learns the vocabulary and IDF weights from the training data.
   - **Transforms:** Converts the training data into numbers based on what it learned.
   - Usage: Only on the Training Set.

2. `transform(X_test)`:
   - **Transforms:** Converts the test data into numbers using the already learned vocabulary and weights from the training set.
   - Usage: On the Test Set (and any future real-world data).

### Model Training

Now that your text is converted into numbers (Example: `X_train_tfidf`), you can feed it into a machine learning classifier.

A great starting model for text classification is `LinearSVC` (Linear Support Vector Classifier) because it is fast and handles sparse text data very well.

**Practical Note:**

While modern approaches may use neural networks or transformer-based models for state-of-the-art results, **LinearSVC + TF-IDF** remains a strong, simple, and interpretable baseline—often achieving >95% accuracy on standard spam datasets (like SMS Spam Collection or Enron email).

In [21]:
from sklearn.svm import LinearSVC

# Create the classifier and fit it on the TF-IDF training matrix in one step
# (fit() returns the estimator itself)
clf = LinearSVC().fit(X_train_tfidf, y_train)

# Label every held-out message
predictions = clf.predict(X_test_tfidf)

# Peek at the first five predicted labels
print(list(predictions[:5]))

['ham', 'ham', 'ham', 'ham', 'ham']


>Alternative approach: instead of running TF-IDF vectorization and LinearSVC as separate steps, you can combine them into a single pipeline using `sklearn.pipeline.Pipeline`. This allows you to streamline the process of vectorization and model training into one cohesive step.
    

In [None]:
# Why a Pipeline? Chaining vectorization -> model into a single estimator:
# - rules out accidentally calling fit_transform on the test set (data leakage)
# - shortens the training/prediction code
# - makes cross-validation and hyperparameter tuning much easier
# TfidfVectorizer and LinearSVC are the classes imported in the earlier cells.

from sklearn.pipeline import Pipeline

# One end-to-end model: raw text -> TF-IDF -> LinearSVC
text_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)

# Fitting the pipeline fits TF-IDF on X_train, then the classifier on its output
text_clf.fit(X_train, y_train)

# Predicting transforms X_test with the already-fitted TF-IDF, then classifies
predictions = text_clf.predict(X_test)

print(list(predictions[:5]))

['ham', 'ham', 'ham', 'ham', 'ham']


In [None]:
# The fitted pipeline accepts raw strings: it applies TF-IDF vectorization
# itself and then runs LinearSVC, so no manual preprocessing is needed here.
sample_messages = [
    "Hello, How are you? Congratulations! You have won a free ticket. Call immidiately."
]
text_clf.predict(sample_messages)

array(['spam'], dtype=object)

### Model Evaluation

You need to measure how well your model performed by comparing your `predictions` against the actual correct answers (`y_test`).

Key metrics to check:
- **Accuracy**: Overall percentage of correct predictions.
- **Confusion Matrix**: Shows false positives vs false negatives (crucial for spam filters).
- **Classification Report**: Detailed precision, recall, and F1-scores.

In [22]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Single source of truth for class order: used both to compute the matrix and
# to label its rows/columns, so labels and counts cannot drift apart
class_labels = ["ham", "spam"]

# 1. Overall accuracy
print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")

# 2. Confusion matrix (rows = actual class, columns = predicted class).
#    Kept in its own variable so the dataset held in `df` is not overwritten
#    and earlier cells remain re-runnable after this one.
print("\nConfusion Matrix:")
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
print(confusion_df)

# 3. Per-class precision, recall, and F1
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.98

Confusion Matrix:
       ham  spam
ham   1467     5
spam    26   208

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1472
        spam       0.98      0.89      0.93       234

    accuracy                           0.98      1706
   macro avg       0.98      0.94      0.96      1706
weighted avg       0.98      0.98      0.98      1706



**Results**

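- **Overall Accuracy (98%)**: The model is correct 98% of the time, which is outstanding. Keep in mind that the test set is imbalanced (1472 ham vs. 234 spam), so always predicting "ham" would already score about 86%; read accuracy together with the per-class metrics below.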
- **Precision for Spam (0.98)**: This is the most critical metric for a spam filter. It means when the model says "This is Spam", it is correct 98% of the time.
  - Real-world impact: Very few legitimate emails (Ham) are getting tossed into the junk folder. This is usually the priority for email providers (user trust).
- **Recall for Ham (1.00)**: It found almost every single safe message.
- **Recall for Spam (0.89)**: This is lower than the other metrics. It means the model missed 11% of the actual spam messages (26 of the 234 spam messages in the confusion matrix).
  - Real-world impact: Some spam will still sneak into the user's Inbox.


For a production spam filter, this is a safe and high-quality model. It prioritizes "not blocking real emails" (High Precision) over "catching absolutely every spam" (High Recall), which is exactly how most spam filters should behave.

If you wanted to catch more spam, you could tune the model, but you would risk accidentally blocking more real emails (lowering that 0.98 precision).