In [7]:
import numpy as np
import pandas as pd

In [8]:
df = pd.read_csv('spam.csv',  encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Drop the extra columns

In [9]:
df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Rename the column names

In [10]:
df = df.rename(columns={'v1': 'category', 'v2': 'text'})
df.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df['text'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

Preprocessing text columns

In [12]:
import re
import string
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess_text)

In [13]:
df['text'][0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

Encode the Labels

In [14]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])  # ham=0, spam=1

In [15]:
df.head()

Unnamed: 0,category,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final ...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


Split the Data

In [16]:
X = df['text']
y = df['category']

# Split the data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
X_train

184                             going on nothing greatbye
2171                          i wont so wats wit the guys
5422                      ok ksry i knw sivatats y i askd
4113    where are you what do you do how can you stand...
4588          have you not finished work yet or something
                              ...                        
1932                            jus finished avatar nigro
5316                             jus finish watching tv u
2308    moby pub quizwin a å£ high street prize if u k...
1903    free entry in a weekly comp for a chance to wi...
763     nothing but we jus tot u would ask cos u ba gu...
Name: text, Length: 4457, dtype: object

In [18]:
X_test

2826    oh right ok ill make sure that i do loads of w...
3695                          i am in tirupur call you da
3906               no that just means you have a fat head
575            you have won cash or a prize to claim call
2899     come aftr ltdecimalgt now i m cleaning the house
                              ...                        
854     stop the story ive told him ive returned it an...
5044    we have sent jd for customer service cum accou...
2015    just re read it and i have no shame but tell m...
3381    well i meant as opposed to my drunken night of...
785     she was supposed to be but couldnt make it she...
Name: text, Length: 1115, dtype: object

In [19]:
y_train

184     0
2171    0
5422    0
4113    0
4588    0
       ..
1932    0
5316    0
2308    1
1903    1
763     0
Name: category, Length: 4457, dtype: int64

In [20]:
y_test

2826    0
3695    0
3906    0
575     1
2899    0
       ..
854     0
5044    0
2015    0
3381    0
785     0
Name: category, Length: 1115, dtype: int64

Text Vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data, transform the test data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [22]:
print(X_train_vectorized)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 56427 stored elements and shape (4457, 7580)>
  Coords	Values
  (0, 2540)	0.37480030994277175
  (0, 4494)	0.2877554722580902
  (0, 4366)	0.4963964226811729
  (0, 2608)	0.7282253136232357
  (1, 7317)	0.4096139848007767
  (1, 5921)	0.26283010514422595
  (1, 7118)	0.5147580761919395
  (1, 7276)	0.5024814397892622
  (1, 6492)	0.19791696685220464
  (1, 2668)	0.45442504242175186
  (2, 4470)	0.24772381829046913
  (2, 3444)	0.5332607129444951
  (2, 3434)	0.4186524656374954
  (2, 5827)	0.5332607129444951
  (2, 381)	0.4411678217267143
  (3, 7211)	0.16417228452606042
  (3, 332)	0.11685868905435609
  (3, 7515)	0.3755081206943349
  (3, 7198)	0.13309874932526355
  (3, 1719)	0.2460929766314273
  (3, 2929)	0.13554022192674514
  (3, 938)	0.12191205110163555
  (3, 6083)	0.24444020222927526
  (3, 6622)	0.07196744743290548
  (3, 558)	0.12270201617164656
  :	:
  (4455, 1247)	0.2420201817121277
  (4455, 4871)	0.26176364818795933
  (4456, 2540)	0.

In [23]:
print(X_test_vectorized)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13241 stored elements and shape (1115, 7580)>
  Coords	Values
  (0, 235)	0.10141246072402145
  (0, 1514)	0.15131758893053981
  (0, 1719)	0.12990975555886194
  (0, 1823)	0.2774431146366586
  (0, 1843)	0.26972001118259764
  (0, 2573)	0.1434590235767829
  (0, 2785)	0.19291233293230578
  (0, 3029)	0.1438214401991649
  (0, 3172)	0.1035642426064149
  (0, 3186)	0.11771161894877617
  (0, 3643)	0.24572986065919214
  (0, 3808)	0.17789005660567295
  (0, 4218)	0.30143326516006413
  (0, 4439)	0.11515282690339022
  (0, 4463)	0.17125635691491078
  (0, 4470)	0.14002944074559426
  (0, 5213)	0.3587685316324726
  (0, 5392)	0.17887899448945754
  (0, 5744)	0.28739992671721587
  (0, 5746)	0.1804174909372733
  (0, 5921)	0.12378280681231202
  (0, 6285)	0.1861040969604679
  (0, 6481)	0.24379079634811543
  (0, 6492)	0.09321123110802597
  (0, 6628)	0.16273757540265507
  :	:
  (1112, 6585)	0.14584941684459693
  (1112, 7247)	0.12669210253201643
  (1113,

Stratified Cross-Validation

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import  StratifiedKFold


In [28]:
# Initialize all models for classification
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier()
}

In [32]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_model = None
best_score = 0

In [33]:
for name, model in models.items():
    scores = []
    for train_idx, val_idx in skf.split(X_train_vectorized, y_train):
        # Split data
        X_train_fold, X_val_fold = X_train_vectorized[train_idx], X_train_vectorized[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train and evaluate
        model.fit(X_train_fold, y_train_fold)
        score = model.score(X_val_fold, y_val_fold)
        scores.append(score)

    avg_score = np.mean(scores)
    print(f"{name} average score: {avg_score}")

    # Check if this model is the best
    if avg_score > best_score:
        best_score = avg_score
        best_model = model

Logistic Regression average score: 0.9596150551720661
SVM average score: 0.9759938529752606
Random Forest average score: 0.9746490653499269
Naive Bayes average score: 0.9517614752859375
K-Nearest Neighbors average score: 0.9609613543608605
Decision Tree average score: 0.956025091953444


Train the Final Model on the Entire Training Set

In [34]:
best_model.fit(X_train_vectorized, y_train)

Evaluate the Final Model on the Test Set

In [35]:
y_pred = best_model.predict(X_test_vectorized)

Print evaluation metrics

In [36]:
print("Final Model Performance on Test Set:")
print(classification_report(y_test, y_pred))

Final Model Performance on Test Set:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.87      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

