In [2]:
from google.colab import files
uploaded = files.upload()

Saving SMSSpamCollection to SMSSpamCollection


In [3]:
import pandas as pd

In [4]:
import pandas as pd

# Load tab-separated file
df = pd.read_csv("SMSSpamCollection",
                 sep="\t",
                 header=None,
                 names=["label", "message"])

# Save as CSV (optional)
df.to_csv("sms.csv", index=False)

df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [7]:
import numpy as np


In [14]:
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [15]:
X = df["message"]
y = df["label"].map({"ham": 0, "spam": 1})

kfold = 5
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

In [16]:
# Base Models
nb = MultinomialNB()
lr = LogisticRegression(max_iter=500)
svm = SVC(kernel='linear', probability=True)  # FIXED

# Voting
voting_hard = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('svm', svm)],
    voting='hard'
)

voting_soft = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('svm', svm)],
    voting='soft'
)

# Stacking
stacking = StackingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('svm', svm)],
    final_estimator=LogisticRegression()
)

# AdaBoost with Decision Stumps
stump = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(
    estimator=stump,
    n_estimators=100,
    learning_rate=1.0
)

In [17]:
models = {
    "NaiveBayes": nb,
    "LogisticRegression": lr,
    "LinearSVM": svm,
    "VotingHard": voting_hard,
    "VotingSoft": voting_soft,
    "Stacking": stacking,
    "AdaBoost_Stumps": adaboost
}

In [19]:
results = []

print("=== MODEL COMPARISON (5-Fold CV) ===\n")

for name, model in models.items():

    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', model)
    ])

    # If model supports probabilities → use ROC-AUC
    if name == "VotingHard":
        scoring = ['precision', 'recall', 'f1']
    else:
        scoring = ['precision', 'recall', 'f1', 'roc_auc']

    cv_results = cross_validate(pipe, X, y, cv=skf, scoring=scoring)

    print(name)
    print(f"Precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
    print(f"Recall: {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
    print(f"F1: {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")

    if name != "VotingHard":
        print(f"ROC-AUC: {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
        roc = cv_results['test_roc_auc'].mean()
    else:
        roc = None

    print("-"*50)

    results.append([
        name,
        cv_results['test_precision'].mean(),
        cv_results['test_recall'].mean(),
        cv_results['test_f1'].mean(),
        roc
    ])

=== MODEL COMPARISON (5-Fold CV) ===

NaiveBayes
Precision: 0.9983 ± 0.0034
Recall: 0.7818 ± 0.0205
F1: 0.8767 ± 0.0131
ROC-AUC: 0.9878 ± 0.0038
--------------------------------------------------
LogisticRegression
Precision: 0.9855 ± 0.0090
Recall: 0.7256 ± 0.0153
F1: 0.8357 ± 0.0107
ROC-AUC: 0.9909 ± 0.0053
--------------------------------------------------
LinearSVM
Precision: 0.9784 ± 0.0087
Recall: 0.9036 ± 0.0197
F1: 0.9394 ± 0.0094
ROC-AUC: 0.9921 ± 0.0041
--------------------------------------------------
VotingHard
Precision: 0.9872 ± 0.0082
Recall: 0.8286 ± 0.0057
F1: 0.9010 ± 0.0065
--------------------------------------------------
VotingSoft
Precision: 0.9850 ± 0.0069
Recall: 0.8862 ± 0.0238
F1: 0.9329 ± 0.0153
ROC-AUC: 0.9924 ± 0.0045
--------------------------------------------------
Stacking
Precision: 0.9707 ± 0.0079
Recall: 0.9290 ± 0.0178
F1: 0.9493 ± 0.0100
ROC-AUC: 0.9920 ± 0.0041
--------------------------------------------------
AdaBoost_Stumps
Precision: 0.9547 

In [20]:
comparison_df = pd.DataFrame(results,
                             columns=["Model", "Precision", "Recall", "F1", "ROC-AUC"])

comparison_df.to_csv("ensemble_comparison.csv", index=False)
comparison_df

Unnamed: 0,Model,Precision,Recall,F1,ROC-AUC
0,NaiveBayes,0.998291,0.781754,0.87671,0.987838
1,LogisticRegression,0.98551,0.725566,0.835686,0.990921
2,LinearSVM,0.978409,0.903624,0.939358,0.992053
3,VotingHard,0.987238,0.828635,0.901005,
4,VotingSoft,0.98504,0.886183,0.93287,0.992396
5,Stacking,0.970681,0.929038,0.9493,0.991986
6,AdaBoost_Stumps,0.954682,0.427096,0.589898,0.926984


In [21]:
best_model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', stacking)
])

predictions = cross_val_predict(best_model, X, y, cv=skf)

cm = confusion_matrix(y, predictions)

print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[4804   21]
 [  54  693]]


In [22]:
best_model.fit(X, y)

preds = best_model.predict(X)
probs = best_model.predict_proba(X)[:, 1]

final_df = pd.DataFrame({
    "MessageId": np.arange(len(X)),
    "Actual": y,
    "Predicted": preds,
    "Probability": probs
})

final_df.to_csv("final_model_predictions.csv", index=False)
final_df.head()

Unnamed: 0,MessageId,Actual,Predicted,Probability
0,0,0,0,0.008129
1,1,0,0,0.007713
2,2,1,1,0.999766
3,3,0,0,0.007721
4,4,0,0,0.007741


In [23]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y, preds)
print("Accuracy:", accuracy)

Accuracy: 0.997307968413496


In [27]:
comparison = pd.DataFrame({
    "Message": X,
    "Actual": y,
    "Predicted": preds
})

comparison.head(1000)

Unnamed: 0,Message,Actual,Predicted
0,"Go until jurong point, crazy.. Available only ...",0,0
1,Ok lar... Joking wif u oni...,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,1
3,U dun say so early hor... U c already then say...,0,0
4,"Nah I don't think he goes to usf, he lives aro...",0,0
...,...,...,...
995,"I can't, I don't have her number!",0,0
996,Change again... It's e one next to escalator...,0,0
997,Yetunde i'm in class can you not run water on ...,0,0
998,Not a lot has happened here. Feels very quiet....,0,0


In [28]:
files.download("ensemble_comparison.csv")
files.download("final_model_predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>