<a href="https://colab.research.google.com/github/KudratBatta/SMS-Spam-Detection/blob/main/SMS_SpamDetect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import joblib

In [16]:
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [17]:
df = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["label", "message"])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    df["message"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [19]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [20]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC()
}

summary = []

In [21]:
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted = report["weighted avg"]
    summary.append({
        "Model": name,
        "Precision": round(weighted["precision"], 4),
        "Recall": round(weighted["recall"], 4),
        "F1-Score": round(weighted["f1-score"], 4)
    })

In [22]:
results_df = pd.DataFrame(summary).sort_values(by="F1-Score", ascending=False).reset_index(drop=True)
print("📊 Model Performance:\n")
print(results_df)

📊 Model Performance:

                 Model  Precision  Recall  F1-Score
0                  SVM     0.9840  0.9839    0.9835
1        Random Forest     0.9756  0.9749    0.9738
2          Naive Bayes     0.9714  0.9704    0.9688
3  Logistic Regression     0.9689  0.9677    0.9658


In [23]:
best_model = models["SVM"]
joblib.dump(best_model, "svm_spam_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
print("\n✅ SVM model and TF-IDF vectorizer saved.")


✅ SVM model and TF-IDF vectorizer saved.


In [24]:
import joblib

model = joblib.load("svm_spam_model.joblib")
vectorizer = joblib.load("tfidf_vectorizer.joblib")

msg = ["You've won a free iPhone! Click here to claim."]
msg_vec = vectorizer.transform(msg)
prediction = model.predict(msg_vec)

print("Spam" if prediction[0] == 1 else "Ham")

Spam
