In [None]:
# 📦 Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [None]:
# 🔹 Step 2: EDA (Exploratory Data Analysis)
# NOTE(review): `df` is not created in any cell visible above (there is no
# "Step 1" loading cell) — confirm it is loaded before this cell runs.
# The statements below expect it to have 'message' and 'label' columns.
df['length'] = df['message'].map(len)      # len() of every message
label_counts = df['label'].value_counts()  # number of rows per label value

In [None]:
# Per-class text corpora: every message of a class joined with single spaces.
# These strings are the input for the word clouds drawn in a later cell.
is_spam = df['label'] == 1
is_ham = df['label'] == 0
spam_words = ' '.join(df.loc[is_spam, 'message'])
ham_words = ' '.join(df.loc[is_ham, 'message'])

In [None]:
# Class balance (left) and message-length histogram per class (right)
fig, (ax_counts, ax_lengths) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: one bar per label value.
# The tick relabelling assumes the bars sit at x=0 and x=1 in the order
# label 0 (ham), label 1 (spam) — TODO confirm against the rendered figure.
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax_counts)
ax_counts.set_xticks([0, 1])
ax_counts.set_xticklabels(['Ham', 'Spam'])
ax_counts.set_title("Class Distribution")

# Right panel: distribution of the 'length' column, coloured by label.
sns.histplot(data=df, x='length', hue='label', bins=50, palette='husl', ax=ax_lengths)
ax_lengths.set_title("Message Length Distribution")

fig.tight_layout()
plt.show()

In [None]:
# Word clouds: spam on the left, ham on the right
cloud_options = dict(width=600, height=400, background_color='white')
wc1 = WordCloud(**cloud_options).generate(spam_words)
wc2 = WordCloud(**cloud_options).generate(ham_words)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = ((wc1, "Spam Word Cloud"), (wc2, "Ham Word Cloud"))
for ax, (cloud, title) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # the image has no meaningful axes
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# 🔹 Step 3: Preprocessing
# Built once here instead of on every clean_text() call.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one raw message before vectorisation.

    Steps, in order: lowercase the text, delete every run of digits, then
    delete every character listed in ``string.punctuation``. Whitespace is
    left untouched, so removing a token can leave consecutive spaces behind.

    Args:
        text: The message to clean. ``None`` and float NaN are treated as a
            missing message; any other non-string value is converted with
            ``str()`` first.

    Returns:
        str: The cleaned message, or ``''`` for a missing message.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing an extra import just for the missing-value test.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    cleaned = str(text).lower()
    cleaned = _DIGITS_RE.sub('', cleaned)
    return cleaned.translate(_PUNCT_TABLE)

# Raw text stays in 'message'; the normalised copy is stored in 'clean_message'.
df['clean_message'] = df['message'].map(clean_text)

In [None]:
# 🔹 Step 4: Train-Test Split
# 70/30 hold-out split with a fixed seed so the partition is reproducible.
# stratify keeps the label proportions identical in both partitions, so the
# test metrics are not skewed by an unlucky draw of the (presumably rarer)
# spam class.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_message'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# 🔹 Step 5: Vectorization
# TF-IDF features, with scikit-learn's built-in English stop-word list removed.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit on the training split only, so the vocabulary and IDF weights are
# learned without seeing any test message.
X_train_vec = vectorizer.fit_transform(X_train)
# Re-use the already fitted vectorizer for the test split (transform, no refit).
X_test_vec = vectorizer.transform(X_test)

In [None]:
# 🔹 Step 6: Try Multiple Models
def evaluate_model(model, name):
    """Fit one candidate classifier and report its hold-out accuracy.

    Reads the notebook-level variables ``X_train_vec``, ``y_train``,
    ``X_test_vec`` and ``y_test``, so the split and vectorization cells must
    have been run first.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        name: Label used in the printed line and in the returned tuple.

    Returns:
        tuple: ``(name, accuracy, model)`` where ``model`` is the same object
        that was passed in, now fitted.
    """
    model.fit(X_train_vec, y_train)
    acc = accuracy_score(y_test, model.predict(X_test_vec))
    print(f"{name} Accuracy: {acc:.4f}")
    return (name, acc, model)

# Candidate classifiers, each paired with the display name used in the output.
# LinearSVC and RandomForestClassifier draw random numbers while fitting, so
# they are seeded (same seed as the train/test split) to keep the accuracies
# and the resulting "best model" choice reproducible across kernel restarts.
candidates = [
    (MultinomialNB(), "MultinomialNB"),
    (LogisticRegression(max_iter=1000), "Logistic Regression"),
    (LinearSVC(random_state=42), "Linear SVM"),
    (RandomForestClassifier(random_state=42), "Random Forest"),
]

# One (name, accuracy, fitted_model) tuple per candidate, in the order above.
results = [evaluate_model(model, name) for model, name in candidates]

In [None]:
# 🔹 Step 7: Select Best Model
# Every entry of `results` is a (name, accuracy, fitted_model) tuple; keep the
# one with the highest accuracy (on ties, max() returns the earliest entry).
# NOTE(review): the winner is picked on the same test split that is used for
# the report in the next step, so that report is optimistic — consider
# selecting on a validation split or with cross-validation.
best_name, best_acc, best_model = best_model_info = max(
    results, key=lambda entry: entry[1]
)
print(f"\n✅ Best Model: {best_name} with Accuracy: {best_acc:.4f}")

In [None]:
# 🔹 Step 8: Final Classification Report
# Per-class precision / recall / F1 of the selected model on the test split.
final_preds = best_model.predict(X_test_vec)
print("\nClassification Report:\n")
# target_names are matched to the sorted label values, i.e. this assumes
# label 0 is ham and label 1 is spam.
report = classification_report(y_test, final_preds, target_names=['ham', 'spam'])
print(report)

In [None]:
# 🔹 Step 9: Build Final Pipeline on Full Data
# Refit TF-IDF + classifier on every row (train and test together) to get the
# model that will be persisted.
# NOTE(review): the classifier is hard-coded to LinearSVC. If the selection
# step reports a different best model, swap it in here by hand (and consider
# renaming the 'svm' step and the output file accordingly).
# NOTE(review): clean_text() is not part of this pipeline — it is fitted on the
# already-cleaned column, so raw messages must be passed through clean_text()
# before calling final_pipeline.predict().
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # Seeded so that re-running the notebook rebuilds the same model.
    ('svm', LinearSVC(random_state=42)),
])
final_pipeline.fit(df['clean_message'], df['label'])

In [None]:
# 🔹 Step 10: Save the Final Model
# Serialise the fitted pipeline with joblib; the relative path means the file
# is written to the current working directory.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(final_pipeline, "sms_spam_classifier_svm.pkl")
# The filename is repeated by hand in the message; keep both strings in sync.
print("\n✅ Final model saved as 'sms_spam_classifier_svm.pkl'")