In [3]:
# === 1. Import Required Libraries ===
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords

# === 2. Download NLTK stopwords (run once) ===
nltk.download('stopwords')

# === 3. Load Dataset ===
df = pd.read_csv(r"C:\Users\HP440G4\Downloads\spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']  # Rename for clarity
df['label'] = df['label'].map({'ham': 0, 'spam': 1})  # Convert labels

# === 4. Text Cleaning Function ===
def clean_text(text):
    text = text.lower()  # Lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = text.split()  # Tokenize
    stop_words = set(stopwords.words('english'))  # Load stopwords
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

# === 5. TF-IDF Vectorization ===
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

# === 6. Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === 7. Train Naive Bayes ===
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

# === 8. Train SVM ===
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

# === 9. Evaluate Models ===
print("🔍 Naive Bayes Performance:")
print("Accuracy :", accuracy_score(y_test, nb_preds))
print("Precision:", precision_score(y_test, nb_preds))
print("Recall   :", recall_score(y_test, nb_preds))

print("\n🔍 SVM Performance:")
print("Accuracy :", accuracy_score(y_test, svm_preds))
print("Precision:", precision_score(y_test, svm_preds))
print("Recall   :", recall_score(y_test, svm_preds))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP440G4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔍 Naive Bayes Performance:
Accuracy : 0.9659192825112107
Precision: 1.0
Recall   : 0.7466666666666667

🔍 SVM Performance:
Accuracy : 0.9757847533632287
Precision: 0.9767441860465116
Recall   : 0.84


In [5]:
import joblib

# Save model and vectorizer (you MUST run this)
joblib.dump(svm_model, 'spam_classifier_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [7]:
import joblib

# Replace with your actual trained model variable
joblib.dump(svm_model, 'spam_classifier_model.pkl')  # ✅ Save model


['spam_classifier_model.pkl']

In [9]:
import joblib

# Assuming your trained model is stored in svm_model
joblib.dump(svm_model, 'spam_classifier_model.pkl')

# Assuming your vectorizer is stored in tfidf
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

print("✅ Model and vectorizer saved!")


✅ Model and vectorizer saved!


In [11]:
import os
os.getcwd()


'C:\\Users\\HP440G4'

In [13]:
import joblib

# Save directly into your spam project folder
joblib.dump(svm_model, r'C:\Users\HP440G4\Desktop\spam\spam_classifier_model.pkl')
joblib.dump(tfidf, r'C:\Users\HP440G4\Desktop\spam\tfidf_vectorizer.pkl')

print("✅ Model and vectorizer saved to Desktop\\spam")


✅ Model and vectorizer saved to Desktop\spam
