In [2]:
import pandas as pd
import re

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score



In [3]:
#Connect to PostgreSQL
engine = create_engine(
    "postgresql+psycopg2://postgres:#3005Harsha@localhost:5433/spam_db"
)


In [4]:
#Load Data from Database
query = """
SELECT message, category
FROM emails
"""
df = pd.read_sql(query, engine)

print(df.head())
print(df['category'].value_counts())


                                             message category
0  Go until jurong point, crazy.. Available only ...      ham
1                      Ok lar... Joking wif u oni...      ham
2  Free entry in 2 a wkly comp to win FA Cup fina...     spam
3  U dun say so early hor... U c already then say...      ham
4  Nah I don't think he goes to usf, he lives aro...      ham
category
ham     4825
spam     747
Name: count, dtype: int64


In [5]:
#Clean and Validate Data
df['category'] = df['category'].str.strip().str.lower()
df = df[df['category'].isin(['spam', 'ham'])]
df = df.dropna(subset=['message'])

df = df.reset_index(drop=True)


In [6]:
#Text cleaning Function
df['category'] = df['category'].str.strip().str.lower()
df = df[df['category'].isin(['spam', 'ham'])]
df = df.dropna(subset=['message'])

df = df.reset_index(drop=True)


In [7]:
#Feature Engineering(TF-IDF)
tfidf = TfidfVectorizer(max_features=3000)

X = tfidf.fit_transform(df['clean_message'])
y = df['category'].map({'ham': 0, 'spam': 1})


KeyError: 'clean_message'

In [8]:
print(df.columns)


Index(['message', 'category'], dtype='object')


In [9]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text

df['clean_message'] = df['message'].apply(clean_text)


In [10]:
print(df['clean_message'].isna().sum())


0


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)

X = tfidf.fit_transform(df['clean_message'])
y = df['category'].map({'ham': 0, 'spam': 1})


In [12]:
print("NaNs in y:", y.isna().sum())


NaNs in y: 0


In [13]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [14]:
#Train Model (Navies Bayes)
nb = MultinomialNB()
nb.fit(X_train, y_train)

pred_nb = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, pred_nb))


Naive Bayes Accuracy: 0.9829596412556054


In [15]:
#Logestic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))


Logistic Regression Accuracy: 0.9739910313901345


In [16]:
#SVM (Best for text)
svm = LinearSVC()
svm.fit(X_train, y_train)

pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))


SVM Accuracy: 0.9883408071748879


In [17]:
#Model Evalu8ation
print("\nSVM Classification Report")
print(classification_report(y_test, pred_svm))



SVM Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       968
           1       0.99      0.93      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [18]:
#Train Final Model
from sklearn.svm import LinearSVC

final_model = LinearSVC()
final_model.fit(X_train, y_train)


In [19]:
#Save Model and TF-IDF Vectorizer
import joblib

joblib.dump(final_model, "spam_classifier_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("Model and Vectorizer saved successfully!")


Model and Vectorizer saved successfully!


In [20]:
#Load Model and Test Prediction
loaded_model = joblib.load("spam_classifier_model.pkl")
loaded_tfidf = joblib.load("tfidf_vectorizer.pkl")


In [21]:
#Testing with new email
sample_email = ["Congratulations! You won a free lottery ticket. Click now"]

sample_clean = [clean_text(sample_email[0])]
sample_vector = loaded_tfidf.transform(sample_clean)

prediction = loaded_model.predict(sample_vector)

print("Prediction:", "SPAM" if prediction[0] == 1 else "HAM")


Prediction: SPAM
