In [19]:
!pip install requests




In [18]:
import os
import re
import string

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Function to clean text
def clean_text(text):
    """Normalize a message string before it is vectorized.

    Steps, in order: lowercase, delete runs of digits, delete ASCII
    punctuation (characters in ``string.punctuation``), then collapse any
    run of whitespace to a single space and trim both ends.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message string.
    """
    # Deletion-only translation table: every punctuation character maps to nothing
    drop_punctuation = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    digits_removed = re.sub(r'\d+', '', lowered)
    punctuation_removed = digits_removed.translate(drop_punctuation)
    # Removing digits/punctuation can leave gaps, so squeeze whitespace last
    return re.sub(r'\s+', ' ', punctuation_removed).strip()

# Load dataset
# The CSV location can be overridden with the MAIL_DATA_CSV environment variable so
# the notebook is runnable on other machines; the default is the original local path.
DATA_PATH = os.environ.get('MAIL_DATA_CSV', r'C:\Users\MSI GL62M\Downloads\mail_data.csv')
df = pd.read_csv(DATA_PATH)
# Replace missing cells with empty strings so every message can be passed to clean_text
df = df.where(pd.notnull(df), '')

# Convert labels (spam → 1, ham → 0)
df.loc[df['Category'] == 'spam', 'Category'] = 1
df.loc[df['Category'] == 'ham', 'Category'] = 0
# Cast to int; this raises if any label is neither 'spam'/'ham' nor integer-like
df['Category'] = df['Category'].astype(int)

# Apply text cleaning
df['Message'] = df['Message'].apply(clean_text)

# Separate features & labels
X = df['Message']
y = df['Category']

# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible across runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=3)
X_train, X_test, y_train, y_test = split_parts

# TF-IDF vectorization (Improved)
# min_df / max_df prune very rare and very common terms, English stop words are
# dropped, and both unigrams and bigrams are used as features
tfidf_options = dict(
    min_df=5,
    max_df=0.8,
    stop_words='english',
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary on the training messages only, then reuse it for the test set
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Fit a logistic regression with balanced class weights on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression(class_weight='balanced').fit(X_train_features, y_train)

# Predict labels for the held-out messages
y_pred = model.predict(X_test_features)

# Report overall accuracy, followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
report = classification_report(y_test, y_pred)
print(report)

# Save the model & vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Load Model & Test a New Message
# Reloading from disk exercises the saved artifacts rather than the in-memory objects
model = joblib.load('spam_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

new_message = ['FreeMsg Hey there darling, it’s been 3 weeks now and no word back!']
# Apply the same cleaning that was applied to the training messages; without it the
# text reaching the vectorizer differs from what the model was trained on
# (digits and punctuation would be handled differently).
new_message_clean = [clean_text(message) for message in new_message]
new_message_features = vectorizer.transform(new_message_clean)

# Column 1 of predict_proba is the probability of label 1, i.e. spam
prob = model.predict_proba(new_message_features)[0][1]  # Spam probability

if prob > 0.5:
    print(f"The message is SPAM with {prob*100:.2f}% confidence.")
else:
    print(f"The message is NOT spam with {100 - prob*100:.2f}% confidence.")

Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       960
           1       0.94      0.92      0.93       155

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

The message is SPAM with 62.99% confidence.


In [21]:
import pickle

# Write the trained model and the TfidfVectorizer to disk as plain pickle files.
# The dict preserves insertion order, so the model is written first, then the vectorizer.
artifacts = {
    "spam_model.pkl": model,
    "vectorizer.pkl": vectorizer,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
