In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset
dataset_path = r'C:\email.csv.csv'  # Update this path with the exact path to your dataset
try:
    data = pd.read_csv(dataset_path)
    print("Dataset loaded successfully.")
    print(data.head())  # Print first few rows to confirm
except FileNotFoundError:
    print(f"File not found. Please check the path: {dataset_path}")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# Check for null values and handle them if necessary
if data['message'].isnull().sum() > 0:
    data = data.dropna(subset=['message'])

# Create 'label' column if it doesn't exist
if 'label' not in data.columns:
    # Placeholder: Define how to determine if a message is spam
    # Here, I'm assuming a simple rule-based approach for demonstration purposes.
    # You'll need to replace this with the actual criteria for your dataset.
    data['label'] = data['message'].apply(lambda x: 'spam' if 'spam' in x else 'ham')

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)  # Remove special characters
    text = text.lower()  # Convert to lower case
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Apply preprocessing
data['message'] = data['message'].apply(preprocess_text)

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5)
X = tfidf_vectorizer.fit_transform(data['message'])
y = data['label']

# Encode the labels if they are not already numerical
y = y.map({'spam': 1, 'ham': 0})  # 'ham' represents non-spam

# Use SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model selection and training
model = MultinomialNB()
model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Dataset loaded successfully.
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...
Accuracy: 0.9955464431990174
Precision: 0.9914956518400492
Recall: 0.9996612826741249
F1-Score: 0.9955617238438074
