In [None]:
import pandas as pd
import re

# Load the dataset. The whole CSV is read here (no row sampling is applied).
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
file_path = r"D:\college\Sem2\DataSci ML\archive\emails.csv"
df = pd.read_csv(file_path)

# Check for missing data and duplicates.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
df.info()
print(df.head())

# Count rows whose 'message' text repeats an earlier row
duplicate_count = df.duplicated(subset=['message']).sum()
print(f"Number of duplicate emails: {duplicate_count}")


In [None]:
import re

# Function to extract email components
def extract_email_components(email):
    """Pull the sender, recipient, subject and body out of one raw email string.

    The text before the first blank line is treated as the header section and
    everything after it as the body. Header fields are matched only at the
    start of a header line, so look-alike fields such as ``X-From:`` /
    ``X-To:`` and ``From:`` / ``To:`` lines quoted inside the body are not
    mistaken for the real headers.

    Args:
        email: Raw message text. Non-string values (e.g. NaN) produce a
            result in which every field is None.

    Returns:
        dict with the keys "From", "To", "Subject" and "Body". Each value is
        the whitespace-stripped text, or None when the field was not found.
        Only the first physical line of a header value is captured.
    """
    if not isinstance(email, str):
        return {"From": None, "To": None, "Subject": None, "Body": None}

    # Body starts after the first blank line; everything before it is headers
    headers, separator, body = email.partition("\n\n")

    def header_value(pattern):
        # re.MULTILINE lets '^' anchor at the start of every header line
        match = re.search(pattern, headers, re.MULTILINE)
        return match.group(1).strip() if match else None

    return {
        "From": header_value(r'^From: (.+)'),
        "To": header_value(r'^To: (.+)'),
        "Subject": header_value(r'^Subject: (.*)'),
        # No blank line at all means there is no body section
        "Body": body.strip() if separator else None,
    }

# Apply function to all emails. The extracted frame reuses df's index so its
# rows line up with df even when df's index is not a plain 0..n-1 range.
email_components = df['message'].apply(extract_email_components)
email_df = pd.DataFrame(email_components.tolist(), index=df.index)

# Merge extracted data with original DataFrame. Columns left over from a
# previous run of this cell are dropped first, so re-running it does not
# produce duplicate "From"/"To"/"Subject"/"Body" columns.
df = pd.concat(
    [df.drop(columns=email_df.columns, errors="ignore"), email_df],
    axis=1,
)

# Display first few rows
print(df.head())

# Check for missing values in key fields
print("Missing values per column:")
print(df[['From', 'To', 'Subject', 'Body']].isnull().sum())


In [None]:
# Text cleaning function
def clean_text(text):
    """Normalise a piece of email text for keyword matching and vectorisation.

    Null input yields an empty string. Otherwise the text is lower-cased and
    a fixed sequence of regex substitutions is applied in order:
    standalone "re"/"fw" tokens are deleted, runs of non-word characters
    become a single space, digits are deleted, and whitespace is collapsed.
    The result is stripped of leading/trailing whitespace.
    """
    if pd.isnull(text):
        return ""

    # Order matters: each substitution works on the output of the previous one
    substitutions = (
        (r'\b(re|fw)\b', ''),  # reply / forward markers
        (r'\W+', ' '),         # punctuation and other non-word characters
        (r'\d+', ''),          # digits
        (r'\s+', ' '),         # runs of whitespace
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Build the cleaned text columns from the raw Subject and Body columns
for processed_col, raw_col in (("Processed_Subject", "Subject"), ("Processed_Body", "Body")):
    # Missing values become "" before cleaning
    df[processed_col] = df[raw_col].fillna("").apply(clean_text)



In [None]:
# Text cleaning function
def clean_text(text):
    """Return a lower-cased, punctuation- and digit-free version of ``text``.

    NOTE: this repeats the clean_text definition from the previous cell and
    rebinds the same name; both definitions behave identically.

    Null input gives "". Otherwise: lower-case, delete standalone "re"/"fw"
    tokens, turn runs of non-word characters into one space, delete digits,
    collapse whitespace and strip the ends.
    """
    if pd.isnull(text):
        return ""
    without_markers = re.sub(r'\b(re|fw)\b', '', text.lower())  # reply / forward markers
    word_chars_only = re.sub(r'\W+', ' ', without_markers)      # special characters -> space
    without_digits = re.sub(r'\d+', '', word_chars_only)        # numbers
    return re.sub(r'\s+', ' ', without_digits).strip()          # extra spaces

# Clean the Subject and Body text (missing values are treated as "")
subject_text = df["Subject"].fillna("")
body_text = df["Body"].fillna("")
df["Processed_Subject"] = subject_text.apply(clean_text)
df["Processed_Body"] = body_text.apply(clean_text)

# Keywords and phrases whose presence counts towards the spam score
spam_keywords = [
    "win",
    "lottery",
    "free",
    "offer",
    "click here",
    "urgent",
    "claim",
    "limited-time",
    "money",
    "prize",
    "limited time",
]
# Single case-insensitive regex: any keyword, bounded by word boundaries
spam_pattern = re.compile(r'\b({})\b'.format('|'.join(spam_keywords)), re.IGNORECASE)

# Modified function to check if an email is spam based on rule >= 3
def is_spam(row):
    """Rule-based spam label for one row of the email DataFrame.

    Scoring:
        +1 if the processed subject contains a spam keyword
        +1 if the processed body contains a spam keyword
        +2 if the sender address is present and is not an @enron.com address

    Args:
        row: Mapping/Series providing "Processed_Subject", "Processed_Body"
            and "From". Null values are treated as empty strings.

    Returns:
        1 when the score is at least 3 (i.e. an outside sender plus at least
        one keyword hit), otherwise 0.
    """
    subject = row["Processed_Subject"] if pd.notnull(row["Processed_Subject"]) else ""
    body = row["Processed_Body"] if pd.notnull(row["Processed_Body"]) else ""
    sender = row["From"] if pd.notnull(row["From"]) else ""

    score = 0
    # Add points for keyword matches
    if spam_pattern.search(subject):
        score += 1
    if spam_pattern.search(body):
        score += 1

    # Add points if sender is outside @enron.com domain. Domain names are
    # case-insensitive, so the address is normalised before comparing;
    # otherwise "User@Enron.COM" would be scored as an external sender.
    normalized_sender = sender.strip().lower()
    if normalized_sender and not normalized_sender.endswith("@enron.com"):
        score += 2

    # If score >= 3, classify as spam (1), else not spam (0)
    return 1 if score >= 3 else 0

# Score every email row by row and store the resulting 0/1 label
spam_labels = df.apply(is_spam, axis=1)
df["Spam_Label"] = spam_labels

# Show how many emails fall into each class
label_counts = df["Spam_Label"].value_counts()
print(label_counts)


In [None]:
from scipy.sparse import hstack  # Import hstack from scipy.sparse
from sklearn.model_selection import train_test_split  # Import train_test_split

# Drop 'To' column as it's not useful
df = df.drop(columns=["To"], errors="ignore")

# Remove rows with missing values in key fields
df = df.dropna(subset=["From", "Processed_Subject", "Processed_Body"])

# Feature Engineering: Word Count in Subject & Body
# (descriptive columns only; they are not part of the feature matrix X built below)
df["Word_Count_Subject"] = df["Processed_Subject"].apply(lambda x: len(x.split()))
df["Word_Count_Body"] = df["Processed_Body"].apply(lambda x: len(x.split()))

# Display updated dataset
print(df[["Processed_Subject", "Processed_Body", "Word_Count_Subject", "Word_Count_Body", "Spam_Label"]].head())

# TF-IDF Vectorization for Subject & Body
from sklearn.feature_extraction.text import TfidfVectorizer

# Subject and body each get their own vectorizer. Calling fit_transform twice
# on one shared vectorizer re-fits it on the body and discards the vocabulary
# learned from the subjects, so no fitted object would remain that can
# transform new subject text into the same columns as tfidf_subject.
tfidf_params = {"stop_words": "english", "max_features": 500, "ngram_range": (1, 2)}
subject_vectorizer = TfidfVectorizer(**tfidf_params)
body_vectorizer = TfidfVectorizer(**tfidf_params)
# Backward-compatible alias: the single shared vectorizer used previously
# ended up fitted on the body text.
vectorizer = body_vectorizer

# Transform Subject & Body separately
# NOTE(review): both vectorizers are fitted on all rows before the train/test
# split, so IDF statistics from test rows leak into the training features;
# consider fitting on the training rows only.
tfidf_subject = subject_vectorizer.fit_transform(df["Processed_Subject"])
tfidf_body = body_vectorizer.fit_transform(df["Processed_Body"])

# Merge TF-IDF features into a single sparse matrix. CSR is requested
# explicitly because hstack's default COO result does not support row indexing.
X = hstack([tfidf_subject, tfidf_body], format="csr")  # Sparse format to reduce memory usage
y = df["Spam_Label"].values  # Convert labels to NumPy array

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Initialise and train the k-NN classifier (k = 3) on the training split
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = knn_model.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"k-NN Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
class_names = ["Not Spam", "Spam"]
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - k-NN")
plt.show()
