In [None]:

# Produced by: Kirubel Temesgen
# College ID: C00260396
# Description: Machine Learning Algorithm Overview

In [None]:
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split


In [None]:
# Load dataset
# The CSV location can be overridden with the EMAILS_CSV_PATH environment
# variable so the notebook runs on other machines without editing this cell;
# when the variable is unset the original local path is used.
file_path = os.environ.get("EMAILS_CSV_PATH", r"D:\college\Sem2\DataSci ML\archive\emails.csv")

SAMPLE_FRACTION = 0.1  # Use 10% of data for efficiency
RANDOM_SEED = 42       # Fixed seed so the same rows are drawn on every run

df = pd.read_csv(file_path).sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)

# Extract email components
def extract_email_components(email):
    """Pull the sender, subject and body out of one raw email string.

    Headers are looked up only in the section before the first blank line and
    must start at the beginning of a line, so a header such as ``X-From:`` or a
    quoted ``From:``/``Subject:`` inside the body is not mistaken for the real
    header.

    Parameters
    ----------
    email : str
        Raw message text: header lines, a blank line, then the body.

    Returns
    -------
    dict
        Keys ``"From"``, ``"Subject"`` and ``"Body"``. A value is ``None`` when
        that part is missing, or when ``email`` is not a string at all.
    """
    # Missing / non-text cells (e.g. NaN) cannot be searched with a regex.
    if not isinstance(email, str):
        return {"From": None, "Subject": None, "Body": None}

    # Everything after the first blank line is the body; `separator` is empty
    # when the message contains no blank line.
    headers, separator, body = email.partition("\n\n")

    from_match = re.search(r'^From: (.+)', headers, re.MULTILINE)
    subject_match = re.search(r'^Subject: (.*)', headers, re.MULTILINE)

    return {
        "From": from_match.group(1).strip() if from_match else None,
        "Subject": subject_match.group(1).strip() if subject_match else None,
        "Body": body.strip() if separator else None
    }

# Apply function to all emails
email_components = df['message'].apply(extract_email_components)

# Build the extracted columns on df's own index. df was row-sampled, so its
# index labels are not 0..n-1; a default RangeIndex here would make the
# column-wise concat below pair each email with an unrelated row (concat
# aligns on index labels, not on position).
email_df = pd.DataFrame(
    email_components.tolist(),
    index=df.index,
    columns=["From", "Subject", "Body"],
)

# Merge extracted data with original DataFrame
df = pd.concat([df, email_df], axis=1)

# Drop unneeded columns
df = df.drop(columns=["message", "file"], errors="ignore")

# Drop rows with missing values, then renumber the remaining rows from 0
df = df.dropna(subset=["From", "Subject", "Body"]).reset_index(drop=True)


In [None]:
# Text cleaning function
def clean_text(text):
    """Normalise a piece of email text before vectorisation.

    A missing value becomes an empty string. Otherwise the text is lower-cased
    and passed through a fixed sequence of regex substitutions, then trimmed.
    """
    if pd.isnull(text):
        return ""

    # Applied in order; each step works on the output of the previous one.
    substitutions = (
        (r'\b(re|fw)\b', ''),  # standalone "re" / "fw" words (from "Re:" and "FW:")
        (r'\W+', ' '),         # runs of non-word characters -> single space
        (r'\d+', ''),          # digits
        (r'\s+', ' '),         # collapse whitespace
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Add cleaned versions of Subject & Body, then discard the raw text columns
df = (
    df
    .assign(
        Processed_Subject=lambda frame: frame["Subject"].apply(clean_text),
        Processed_Body=lambda frame: frame["Body"].apply(clean_text),
    )
    .drop(columns=["Subject", "Body"])
)


In [None]:
# Initialise TF-IDF vectorizers
# Subject and Body each get their own vectorizer: refitting one shared
# instance on the bodies would replace the vocabulary learned from the
# subjects, so the subject columns of X could no longer be mapped to terms.
tfidf_params = dict(stop_words="english", max_features=500, ngram_range=(1, 2))
subject_vectorizer = TfidfVectorizer(**tfidf_params)
body_vectorizer = TfidfVectorizer(**tfidf_params)

# Kept for backward compatibility: `vectorizer` still ends up being the
# instance fitted on the bodies.
vectorizer = body_vectorizer

# Transform Subject & Body separately
tfidf_subject = subject_vectorizer.fit_transform(df["Processed_Subject"])
tfidf_body = body_vectorizer.fit_transform(df["Processed_Body"])

# Merge TF-IDF features into a single sparse matrix (subject columns first,
# then body columns); CSR is requested explicitly so the result format does
# not depend on scipy's default choice.
X = hstack([tfidf_subject, tfidf_body], format="csr")


In [None]:
# Fit k-Means for a range of k and record two quality measures per k:
# the within-cluster sum of squares (for the elbow plot) and the silhouette score.
wcss = []
silhouette_scores = []
K_range = range(2, 11)  # Try different values for k (2 to 10); range() excludes its stop value

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # Within-cluster sum of squares
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))

# Plot the Elbow Curve and the Silhouette Scores, one figure each.
# Each entry: (values to plot, marker, line style, y-axis label, title).
k_diagnostic_plots = (
    (wcss, 'o', '--', "WCSS (Within Cluster Sum of Squares)", "Elbow Method to Determine Optimal k"),
    (silhouette_scores, 's', '-', "Silhouette Score", "Silhouette Score for Different k"),
)

for y_values, marker, linestyle, y_label, title in k_diagnostic_plots:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(K_range, y_values, marker=marker, linestyle=linestyle)
    ax.set_xlabel("Number of Clusters (k)")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    plt.show()


In [None]:
# Number of clusters chosen from the elbow & silhouette plots; adjust as needed
optimal_k = 5

# Fit the final k-Means model and store each email's cluster id on df
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(X)
df["Cluster_Label"] = kmeans.labels_

# Show how many emails landed in each cluster
cluster_sizes = df["Cluster_Label"].value_counts()
print(cluster_sizes)


In [None]:
# Sample emails for each cluster
for cluster in range(optimal_k):
    cluster_emails = df.loc[
        df["Cluster_Label"] == cluster,
        ["Processed_Subject", "Processed_Body"],
    ]
    print(f"\nCluster {cluster} Sample Emails:")
    # Cap the sample at the cluster size: asking for 5 rows from a smaller
    # cluster would raise ValueError. The fixed seed keeps the printed
    # examples the same on every run.
    print(cluster_emails.sample(n=min(5, len(cluster_emails)), random_state=42))

# Bar chart of how many emails fall into each cluster
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df["Cluster_Label"], ax=ax)
ax.set_xlabel("Cluster Label")
ax.set_ylabel("Number of Emails")
ax.set_title("Distribution of Emails in Clusters")
plt.show()
