In [1]:
#Classification

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the email dataset into a DataFrame (path is relative to the working
# directory of the notebook/script)
data = pd.read_csv('emails.csv')

# Columns that must not be fed to the classifiers: the row identifier and the
# label itself. Everything else is kept as a feature (word-frequency columns,
# per the dataset description).
non_feature_columns = ['Email No.', 'Prediction']
X = data.drop(columns=non_feature_columns)  # Feature matrix
y = data['Prediction']  # Target variable

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Initialize models
# Candidate classifiers, keyed by the display name used in the printed report.
#
# The tree-based estimators are randomized internally (feature permutation at
# each split, bootstrap sampling), so without a seed their metrics change on
# every run. They are seeded here so the comparison is reproducible, in line
# with the seeded train/test split.
RANDOM_STATE = 42

models = {
    # max_iter is raised above the library default to give the solver more
    # iterations to converge
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Fit every candidate on the training split and report its test-set metrics.
# The metric functions are listed in the order their results are unpacked.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

for model_name, model in models.items():
    # scikit-learn estimators return themselves from fit(), so training and
    # prediction on the held-out rows can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Score the predictions against the true test labels
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred) for metric in metric_functions
    )

    # Emit the whole report for this model in one write; the trailing "\n"
    # on the last entry leaves a blank line between models
    print("\n".join([
        f"--- {model_name} ---",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1 Score: {f1}\n",
    ]))


--- Logistic Regression ---
Accuracy: 0.9719806763285024
Precision: 0.9435215946843853
Recall: 0.9594594594594594
F1 Score: 0.9514237855946398

--- Decision Tree ---
Accuracy: 0.9246376811594202
Precision: 0.8657718120805369
Recall: 0.8716216216216216
F1 Score: 0.8686868686868686

--- Support Vector Machine ---
Accuracy: 0.8173913043478261
Precision: 0.9083969465648855
Recall: 0.40202702702702703
F1 Score: 0.5573770491803279

--- Random Forest ---
Accuracy: 0.9777777777777777
Precision: 0.9595959595959596
Recall: 0.9628378378378378
F1 Score: 0.9612141652613827

--- Gradient Boosting ---
Accuracy: 0.9719806763285024
Precision: 0.9435215946843853
Recall: 0.9594594594594594
F1 Score: 0.9514237855946398

