# **Imports**

In [42]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# **Load and process the dataset**

In [39]:
# Read the e-mail dataset from the CSV file at `file_path`.
file_path = "/content/drive/MyDrive/Projects/emails.csv"
data = pd.read_csv(file_path)

# Features are every column except "Email No." and the "Prediction" label;
# "Prediction" alone is the target.
# NOTE(review): "Email No." is presumably a row identifier -- confirm.
X = data.drop(columns=["Email No.", "Prediction"])
y = data["Prediction"]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# **Preprocessing**

In [40]:
# Two preprocessing steps: fill missing values with the column mean, then
# standardize the numerical features.
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

# Both steps are fitted on the training split only; the test split is merely
# transformed with the already-fitted steps.
X_train = scaler.fit_transform(imputer.fit_transform(X_train))
X_test = scaler.transform(imputer.transform(X_test))


# **Model Selection, Training, and Evaluation**

In [41]:
# Candidate models, keyed by the display name used in the printed report.
# Every estimator gets the same random_state so repeated runs are comparable.
# LogisticRegression, DecisionTreeClassifier and SVC must be imported in the
# notebook's import cell (sklearn.linear_model, sklearn.tree, sklearn.svm).
classifiers = {
    # max_iter is raised to 10000 iterations for the solver.
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}


def train_and_evaluate(classifier, name):
    """Fit ``classifier`` on the training split and score it on both splits.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and fits ``classifier`` in place via ``classifier.fit``.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label stored under the ``"name"`` key of the result.

    Returns:
        dict: ``"name"`` plus accuracy, precision, recall and F1-score for
        each split, keyed ``"train_<metric>"`` and ``"test_<metric>"``.
    """
    classifier.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for all four metrics
    # instead of re-running inference for every score.
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)

    return {
        "name": name,
        "train_accuracy": accuracy_score(y_train, train_pred),
        "test_accuracy": accuracy_score(y_test, test_pred),
        "train_precision": precision_score(y_train, train_pred),
        "test_precision": precision_score(y_test, test_pred),
        "train_recall": recall_score(y_train, train_pred),
        "test_recall": recall_score(y_test, test_pred),
        "train_f1": f1_score(y_train, train_pred),
        "test_f1": f1_score(y_test, test_pred),
    }


# Fit and score every candidate, collecting one metrics dict per classifier.
results = []
for name, classifier in classifiers.items():
    scores = train_and_evaluate(classifier, name)
    results.append(scores)

# The best algorithm is the entry with the highest accuracy on the test split.
best_algorithm = max(results, key=lambda entry: entry["test_accuracy"])

# Printed caption -> key in the metrics dict, in reporting order.
metric_captions = (
    ("Training Accuracy:", "train_accuracy"),
    ("Testing Accuracy:", "test_accuracy"),
    ("Training Precision:", "train_precision"),
    ("Testing Precision:", "test_precision"),
    ("Training Recall:", "train_recall"),
    ("Testing Recall:", "test_recall"),
    ("Training F1-score:", "train_f1"),
    ("Testing F1-score:", "test_f1"),
)

# Print the full metric table for each classifier, then the winner.
for result in results:
    print("\nClassifier:", result["name"])
    for caption, metric_key in metric_captions:
        print(caption, result[metric_key])

print("\nBest performing algorithm:", best_algorithm["name"])



Classifier: Logistic Regression
Training Accuracy: 0.9997582789460963
Testing Accuracy: 0.970048309178744
Training Precision: 0.9991701244813278
Testing Precision: 0.9260450160771704
Training Recall: 1.0
Testing Recall: 0.972972972972973
Training F1-score: 0.9995848899958489
Testing F1-score: 0.9489291598023064

Classifier: Decision Tree
Training Accuracy: 1.0
Testing Accuracy: 0.923671497584541
Training Precision: 1.0
Testing Precision: 0.8653198653198653
Training Recall: 1.0
Testing Recall: 0.8682432432432432
Training F1-score: 1.0
Testing F1-score: 0.866779089376054

Classifier: Support Vector Machine
Training Accuracy: 0.9702683103698332
Testing Accuracy: 0.9468599033816425
Training Precision: 0.9963269054178145
Testing Precision: 0.9958847736625515
Training Recall: 0.9011627906976745
Testing Recall: 0.8175675675675675
Training F1-score: 0.946358482337549
Testing F1-score: 0.8979591836734693

Classifier: Random Forest
Training Accuracy: 1.0
Testing Accuracy: 0.9777777777777777
Tra