In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the e-mail table from CSV into a DataFrame.
# NOTE(review): this is an absolute path (presumably a Colab upload
# location) -- adjust it when running the notebook elsewhere.
emails_dataset = pd.read_csv('/content/emails.csv')

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
emails_dataset.head(n=5)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Summarise the frame: row count, column count, dtypes and memory usage.
emails_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [None]:
# (rows, columns) of the loaded table.
emails_dataset.shape

(5172, 3002)

In [None]:
# Copy of the table without the 'Email No.' identifier column.
# NOTE(review): no later cell visible here reads this variable -- confirm
# whether it is still needed.
emails_dataset_drop_email_col = emails_dataset.drop(columns='Email No.')

In [None]:
# Features are every column except the e-mail identifier and the label;
# the label is the 'Prediction' column.
X = emails_dataset.drop(columns=['Email No.', 'Prediction'])
y = emails_dataset['Prediction']

# Keep only the float64/int64 columns, then standardise them.
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# train/test split takes place.
X_numeric = X.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler().fit(X_numeric)
X_scaled = scaler.transform(X_numeric)

In [None]:
# Split the *unscaled* features first, then learn the standardisation from
# the training rows only. Fitting the scaler on the full table would let the
# test rows influence the mean/variance applied to the training data
# (preprocessing leakage) and make the hold-out scores optimistic.
# test_size and random_state are unchanged from the previous split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=40
)

# A dedicated scaler is used so that `scaler` / `X_scaled` defined earlier
# are left untouched for any cell that still relies on them.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)


In [None]:
# One seed shared by every estimator that accepts one, so that
# Restart Kernel -> Run All reproduces the same fitted models and scores.
# Previously the tree-based models were unseeded and their results changed
# from run to run.
MODEL_SEED = 42

logistic_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=MODEL_SEED)
SVM = SVC(kernel='linear', random_state=MODEL_SEED)
random_forest = RandomForestClassifier(random_state=MODEL_SEED)
gradient_boost = GradientBoostingClassifier(random_state=MODEL_SEED)

In [None]:
# Fit every candidate on the training split and report its accuracy on the
# held-out split.
models = [logistic_reg, decision_tree, SVM, random_forest, gradient_boost]
for model in models:
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"{type(model).__name__} Accuracy: {accuracy:.4f}")

LogisticRegression Accuracy: 0.9710
DecisionTreeClassifier Accuracy: 0.9343
SVC Accuracy: 0.9488
RandomForestClassifier Accuracy: 0.9681
GradientBoostingClassifier Accuracy: 0.9681


In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Print accuracy, precision and recall of a fitted model on test data.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_test: The test features, passed unchanged to ``model.predict``.
        y_test: The true test labels the predictions are compared against.

    Returns:
        None. The three metrics are printed, each with four decimal places.
    """
    y_pred = model.predict(X_test)

    # accuracy_score / precision_score / recall_score come from the
    # notebook's import cell; they are called with their default settings.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

In [None]:
# Report detailed metrics for the random forest on the held-out split.
chosen_model = random_forest
evaluate_model(model=chosen_model, X_test=X_test, y_test=y_test)

Accuracy: 0.9681
Precision: 0.9336
Recall: 0.9558


In [None]:
# Same detailed metrics for the remaining (already fitted) candidates.
evaluate_models = [logistic_reg, decision_tree, SVM, gradient_boost]
for model in evaluate_models:
    print(f"{type(model).__name__} Evaluation:")
    evaluate_model(model, X_test, y_test)

LogisticRegression Evaluation:
Accuracy: 0.9710
Precision: 0.9430
Recall: 0.9558
DecisionTreeClassifier Evaluation:
Accuracy: 0.9343
Precision: 0.8599
Recall: 0.9184
SVC Evaluation:
Accuracy: 0.9488
Precision: 0.9113
Recall: 0.9082
GradientBoostingClassifier Evaluation:
Accuracy: 0.9681
Precision: 0.9424
Recall: 0.9456


In [None]:
# 5-fold cross-validation of the chosen model.
# The scaler is placed inside a pipeline and fed the unscaled features, so it
# is re-fitted on the training part of every fold; the validation fold never
# contributes to the mean/variance used for standardisation.
chosen_pipeline = make_pipeline(StandardScaler(), chosen_model)
cv_scores = cross_val_score(chosen_pipeline, X_numeric, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Cross-validation scores: [0.95942029 0.97487923 0.95647969 0.96905222 0.93617021]
Mean CV accuracy: 0.9592003289135574


In [None]:
# 5-fold cross-validation of every candidate model.
# Each model is wrapped in a pipeline with its own StandardScaler and given
# the unscaled features, so scaling statistics are learned from the training
# part of each fold only (no leakage from the validation fold).
for model in models:
    model_name = model.__class__.__name__
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_numeric, y, cv=5)
    print(f"{model_name} - Cross-validation scores: {', '.join(map(str, cv_scores))}")
    print(f"{model_name} - Mean CV accuracy: {cv_scores.mean():.4f}")

LogisticRegression - Cross-validation scores: 0.9565217391304348, 0.9671497584541063, 0.9632495164410058, 0.960348162475822, 0.9497098646034816
LogisticRegression - Mean CV accuracy: 0.9594
DecisionTreeClassifier - Cross-validation scores: 0.9188405797101449, 0.9323671497584541, 0.9206963249516441, 0.9284332688588007, 0.879110251450677
DecisionTreeClassifier - Mean CV accuracy: 0.9159
SVC - Cross-validation scores: 0.9294685990338164, 0.9478260869565217, 0.9429400386847195, 0.937137330754352, 0.9148936170212766
SVC - Mean CV accuracy: 0.9345
RandomForestClassifier - Cross-validation scores: 0.9632850241545894, 0.9671497584541063, 0.9584139264990329, 0.9709864603481625, 0.9342359767891683
RandomForestClassifier - Mean CV accuracy: 0.9588
GradientBoostingClassifier - Cross-validation scores: 0.9536231884057971, 0.9632850241545894, 0.9555125725338491, 0.9671179883945842, 0.9400386847195358
GradientBoostingClassifier - Mean CV accuracy: 0.9559
