### Data Preparation:

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the e-mail dataset; every column except 'Prediction' is treated as a
# feature and 'Prediction' is the target.
data = pd.read_csv("/content/emails.csv")

X = data.drop(columns=['Prediction'])
y = data['Prediction']

# Keep only int64/float64 columns so non-numeric columns never reach the scaler.
X_numeric = X.select_dtypes(include=['float64', 'int64'])

# Split BEFORE scaling: fitting the scaler on all rows would let the test set's
# mean/variance leak into the training features and bias the test metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training-set scale; kept because later cells
# cross-validate on `X_scaled`.
X_scaled = scaler.transform(X_numeric)

### Model Selection and Training:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Baseline classifiers, all fitted on the same training split and scored on the
# same held-out split so their accuracies are directly comparable.
logistic_regression = LogisticRegression(max_iter=1000)
# Seeded: the tree's split search is randomised, so without a fixed
# random_state the reported accuracy changes from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
svm = SVC()

models = [logistic_regression, decision_tree, svm]
for model in models:
    model.fit(X_train, y_train)
    # `score` on a classifier returns mean accuracy on the given data.
    accuracy = model.score(X_test, y_test)
    print(f"{model.__class__.__name__} Accuracy: {accuracy}")

LogisticRegression Accuracy: 0.970048309178744
DecisionTreeClassifier Accuracy: 0.9217391304347826
SVC Accuracy: 0.9468599033816425


### Model Evaluation:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1-score of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    None. The four metrics are written to stdout on a single line.
    """
    predicted = model.predict(X_test)
    # Same metric order as the printed line: accuracy, precision, recall, F1.
    accuracy, precision, recall, f1 = (
        metric(y_test, predicted)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")

# Hold-out metrics for the selected baseline model.
chosen_model = logistic_regression
evaluate_model(chosen_model, X_test, y_test)

# Cross-validate a scaler+model pipeline on the unscaled numeric features so
# the scaler is re-fitted inside every training fold. Cross-validating on a
# matrix that was already scaled with all rows leaks each validation fold's
# statistics into preprocessing.
cv_pipeline = make_pipeline(StandardScaler(), chosen_model)
cv_scores = cross_val_score(cv_pipeline, X_numeric, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Accuracy: 0.970048309178744, Precision: 0.9288025889967637, Recall: 0.9695945945945946, F1-score: 0.9487603305785124
Cross-validation scores: [0.95652174 0.96714976 0.96324952 0.96034816 0.94970986]
Mean CV accuracy: 0.9593958082209701


### Ensemble Methods

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Both ensembles are randomised (bootstrap / feature sub-sampling), so they are
# seeded to make the reported numbers reproducible on a fresh kernel.
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

ensemble_models = [random_forest, gradient_boosting]
for model in ensemble_models:
    model.fit(X_train, y_train)
    # `score` on a classifier returns mean accuracy on the given data.
    accuracy = model.score(X_test, y_test)
    print(f"{model.__class__.__name__} Accuracy: {accuracy}")

# Detailed hold-out metrics for the selected ensemble model.
chosen_ensemble_model = gradient_boosting
evaluate_model(chosen_ensemble_model, X_test, y_test)

# 5-fold cross-validated score of the selected ensemble on the scaled features.
cv_scores_ensemble = cross_val_score(chosen_ensemble_model, X_scaled, y, cv=5)
print("Cross-validation scores for ensemble model:", cv_scores_ensemble)
print("Mean CV accuracy for ensemble model:", cv_scores_ensemble.mean())

RandomForestClassifier Accuracy: 0.9710144927536232
GradientBoostingClassifier Accuracy: 0.9719806763285024
Accuracy: 0.9719806763285024, Precision: 0.9435215946843853, Recall: 0.9594594594594594, F1-score: 0.9514237855946398
Cross-validation scores for ensemble model: [0.95362319 0.96328502 0.95551257 0.96711799 0.94003868]
Mean CV accuracy for ensemble model: 0.9559154916416711


### Hyperparameter Tuning of the Model

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings

# Silence scikit-learn ConvergenceWarning messages from here on.
# NOTE(review): presumably added because the small max_iter values tried in the
# logistic-regression grid search do not converge; confirm this is intended.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [None]:
# Exhaustive 5-fold grid search for Logistic Regression over the inverse
# regularisation strength `C` and the solver iteration cap `max_iter`.
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=[100, 200, 300, 400, 500, 1000],
)
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_lr,
    cv=5,
)
# `fit` returns the search object itself, so the best model can be read off
# directly.
best_lr = grid_search_lr.fit(X_train, y_train).best_estimator_

# Hold-out metrics of the tuned model
print("Logistic Regression (Grid Search) Evaluation:")
evaluate_model(best_lr, X_test, y_test)

Logistic Regression (Grid Search) Evaluation:
Accuracy: 0.9806763285024155, Precision: 0.9630872483221476, Recall: 0.9695945945945946, F1-score: 0.9663299663299664


In [None]:
# Grid Search for Random Forest: exhaustive 5-fold search over forest size,
# tree depth and the minimum sample counts for splits and leaves.
param_grid_rf = {'n_estimators': [100, 200, 300],
                 'max_depth': [10, 20, 30, None],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
# The forest is seeded so that differences between candidates come from the
# hyper-parameters rather than from random bootstrap noise, and so that the
# selected configuration is the same on every run.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_

print("Random Forest (Grid Search) Evaluation:")
evaluate_model(best_rf, X_test, y_test)

Random Forest (Grid Search) Evaluation:
Accuracy: 0.9758454106280193, Precision: 0.9531772575250836, Recall: 0.9628378378378378, F1-score: 0.9579831932773111


In [None]:
# Random Search for Random Forest: evaluates 10 randomly sampled combinations
# from the candidate values below using 5-fold cross-validation.
param_dist_rf = {'n_estimators': [100, 200, 300],
                 'max_depth': [10, 20, 30, None],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
# Two sources of randomness are seeded here: which parameter combinations get
# sampled (the search's random_state) and how each forest is grown (the
# estimator's random_state). Without both, reruns report different results.
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)
best_rf_random = random_search_rf.best_estimator_

print("Random Forest (Random Search) Evaluation:")
evaluate_model(best_rf_random, X_test, y_test)

Random Forest (Random Search) Evaluation:
Accuracy: 0.9768115942028985, Precision: 0.9563758389261745, Recall: 0.9628378378378378, F1-score: 0.9595959595959596
