# Tugas Group Project 2

- Rafindra Nabiel Fawwaz
- Akhtar Zia Faizarobbi
- Naufal Maula Nabil
- Sinta Dewi Rahmawati

## Pemilahan Data

### Import Dataset

In [None]:
import pandas as pd

In [None]:
# Load the two application tables from CSV files in the working directory
# (presumably already cleaned and encoded, judging by the file names).
train_df, test_df = (
    pd.read_csv('app_train_cleaned_encoded.csv'),
    pd.read_csv('app_test_cleaned_encoded.csv'),
)

In [None]:
# Print both shapes explicitly: a notebook cell only echoes its last bare
# expression, so the training shape would otherwise never be displayed.
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

### Splitting Data

In [None]:
# The label is the TARGET column; the features are every other column except
# TARGET itself and SK_ID_CURR.
y = train_df['TARGET']
X = train_df.drop(['TARGET', 'SK_ID_CURR'], axis=1)

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
# Hold out 25% of the rows as a test split, stratified on the label and seeded
# so the split is reproducible.
split_options = {'test_size': 0.25, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Statistik data

In [None]:
# Report the label distribution before and after the split.
# normalize=True makes value_counts return fractions instead of raw counts.
print("Jumlah data sebelum split:")
print(y.value_counts(normalize=True), end="\n\n")

print("Jumlah data setelah split:")
print("Train set:")
print(y_train.value_counts(normalize=True), end="\n\n")
print("Test set:")
print(y_test.value_counts(normalize=True))

#### Kfold

In [None]:
# Shared stratified 10-fold splitter; shuffling is seeded so the folds are reproducible.
kfold = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

## Eksperimen Model Klasifikasi

### 1. KNN

### 2. Naive Bayes

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Stratified 10-fold cross-validation of Gaussian Naive Bayes on the training split.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
nb_model = GaussianNB()

# One list per column; every fold appends exactly one value to each list.
results = {
    column: []
    for column in ('fold', 'accuracy', 'precision', 'recall', 'f1_score', 'auc_roc')
}

# Fit and score once per fold (fold numbers start at 1).
for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), start=1):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Refit the same estimator object on this fold's training rows.
    nb_model.fit(X_tr, y_tr)

    # Hard labels plus the predicted probability from column 1 of predict_proba.
    y_val_pred = nb_model.predict(X_val)
    y_val_proba = nb_model.predict_proba(X_val)[:, 1]

    fold_row = {
        'fold': fold,
        'accuracy': accuracy_score(y_val, y_val_pred),
        'precision': precision_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
        'f1_score': f1_score(y_val, y_val_pred),
        'auc_roc': roc_auc_score(y_val, y_val_proba),
    }
    for column, value in fold_row.items():
        results[column].append(value)

# Tabulate the per-fold scores for inspection.
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Same seeded splitter configuration as the cross-validation cell.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Only the first (train, validation) index pair produced by the splitter is needed.
train_idx, val_idx = next(iter(cv.split(X_train, y_train)))
X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
X_fold_val, y_fold_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

# Fresh model fitted on that fold's training rows.
nb_model = GaussianNB()
nb_model.fit(X_fold_train, y_fold_train)

# Hard labels and the predicted probability from column 1 of predict_proba.
y_pred = nb_model.predict(X_fold_val)
y_proba = nb_model.predict_proba(X_fold_val)[:, 1]

# zero_division=0 reports 0 (without a warning) when a denominator is empty.
acc = accuracy_score(y_fold_val, y_pred)
prec, rec, f1 = (
    scorer(y_fold_val, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_fold_val, y_proba)

# Report the scalar scores, then the per-class report and the confusion matrix.
print("=== Evaluasi Model Naive Bayes (Fold ke-0) ===")
print(f"Accuracy  : {acc:.6f}")
print(f"Precision : {prec:.6f}")
print(f"Recall    : {rec:.6f}")
print(f"F1 Score  : {f1:.6f}")
print(f"AUC-ROC   : {auc:.6f}")
print("\nClassification Report:")
fold_report = classification_report(y_fold_val, y_pred, zero_division=0)
print(fold_report)
print("Confusion Matrix:")
fold_confusion = confusion_matrix(y_fold_val, y_pred)
print(fold_confusion)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Fit Gaussian Naive Bayes on the whole training split (X_train / y_train come
# from the earlier train_test_split cell).
model_naive_bayes = GaussianNB().fit(X_train, y_train)

# Use the fitted model on the held-out test split: hard labels first, then the
# predicted probability from column 1 of predict_proba.
y_test_pred = model_naive_bayes.predict(X_test)
y_test_proba = model_naive_bayes.predict_proba(X_test)[:, 1]


In [None]:
# Predict on the held-out test split with the model fitted on the full training split.
y_test_pred = model_naive_bayes.predict(X_test)
y_test_proba = model_naive_bayes.predict_proba(X_test)[:, 1]

# Scalar metrics; zero_division=0 reports 0 when a denominator is empty.
accuracy = accuracy_score(y_test, y_test_pred)
precision, recall, f1 = (
    scorer(y_test, y_test_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
auc_roc = roc_auc_score(y_test, y_test_proba)

# Print the scalar scores.
print(f"=== Evaluasi Model Naive Bayes pada Data Test ===")
print(f"Accuracy  : {accuracy:.6f}")
print(f"Precision : {precision:.6f}")
print(f"Recall    : {recall:.6f}")
print(f"F1 Score  : {f1:.6f}")
print(f"AUC-ROC   : {auc_roc:.6f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
test_report = classification_report(y_test, y_test_pred)
print(test_report)

# Confusion matrix of true labels against predicted labels.
print("\nConfusion Matrix:")
test_confusion = confusion_matrix(y_test, y_test_pred)
print(test_confusion)

#### HASIL EKSPERIMEN NAIVE BAYES

Model Naive Bayes yang dilatih pada fold ke-0 menunjukkan performa yang sangat buruk, dengan akurasi rendah (~11%), precision rendah (~8%), dan AUC-ROC mendekati 0.5, yang menandakan model tidak mampu membedakan antara kelas. Meskipun recall tinggi (>96%), hal ini terjadi karena model cenderung memprediksi hampir semua data sebagai kelas positif, akibat dari ketidakseimbangan kelas yang signifikan. Secara keseluruhan, model ini tidak cocok digunakan pada dataset ini.

### 3. Logistic Regression

### 4. SVM

In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Learn the scaling statistics from the training split only, then reuse them
# for the test split. Calling fit_transform on X_test would refit the scaler on
# test data, so the two splits would be scaled with different statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyper-parameter candidates for the SVM experiments.
kernel_list = ['linear', 'rbf']
C_values = [0.1, 1, 10, 100, 1000]

# Every (kernel, C) pair, grouped by kernel in the order listed above.
parameter_grid = []
for kernel_name in kernel_list:
    parameter_grid.extend((kernel_name, c_value) for c_value in C_values)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
def evaluate_svm(kernel, C, train_idx, val_idx, X_train, y_train):
    """Fit an SVC on one cross-validation fold and score it on the held-out rows.

    Args:
        kernel: Kernel name forwarded to ``SVC``.
        C: Regularisation parameter forwarded to ``SVC``.
        train_idx: Positional indices of the rows used for fitting.
        val_idx: Positional indices of the rows used for validation.
        X_train: Feature table; rows are selected with ``.iloc``.
        y_train: Labels aligned with ``X_train``; rows are selected with ``.iloc``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``f1_score`` and ``auc_roc`` computed on the validation rows.
    """
    features_fit, labels_fit = X_train.iloc[train_idx], y_train.iloc[train_idx]
    features_val, labels_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # probability=True is required so that predict_proba is available below.
    classifier = SVC(kernel=kernel, C=C, probability=True, random_state=42)
    classifier.fit(features_fit, labels_fit)

    predicted_labels = classifier.predict(features_val)
    second_class_proba = classifier.predict_proba(features_val)[:, 1]

    # Insertion order is kept so the resulting columns appear in the same order.
    metrics = {'accuracy': accuracy_score(labels_val, predicted_labels)}
    for metric_name, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    ):
        # zero_division=0 reports 0 when a denominator is empty.
        metrics[metric_name] = scorer(labels_val, predicted_labels, zero_division=0)
    metrics['auc_roc'] = roc_auc_score(labels_val, second_class_proba)

    return metrics

#### Kombinasi 1: kernel = linear, C = 0.1

In [None]:
from joblib import Parallel, delayed

# Combination: linear kernel, C = 0.1, cross-validated on the shared stratified folds.
kernel, C = 'linear', 0.1
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 2: kernel = linear, C = 1

In [None]:
# Combination: linear kernel, C = 1, cross-validated on the shared stratified folds.
kernel, C = 'linear', 1
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 3: kernel = linear, C = 10

In [None]:
# Combination: linear kernel, C = 10, cross-validated on the shared stratified folds.
kernel, C = 'linear', 10
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 4: kernel = linear, C = 100

In [None]:
# Combination: linear kernel, C = 100, cross-validated on the shared stratified folds.
kernel, C = 'linear', 100
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 5: kernel = linear, C = 1000

In [None]:
# Combination: linear kernel, C = 1000, cross-validated on the shared stratified folds.
kernel, C = 'linear', 1000
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 6: kernel = rbf, C = 0.1

In [None]:
# Combination: RBF kernel, C = 0.1, cross-validated on the shared stratified folds.
kernel, C = 'rbf', 0.1
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 7: kernel = rbf, C = 1

In [None]:
# Combination: RBF kernel, C = 1, cross-validated on the shared stratified folds.
kernel, C = 'rbf', 1
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 8: kernel = rbf, C = 10

In [None]:
# Combination: RBF kernel, C = 10, cross-validated on the shared stratified folds.
kernel, C = 'rbf', 10
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 9: kernel = rbf, C = 100

In [None]:
# Combination: RBF kernel, C = 100, cross-validated on the shared stratified folds.
kernel, C = 'rbf', 100
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Kombinasi 10: kernel = rbf, C = 1000

In [None]:
# Combination: RBF kernel, C = 1000, cross-validated on the shared stratified folds.
kernel, C = 'rbf', 1000
print(f"\nEvaluating SVM with kernel={kernel} and C={C}")

# Build one delayed evaluate_svm call per fold, then run them with joblib.
fold_jobs = [
    delayed(evaluate_svm)(kernel, C, fit_rows, held_out_rows, X_train, y_train)
    for fit_rows, held_out_rows in kfold.split(X_train, y_train)
]
fold_metrics = Parallel(n_jobs=-1)(fold_jobs)

# One row per fold, one column per metric; summarise across folds.
fold_metrics_df = pd.DataFrame(fold_metrics)
print(f"Mean metrics:\n{fold_metrics_df.mean()}")
print(f"Std metrics:\n{fold_metrics_df.std()}")

#### Cari model terbaik berdasarkan mean f1-score


In [None]:
# Build the SVM cross-validation summary here instead of reading `results`.
# The only `results` defined earlier in this notebook is the per-fold Naive
# Bayes dict, which has no 'mean_f1_score', 'kernel' or 'C' entries, so sorting
# it by 'mean_f1_score' raises KeyError. The per-combination SVM cells only
# print their metrics and never store them.
# NOTE: this re-runs the 10-fold evaluation for every (kernel, C) pair in
# parameter_grid, which can take a long time on a large training split.
svm_cv_rows = []
for grid_kernel, grid_C in parameter_grid:
    grid_fold_metrics = Parallel(n_jobs=-1)(
        delayed(evaluate_svm)(grid_kernel, grid_C, fit_rows, held_out_rows, X_train, y_train)
        for fit_rows, held_out_rows in kfold.split(X_train, y_train)
    )
    # Average each metric over the folds and prefix the column with 'mean_'.
    fold_means = pd.DataFrame(grid_fold_metrics).mean()
    summary_row = {'kernel': grid_kernel, 'C': grid_C}
    summary_row.update(
        {f'mean_{metric_name}': value for metric_name, value in fold_means.items()}
    )
    svm_cv_rows.append(summary_row)

# One row per (kernel, C) pair; the best row has the highest mean F1 score.
results_df = pd.DataFrame(svm_cv_rows)
best_model_info = results_df.sort_values(by='mean_f1_score', ascending=False).iloc[0]
print("\nModel terbaik berdasarkan hasil cross-validation:")
print(best_model_info)

#### Fit model terbaik ke seluruh data train

In [None]:
# Pull the selected hyper-parameters out of the best cross-validation row.
best_kernel, best_C = (best_model_info[field] for field in ('kernel', 'C'))

# Refit a single SVC with those settings on the entire training split;
# probability=True keeps predict_proba available for the AUC computations.
final_svm_model = SVC(C=best_C, kernel=best_kernel, random_state=42, probability=True)
final_svm_model.fit(X_train, y_train)


#### Evaluasi di data train

In [None]:
# Score the refitted SVM on the same rows it was trained on.
y_train_pred = final_svm_model.predict(X_train)
y_train_proba = final_svm_model.predict_proba(X_train)[:, 1]

# Compute every score before printing; zero_division=0 reports 0 when a
# denominator is empty.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision, train_recall, train_f1 = (
    scorer(y_train, y_train_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
train_auc = roc_auc_score(y_train, y_train_proba)

print("\nEvaluasi model terbaik pada seluruh data train:")
print(f"Accuracy: {train_accuracy}")
print(f"Precision: {train_precision}")
print(f"Recall: {train_recall}")
print(f"F1 Score: {train_f1}")
print(f"AUC ROC: {train_auc}")

#### Prediksi pada data test

In [None]:
# Score the refitted SVM on the held-out test split.
y_test_pred = final_svm_model.predict(X_test)
y_test_proba = final_svm_model.predict_proba(X_test)[:, 1]

# Compute every score before printing; zero_division=0 reports 0 when a
# denominator is empty.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision, test_recall, test_f1 = (
    scorer(y_test, y_test_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
test_auc = roc_auc_score(y_test, y_test_proba)

print("\nEvaluasi model terbaik pada data test:")
print(f"Accuracy: {test_accuracy}")
print(f"Precision: {test_precision}")
print(f"Recall: {test_recall}")
print(f"F1 Score: {test_f1}")
print(f"AUC ROC: {test_auc}")

### 5. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyper-parameters.
# Fit on the training split only: X / y are the full table that X_test was
# split from, so fitting on them would let the tree see the test rows it is
# about to predict.
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

#### Max Depth

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Candidate tree depths: the integers 1 through 32.
max_depths = np.linspace(1, 32, 32, endpoint=True, dtype=int)

# One score list per metric; each depth appends one value to every list.
metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {key: [] for key in metric_keys}
test_results = {key: [] for key in metric_keys}

for depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)

    # Score the same fitted tree on the training split, then on the test split.
    for features, labels, bucket in (
        (X_train, y_train, train_results),
        (X_test, y_test, test_results),
    ):
        labels_pred = dt.predict(features)
        second_class_prob = dt.predict_proba(features)[:, 1]
        bucket['accuracy'].append(accuracy_score(labels, labels_pred))
        bucket['precision'].append(precision_score(labels, labels_pred, zero_division=0))
        bucket['recall'].append(recall_score(labels, labels_pred, zero_division=0))
        bucket['f1'].append(f1_score(labels, labels_pred, zero_division=0))
        bucket['roc_auc'].append(roc_auc_score(labels, second_class_prob))

# Draw every metric for both splits on a single figure.
plt.figure(figsize=(12, 8))

# (metric key, train line format, train label, test line format, test label),
# listed in the order the curves are drawn and shown in the legend.
curve_specs = (
    ('roc_auc', 'b', 'Train ROC AUC', 'r', 'Test ROC AUC'),
    ('accuracy', 'g--', 'Train Accuracy', 'y--', 'Test Accuracy'),
    ('precision', 'c-.', 'Train Precision', 'm-.', 'Test Precision'),
    ('recall', 'k:', 'Train Recall', 'orange', 'Test Recall'),
    ('f1', 'purple', 'Train F1-Score', 'brown', 'Test F1-Score'),
)
for key, train_fmt, train_label, test_fmt, test_label in curve_specs:
    plt.plot(max_depths, train_results[key], train_fmt, label=train_label)
    plt.plot(max_depths, test_results[key], test_fmt, label=test_label)

# Axis labels, title, legend and grid.
plt.xlabel('Tree Depth')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Tree Depth')
plt.legend(loc='best')
plt.grid()
plt.show()


In [None]:
# Depths to compare with 10-fold cross-validation on the training split.
max_depths = [14, 15]
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for depth in max_depths:
    print(f"\nEvaluating max_depth={depth}")
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)

    # Start from empty score lists for every depth. Sharing one set of lists
    # across depths would make each later summary average over the folds of
    # all depths evaluated so far.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Rows used to fit and to validate in this fold.
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dt.fit(X_tr, y_tr)

        # Score the fitted tree on the fitting rows, then on the validation rows.
        fold_scores = {}
        for part, X_part, y_part, store in (
            ('train', X_tr, y_tr, train_results),
            ('val', X_val, y_val, test_results),
        ):
            part_pred = dt.predict(X_part)
            part_prob = dt.predict_proba(X_part)[:, 1]
            scores = {
                'accuracy': accuracy_score(y_part, part_pred),
                'precision': precision_score(y_part, part_pred, zero_division=0),
                'recall': recall_score(y_part, part_pred, zero_division=0),
                'f1': f1_score(y_part, part_pred, zero_division=0),
                'roc_auc': roc_auc_score(y_part, part_prob),
            }
            for name in metric_names:
                store[name].append(scores[name])
            fold_scores[part] = scores

        # Per-fold report ("Test" here refers to the validation rows of the fold).
        tr, va = fold_scores['train'], fold_scores['val']
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={tr['accuracy']:.4f}, Precision={tr['precision']:.4f}, Recall={tr['recall']:.4f}, F1={tr['f1']:.4f}, ROC AUC={tr['roc_auc']:.4f}")
        print(f"  Test Metrics:  Accuracy={va['accuracy']:.4f}, Precision={va['precision']:.4f}, Recall={va['recall']:.4f}, F1={va['f1']:.4f}, ROC AUC={va['roc_auc']:.4f}")

    # Mean and standard deviation over this depth's folds. Dedicated names are
    # used so the dataset frames train_df / test_df are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    val_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': val_metrics_df.mean(),
        'Test Std': val_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Min_samples_split

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Candidate thresholds as fractions of the training split: 0.1, 0.2, ..., 1.0.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)

# One score list per metric; each fraction appends one value to every list.
metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {key: [] for key in metric_keys}
test_results = {key: [] for key in metric_keys}

for min_samples_split in min_samples_splits:
    # The fraction is converted to an absolute row count before it is passed on.
    dt = DecisionTreeClassifier(min_samples_split=int(min_samples_split * len(X_train)), random_state=42)
    dt.fit(X_train, y_train)

    # Score the same fitted tree on the training split, then on the test split.
    for features, labels, bucket in (
        (X_train, y_train, train_results),
        (X_test, y_test, test_results),
    ):
        labels_pred = dt.predict(features)
        second_class_prob = dt.predict_proba(features)[:, 1]
        bucket['accuracy'].append(accuracy_score(labels, labels_pred))
        bucket['precision'].append(precision_score(labels, labels_pred, zero_division=0))
        bucket['recall'].append(recall_score(labels, labels_pred, zero_division=0))
        bucket['f1'].append(f1_score(labels, labels_pred, zero_division=0))
        bucket['roc_auc'].append(roc_auc_score(labels, second_class_prob))

# Draw every metric for both splits on a single figure.
plt.figure(figsize=(12, 8))

# (metric key, train line format, train label, test line format, test label),
# listed in the order the curves are drawn and shown in the legend.
curve_specs = (
    ('roc_auc', 'b', 'Train ROC AUC', 'r', 'Test ROC AUC'),
    ('accuracy', 'g--', 'Train Accuracy', 'y--', 'Test Accuracy'),
    ('precision', 'c-.', 'Train Precision', 'm-.', 'Test Precision'),
    ('recall', 'k:', 'Train Recall', 'orange', 'Test Recall'),
    ('f1', 'purple', 'Train F1-Score', 'brown', 'Test F1-Score'),
)
for key, train_fmt, train_label, test_fmt, test_label in curve_specs:
    plt.plot(min_samples_splits, train_results[key], train_fmt, label=train_label)
    plt.plot(min_samples_splits, test_results[key], test_fmt, label=test_label)

# Axis labels, title, legend and grid.
plt.xlabel('Min Samples Split (Fraction of Training Data)')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Min Samples Split')
plt.legend(loc='best')
plt.grid()
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Fractions of the training split to compare with 10-fold cross-validation.
min_samples_splits = [0.2, 0.4]
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for min_samples_split in min_samples_splits:
    print(f"\nEvaluating min_samples_split={min_samples_split}")
    # The fraction is converted to an absolute row count based on len(X_train).
    dt = DecisionTreeClassifier(min_samples_split=int(min_samples_split * len(X_train)), random_state=42)

    # Start from empty score lists for every setting. Sharing one set of lists
    # across settings would make each later summary average over the folds of
    # all settings evaluated so far.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Rows used to fit and to validate in this fold.
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dt.fit(X_tr, y_tr)

        # Score the fitted tree on the fitting rows, then on the validation rows.
        fold_scores = {}
        for part, X_part, y_part, store in (
            ('train', X_tr, y_tr, train_results),
            ('val', X_val, y_val, test_results),
        ):
            part_pred = dt.predict(X_part)
            part_prob = dt.predict_proba(X_part)[:, 1]
            scores = {
                'accuracy': accuracy_score(y_part, part_pred),
                'precision': precision_score(y_part, part_pred, zero_division=0),
                'recall': recall_score(y_part, part_pred, zero_division=0),
                'f1': f1_score(y_part, part_pred, zero_division=0),
                'roc_auc': roc_auc_score(y_part, part_prob),
            }
            for name in metric_names:
                store[name].append(scores[name])
            fold_scores[part] = scores

        # Per-fold report ("Test" here refers to the validation rows of the fold).
        tr, va = fold_scores['train'], fold_scores['val']
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={tr['accuracy']:.4f}, Precision={tr['precision']:.4f}, Recall={tr['recall']:.4f}, F1={tr['f1']:.4f}, ROC AUC={tr['roc_auc']:.4f}")
        print(f"  Test Metrics:  Accuracy={va['accuracy']:.4f}, Precision={va['precision']:.4f}, Recall={va['recall']:.4f}, F1={va['f1']:.4f}, ROC AUC={va['roc_auc']:.4f}")

    # Mean and standard deviation over this setting's folds. Dedicated names are
    # used so the dataset frames train_df / test_df are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    val_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': val_metrics_df.mean(),
        'Test Std': val_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Min samples leaf

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Candidate leaf sizes as fractions of the training split: 0.1, 0.2, ..., 0.5.
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)

# One score list per metric; each fraction appends one value to every list.
metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {key: [] for key in metric_keys}
test_results = {key: [] for key in metric_keys}

for min_samples_leaf in min_samples_leafs:
    # The fraction is converted to an absolute row count before it is passed on.
    dt = DecisionTreeClassifier(min_samples_leaf=int(min_samples_leaf * len(X_train)), random_state=42)
    dt.fit(X_train, y_train)

    # Score the same fitted tree on the training split, then on the test split.
    for features, labels, bucket in (
        (X_train, y_train, train_results),
        (X_test, y_test, test_results),
    ):
        labels_pred = dt.predict(features)
        second_class_prob = dt.predict_proba(features)[:, 1]
        bucket['accuracy'].append(accuracy_score(labels, labels_pred))
        bucket['precision'].append(precision_score(labels, labels_pred, zero_division=0))
        bucket['recall'].append(recall_score(labels, labels_pred, zero_division=0))
        bucket['f1'].append(f1_score(labels, labels_pred, zero_division=0))
        bucket['roc_auc'].append(roc_auc_score(labels, second_class_prob))

# Draw every metric for both splits on a single figure.
plt.figure(figsize=(12, 8))

# (metric key, train line format, train label, test line format, test label),
# listed in the order the curves are drawn and shown in the legend.
curve_specs = (
    ('roc_auc', 'b', 'Train ROC AUC', 'r', 'Test ROC AUC'),
    ('accuracy', 'g--', 'Train Accuracy', 'y--', 'Test Accuracy'),
    ('precision', 'c-.', 'Train Precision', 'm-.', 'Test Precision'),
    ('recall', 'k:', 'Train Recall', 'orange', 'Test Recall'),
    ('f1', 'purple', 'Train F1-Score', 'brown', 'Test F1-Score'),
)
for key, train_fmt, train_label, test_fmt, test_label in curve_specs:
    plt.plot(min_samples_leafs, train_results[key], train_fmt, label=train_label)
    plt.plot(min_samples_leafs, test_results[key], test_fmt, label=test_label)

# Axis labels, title, legend and grid.
plt.xlabel('Min Samples Leaf (Fraction of Training Data)')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Min Samples Leaf')
plt.legend(loc='best')
plt.grid()
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Candidate min_samples_leaf values, expressed as fractions of the samples being fitted
min_samples_leafs = [0.2, 0.3]

# Metrics recorded for every fold
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate each candidate with the notebook's stratified 10-fold splitter (`kfold`)
for min_samples_leaf in min_samples_leafs:
    print(f"\nEvaluating min_samples_leaf={min_samples_leaf}")
    # Pass the fraction straight to scikit-learn: a float min_samples_leaf is
    # resolved against the rows given to fit(), so the leaf size scales with
    # each training fold instead of with the whole of X_train.
    dt = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, random_state=42)

    # Fresh containers for every candidate; otherwise the summary printed for
    # the second value would also average the folds of the first one.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Split the data for the current fold
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train the model on the training fold
        dt.fit(X_tr, y_tr)

        # Predictions on training fold
        train_pred = dt.predict(X_tr)
        train_prob = dt.predict_proba(X_tr)[:, 1]

        # Calculate metrics for training fold
        train_accuracy = accuracy_score(y_tr, train_pred)
        train_precision = precision_score(y_tr, train_pred, zero_division=0)
        train_recall = recall_score(y_tr, train_pred, zero_division=0)
        train_f1 = f1_score(y_tr, train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_tr, train_prob)

        # Store training metrics
        train_results['accuracy'].append(train_accuracy)
        train_results['precision'].append(train_precision)
        train_results['recall'].append(train_recall)
        train_results['f1'].append(train_f1)
        train_results['roc_auc'].append(train_roc_auc)

        # Predictions on validation fold
        val_pred = dt.predict(X_val)
        val_prob = dt.predict_proba(X_val)[:, 1]

        # Calculate metrics for validation fold
        val_accuracy = accuracy_score(y_val, val_pred)
        val_precision = precision_score(y_val, val_pred, zero_division=0)
        val_recall = recall_score(y_val, val_pred, zero_division=0)
        val_f1 = f1_score(y_val, val_pred, zero_division=0)
        val_roc_auc = roc_auc_score(y_val, val_prob)

        # Store validation metrics (reported under the "Test" label)
        test_results['accuracy'].append(val_accuracy)
        test_results['precision'].append(val_precision)
        test_results['recall'].append(val_recall)
        test_results['f1'].append(val_f1)
        test_results['roc_auc'].append(val_roc_auc)

        # Print metrics for the current fold
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={train_accuracy:.4f}, Precision={train_precision:.4f}, Recall={train_recall:.4f}, F1={train_f1:.4f}, ROC AUC={train_roc_auc:.4f}")
        print(f"  Test Metrics:  Accuracy={val_accuracy:.4f}, Precision={val_precision:.4f}, Recall={val_recall:.4f}, F1={val_f1:.4f}, ROC AUC={val_roc_auc:.4f}")

    # Mean and standard deviation over the folds of this candidate. The frames
    # get their own names so the train_df/test_df datasets loaded at the top of
    # the notebook are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    test_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': test_metrics_df.mean(),
        'Test Std': test_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Max Features

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Candidate values: every feature count from 1 up to the number of columns in X_train
max_features = list(range(1, X_train.shape[1] + 1))

# Per-metric score lists; one entry is appended per candidate value
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {name: [] for name in metric_names}
test_results = {name: [] for name in metric_names}

# Fit one tree per candidate and score it on the train and test splits
for max_feature in max_features:
    dt = DecisionTreeClassifier(max_features=max_feature, random_state=42)
    dt.fit(X_train, y_train)

    # Predicted labels and column-1 probabilities for both splits
    train_pred = dt.predict(X_train)
    train_prob = dt.predict_proba(X_train)[:, 1]
    test_pred = dt.predict(X_test)
    test_prob = dt.predict_proba(X_test)[:, 1]

    # Same five metrics for each split, stored in the matching results dict
    for score_store, truth, hard_pred, soft_pred in (
        (train_results, y_train, train_pred, train_prob),
        (test_results, y_test, test_pred, test_prob),
    ):
        score_store['accuracy'].append(accuracy_score(truth, hard_pred))
        score_store['precision'].append(precision_score(truth, hard_pred, zero_division=0))
        score_store['recall'].append(recall_score(truth, hard_pred, zero_division=0))
        score_store['f1'].append(f1_score(truth, hard_pred, zero_division=0))
        score_store['roc_auc'].append(roc_auc_score(truth, soft_pred))

# Draw all curves on a single figure
plt.figure(figsize=(12, 8))

# One row per metric: (results key, legend name, train line format, test line format).
# The row order fixes the drawing order and therefore the legend order.
curve_specs = (
    ('roc_auc', 'ROC AUC', 'b', 'r'),
    ('accuracy', 'Accuracy', 'g--', 'y--'),
    ('precision', 'Precision', 'c-.', 'm-.'),
    ('recall', 'Recall', 'k:', 'orange'),
    ('f1', 'F1-Score', 'purple', 'brown'),
)
for metric_key, legend_name, train_fmt, test_fmt in curve_specs:
    plt.plot(max_features, train_results[metric_key], train_fmt, label=f'Train {legend_name}')
    plt.plot(max_features, test_results[metric_key], test_fmt, label=f'Test {legend_name}')

# Axis titles, legend and grid
plt.xlabel('Max Features')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Max Features')
plt.legend(loc='best')
plt.grid()
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Candidate max_features values to compare
max_features_values = [10, 'sqrt']

# Metrics recorded for every fold
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate each candidate with the notebook's stratified 10-fold splitter (`kfold`).
# The loop variable has its own name so it does not rebind `max_features`, the
# candidate list the sweep/plot cell above builds.
for max_features_value in max_features_values:
    print(f"\nEvaluating max_features={max_features_value}")
    dt = DecisionTreeClassifier(max_features=max_features_value, random_state=42)

    # Fresh containers for every candidate; otherwise the summary printed for
    # the second value would also average the folds of the first one.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Split the data for the current fold
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train the model on the training fold
        dt.fit(X_tr, y_tr)

        # Predictions on training fold
        train_pred = dt.predict(X_tr)
        train_prob = dt.predict_proba(X_tr)[:, 1]

        # Calculate metrics for training fold
        train_accuracy = accuracy_score(y_tr, train_pred)
        train_precision = precision_score(y_tr, train_pred, zero_division=0)
        train_recall = recall_score(y_tr, train_pred, zero_division=0)
        train_f1 = f1_score(y_tr, train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_tr, train_prob)

        # Store training metrics
        train_results['accuracy'].append(train_accuracy)
        train_results['precision'].append(train_precision)
        train_results['recall'].append(train_recall)
        train_results['f1'].append(train_f1)
        train_results['roc_auc'].append(train_roc_auc)

        # Predictions on validation fold
        val_pred = dt.predict(X_val)
        val_prob = dt.predict_proba(X_val)[:, 1]

        # Calculate metrics for validation fold
        val_accuracy = accuracy_score(y_val, val_pred)
        val_precision = precision_score(y_val, val_pred, zero_division=0)
        val_recall = recall_score(y_val, val_pred, zero_division=0)
        val_f1 = f1_score(y_val, val_pred, zero_division=0)
        val_roc_auc = roc_auc_score(y_val, val_prob)

        # Store validation metrics (reported under the "Test" label)
        test_results['accuracy'].append(val_accuracy)
        test_results['precision'].append(val_precision)
        test_results['recall'].append(val_recall)
        test_results['f1'].append(val_f1)
        test_results['roc_auc'].append(val_roc_auc)

        # Print metrics for the current fold
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={train_accuracy:.4f}, Precision={train_precision:.4f}, Recall={train_recall:.4f}, F1={train_f1:.4f}, ROC AUC={train_roc_auc:.4f}")
        print(f"  Test Metrics:  Accuracy={val_accuracy:.4f}, Precision={val_precision:.4f}, Recall={val_recall:.4f}, F1={val_f1:.4f}, ROC AUC={val_roc_auc:.4f}")

    # Mean and standard deviation over the folds of this candidate. The frames
    # get their own names so the train_df/test_df datasets loaded at the top of
    # the notebook are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    test_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': test_metrics_df.mean(),
        'Test Std': test_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Max Leaf Nodes

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Candidate values: the 31 integers from 2 to 32 inclusive
max_leaf_nodes = np.linspace(2, 32, 31, endpoint=True, dtype=int)

# Per-metric score lists; one entry is appended per candidate value
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {name: [] for name in metric_names}
test_results = {name: [] for name in metric_names}

# Fit one tree per candidate and score it on the train and test splits
for max_leaf_node in max_leaf_nodes:
    dt = DecisionTreeClassifier(max_leaf_nodes=max_leaf_node, random_state=42)
    dt.fit(X_train, y_train)

    # Predicted labels and column-1 probabilities for both splits
    train_pred = dt.predict(X_train)
    train_prob = dt.predict_proba(X_train)[:, 1]
    test_pred = dt.predict(X_test)
    test_prob = dt.predict_proba(X_test)[:, 1]

    # Same five metrics for each split, stored in the matching results dict
    for score_store, truth, hard_pred, soft_pred in (
        (train_results, y_train, train_pred, train_prob),
        (test_results, y_test, test_pred, test_prob),
    ):
        score_store['accuracy'].append(accuracy_score(truth, hard_pred))
        score_store['precision'].append(precision_score(truth, hard_pred, zero_division=0))
        score_store['recall'].append(recall_score(truth, hard_pred, zero_division=0))
        score_store['f1'].append(f1_score(truth, hard_pred, zero_division=0))
        score_store['roc_auc'].append(roc_auc_score(truth, soft_pred))

# Draw all curves on a single figure
plt.figure(figsize=(12, 8))

# One row per metric: (results key, legend name, train line format, test line format).
# The row order fixes the drawing order and therefore the legend order.
curve_specs = (
    ('roc_auc', 'ROC AUC', 'b', 'r'),
    ('accuracy', 'Accuracy', 'g--', 'y--'),
    ('precision', 'Precision', 'c-.', 'm-.'),
    ('recall', 'Recall', 'k:', 'orange'),
    ('f1', 'F1-Score', 'purple', 'brown'),
)
for metric_key, legend_name, train_fmt, test_fmt in curve_specs:
    plt.plot(max_leaf_nodes, train_results[metric_key], train_fmt, label=f'Train {legend_name}')
    plt.plot(max_leaf_nodes, test_results[metric_key], test_fmt, label=f'Test {legend_name}')

# Axis titles, legend and grid
plt.xlabel('Max Leaf Nodes')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Max Leaf Nodes')
plt.legend(loc='best')
plt.grid()
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Candidate max_leaf_nodes values to compare
max_leaf_nodes_values = [20, 25]

# Metrics recorded for every fold
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate each candidate with the notebook's stratified 10-fold splitter (`kfold`).
# The loop variable has its own name so it does not rebind `max_leaf_nodes`,
# the candidate array the sweep/plot cell above builds.
for max_leaf_nodes_value in max_leaf_nodes_values:
    print(f"\nEvaluating max_leaf_nodes={max_leaf_nodes_value}")
    dt = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes_value, random_state=42)

    # Fresh containers for every candidate; otherwise the summary printed for
    # the second value would also average the folds of the first one.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Split the data for the current fold
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train the model on the training fold
        dt.fit(X_tr, y_tr)

        # Predictions on training fold
        train_pred = dt.predict(X_tr)
        train_prob = dt.predict_proba(X_tr)[:, 1]

        # Calculate metrics for training fold
        train_accuracy = accuracy_score(y_tr, train_pred)
        train_precision = precision_score(y_tr, train_pred, zero_division=0)
        train_recall = recall_score(y_tr, train_pred, zero_division=0)
        train_f1 = f1_score(y_tr, train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_tr, train_prob)

        # Store training metrics
        train_results['accuracy'].append(train_accuracy)
        train_results['precision'].append(train_precision)
        train_results['recall'].append(train_recall)
        train_results['f1'].append(train_f1)
        train_results['roc_auc'].append(train_roc_auc)

        # Predictions on validation fold
        val_pred = dt.predict(X_val)
        val_prob = dt.predict_proba(X_val)[:, 1]

        # Calculate metrics for validation fold
        val_accuracy = accuracy_score(y_val, val_pred)
        val_precision = precision_score(y_val, val_pred, zero_division=0)
        val_recall = recall_score(y_val, val_pred, zero_division=0)
        val_f1 = f1_score(y_val, val_pred, zero_division=0)
        val_roc_auc = roc_auc_score(y_val, val_prob)

        # Store validation metrics (reported under the "Test" label)
        test_results['accuracy'].append(val_accuracy)
        test_results['precision'].append(val_precision)
        test_results['recall'].append(val_recall)
        test_results['f1'].append(val_f1)
        test_results['roc_auc'].append(val_roc_auc)

        # Print metrics for the current fold
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={train_accuracy:.4f}, Precision={train_precision:.4f}, Recall={train_recall:.4f}, F1={train_f1:.4f}, ROC AUC={train_roc_auc:.4f}")
        print(f"  Test Metrics:  Accuracy={val_accuracy:.4f}, Precision={val_precision:.4f}, Recall={val_recall:.4f}, F1={val_f1:.4f}, ROC AUC={val_roc_auc:.4f}")

    # Mean and standard deviation over the folds of this candidate. The frames
    # get their own names so the train_df/test_df datasets loaded at the top of
    # the notebook are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    test_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': test_metrics_df.mean(),
        'Test Std': test_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Criterion

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Split criteria to compare
criteria = ['gini', 'entropy']

# Per-metric score lists; one entry is appended per criterion
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {name: [] for name in metric_names}
test_results = {name: [] for name in metric_names}

# Fit one tree per criterion and score it on the train and test splits
for criterion in criteria:
    dt = DecisionTreeClassifier(criterion=criterion, random_state=42)
    dt.fit(X_train, y_train)

    # Predicted labels and column-1 probabilities for both splits
    train_pred = dt.predict(X_train)
    train_prob = dt.predict_proba(X_train)[:, 1]
    test_pred = dt.predict(X_test)
    test_prob = dt.predict_proba(X_test)[:, 1]

    # Same five metrics for each split, stored in the matching results dict
    for score_store, truth, hard_pred, soft_pred in (
        (train_results, y_train, train_pred, train_prob),
        (test_results, y_test, test_pred, test_prob),
    ):
        score_store['accuracy'].append(accuracy_score(truth, hard_pred))
        score_store['precision'].append(precision_score(truth, hard_pred, zero_division=0))
        score_store['recall'].append(recall_score(truth, hard_pred, zero_division=0))
        score_store['f1'].append(f1_score(truth, hard_pred, zero_division=0))
        score_store['roc_auc'].append(roc_auc_score(truth, soft_pred))

# Grouped bar chart: one group per criterion, four bars per group
x = np.arange(len(criteria))  # X-axis positions for the criteria

plt.figure(figsize=(12, 8))

# One row per bar series: (offset from the group centre, results dict,
# metric key, legend label, bar colour). Only ROC AUC and accuracy are drawn.
bar_specs = (
    (-0.3, train_results, 'roc_auc', 'Train ROC AUC', 'b'),
    (-0.1, test_results, 'roc_auc', 'Test ROC AUC', 'r'),
    (0.1, train_results, 'accuracy', 'Train Accuracy', 'g'),
    (0.3, test_results, 'accuracy', 'Test Accuracy', 'y'),
)
for offset, score_store, metric_key, legend_name, bar_color in bar_specs:
    plt.bar(x + offset, score_store[metric_key], width=0.2, label=legend_name, color=bar_color)

# Tick labels, axis titles, legend and horizontal grid lines
plt.xticks(x, criteria)
plt.xlabel('Criterion')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Criterion')
plt.legend(loc='best')
plt.grid(axis='y')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Split criteria to compare
criteria = ['gini', 'entropy']

# Metrics recorded for every fold
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate each criterion with the notebook's stratified 10-fold splitter (`kfold`)
for criterion in criteria:
    print(f"\nEvaluating criterion={criterion}")
    dt = DecisionTreeClassifier(criterion=criterion, random_state=42)

    # Fresh containers for every criterion; otherwise the summary printed for
    # the second criterion would also average the folds of the first one.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Split the data for the current fold
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train the model on the training fold
        dt.fit(X_tr, y_tr)

        # Predictions on training fold
        train_pred = dt.predict(X_tr)
        train_prob = dt.predict_proba(X_tr)[:, 1]

        # Calculate metrics for training fold
        train_results['accuracy'].append(accuracy_score(y_tr, train_pred))
        train_results['precision'].append(precision_score(y_tr, train_pred, zero_division=0))
        train_results['recall'].append(recall_score(y_tr, train_pred, zero_division=0))
        train_results['f1'].append(f1_score(y_tr, train_pred, zero_division=0))
        train_results['roc_auc'].append(roc_auc_score(y_tr, train_prob))

        # Predictions on validation fold
        val_pred = dt.predict(X_val)
        val_prob = dt.predict_proba(X_val)[:, 1]

        # Calculate metrics for validation fold (reported under the "Test" label)
        test_results['accuracy'].append(accuracy_score(y_val, val_pred))
        test_results['precision'].append(precision_score(y_val, val_pred, zero_division=0))
        test_results['recall'].append(recall_score(y_val, val_pred, zero_division=0))
        test_results['f1'].append(f1_score(y_val, val_pred, zero_division=0))
        test_results['roc_auc'].append(roc_auc_score(y_val, val_prob))

        # Print metrics for the current fold (the last entry of each list)
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={train_results['accuracy'][-1]:.4f}, Precision={train_results['precision'][-1]:.4f}, Recall={train_results['recall'][-1]:.4f}, F1={train_results['f1'][-1]:.4f}, ROC AUC={train_results['roc_auc'][-1]:.4f}")
        print(f"  Test Metrics:  Accuracy={test_results['accuracy'][-1]:.4f}, Precision={test_results['precision'][-1]:.4f}, Recall={test_results['recall'][-1]:.4f}, F1={test_results['f1'][-1]:.4f}, ROC AUC={test_results['roc_auc'][-1]:.4f}")

    # Mean and standard deviation over the folds of this criterion. The frames
    # get their own names so the train_df/test_df datasets loaded at the top of
    # the notebook are not overwritten.
    train_metrics_df = pd.DataFrame(train_results)
    test_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': test_metrics_df.mean(),
        'Test Std': test_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


#### Applying with training data

In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier


# Final decision tree, fitted on the full labelled data (X, y)
dt_model = DecisionTreeClassifier(max_depth=14,
                                  min_samples_split=0.4,
                                  min_samples_leaf=0.2,
                                  max_features=10,
                                  max_leaf_nodes=25,
                                  criterion='entropy',
                                  # scikit-learn permutes the features at every split, so fix
                                  # the seed (as the tuning cells do) to keep the tree reproducible
                                  random_state=42
                                  )
dt_model.fit(X, y)

# Predictions on the same data the model was fitted on
train_pred = dt_model.predict(X)
train_prob = dt_model.predict_proba(X)[:, 1]

# Classification report for the training data. It must be given train_pred:
# test_pred is a leftover from an earlier cell and holds predictions for
# X_test, which do not line up with y.
print(classification_report(y, train_pred))

#### Applying with test data

In [None]:

from sklearn.model_selection import cross_validate

# The model was fitted on X, i.e. train_df without 'TARGET' and 'SK_ID_CURR'.
# Select exactly those columns, in training order, before predicting; a
# KeyError here means a training feature is missing from test_df.
# NOTE(review): assumes test_df is still the application test set loaded at
# the top of the notebook - confirm no earlier cell has rebound the name.
X_submission = test_df[X.columns]
y_pred = dt_model.predict(X_submission)
y_pred_proba = dt_model.predict_proba(X_submission)[:, 1]

# Evaluate multiple metrics
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# The unlabeled test set cannot be scored, so cross-validate on the training
# split instead, reusing the notebook's shared stratified 10-fold splitter so
# the figures are comparable with the tuning cells.
cv_results = cross_validate(dt_model, X_train, y_train, cv=kfold, scoring=scoring)
# Print the per-fold scores and their mean for each metric
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize()} scores: {scores}")
    print(f"Mean {metric}: {scores.mean():.4f}")

### 6.  Back Propagation Neural Network (BPNN)