# Decision tree

In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the processed student dataset (path is relative to the working directory).
student_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
df = pd.read_excel(student_file)

# Map dependent variable 'dropped out' to binary (no -> 0, yes -> 1).
# Series.map turns every value that is not a key of the mapping into NaN, so
# labels with different casing or stray whitespace would silently become
# missing targets. Detect labels that were present but failed to map and fail
# loudly here, rather than handing NaN targets to scikit-learn later on.
raw_labels = df['dropped out']
df['dropped out'] = raw_labels.map({'no': 0, 'yes': 1})
unmapped = raw_labels[df['dropped out'].isna() & raw_labels.notna()]
if not unmapped.empty:
    raise ValueError(
        "Unexpected values in 'dropped out' (expected 'no'/'yes'): "
        f"{sorted(unmapped.astype(str).unique())}"
    )

# Define features and target
features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade', 'education_level']
target = 'dropped out'

X = df[features]
y = df[target]

# Column groups handled by the two preprocessing branches below.
numerical_features = [
    'anl1 final grade',
    'anl2 final grade',
    'anl3 final grade',
    'anl4 final grade',
]
categorical_features = ['education_level']

# Numeric branch: replace missing grades with the constant 1, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a decision tree with a fixed seed.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Stratified 5-fold cross-validation; shuffling with a fixed seed keeps the
# fold assignment identical from run to run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to evaluate, given as scikit-learn scorer names.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate every metric in a single cross-validation pass: each fold's model
# is fitted once and scored with all metrics, instead of repeating the whole
# cross-validation once per metric. 'roc_auc' needs no special handling; it
# is requested by name exactly like the other scorers.
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=metrics)

# Collect per-fold scores together with their mean and standard deviation.
results = {}
for metric in metrics:
    scores = cv_results[f'test_{metric}']
    results[metric] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }

# Print results
for metric, values in results.items():
    label = metric.capitalize()
    print(f"\n{label} Scores for each fold: {values['scores']}")
    print(f"Mean {label}: {values['mean']:.4f}")
    print(f"Standard Deviation of {label}: {values['std']:.4f}")



Accuracy Scores for each fold: [0.67164179 0.64179104 0.67164179 0.66666667 0.71212121]
Mean Accuracy: 0.6728
Standard Deviation of Accuracy: 0.0226

Precision Scores for each fold: [0.62857143 0.60606061 0.63636364 0.63636364 0.7       ]
Mean Precision: 0.6415
Standard Deviation of Precision: 0.0313

Recall Scores for each fold: [0.70967742 0.64516129 0.67741935 0.67741935 0.67741935]
Mean Recall: 0.6774
Standard Deviation of Recall: 0.0204

F1 Scores for each fold: [0.66666667 0.625      0.65625    0.65625    0.68852459]
Mean F1: 0.6585
Standard Deviation of F1: 0.0205

Roc_auc Scores for each fold: [0.68100358 0.61648746 0.72132616 0.68248848 0.67695853]
Mean Roc_auc: 0.6757
Standard Deviation of Roc_auc: 0.0337
