# Decision tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import os

# Load the data
student_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
df = pd.read_excel(student_file)

# Map dependent variable 'dropped out' to binary
df['dropped out'] = df['dropped out'].map({'no': 0, 'yes': 1})

# Define features and target
features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade', 'education_level']
target = 'dropped out'

X = df[features]
y = df[target]

# Define preprocessing for numerical and categorical features
numerical_features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade']
categorical_features = ['education_level']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),  # Fill NA values with 1
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the pipeline with Decision Tree
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Define the Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List of metrics to evaluate
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Evaluate each metric using cross-validation
results = {}
for metric in metrics:
    if metric == 'roc_auc':
        # For ROC AUC, we need to use the 'predict_proba' method, so set the pipeline to use probabilities
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc')
    else:
        # For other metrics, use standard class predictions
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring=metric)
        
    results[metric] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }

# Print results
for metric, values in results.items():
    print(f"\n{metric.capitalize()} Scores for each fold: {values['scores']}")
    print(f"Mean {metric.capitalize()}: {values['mean']:.4f}")
    print(f"Standard Deviation of {metric.capitalize()}: {values['std']:.4f}")

# Get cross-validation predictions
y_pred_cv = cross_val_predict(pipeline, X, y, cv=cv)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y, y_pred_cv)

# Display confusion matrix
print("\nConfusion Matrix (Decision Tree - Cross-Validation):")
print(conf_matrix)



Accuracy Scores for each fold: [0.67164179 0.64179104 0.67164179 0.66666667 0.71212121]
Mean Accuracy: 0.6728
Standard Deviation of Accuracy: 0.0226

Precision Scores for each fold: [0.62857143 0.60606061 0.63636364 0.63636364 0.7       ]
Mean Precision: 0.6415
Standard Deviation of Precision: 0.0313

Recall Scores for each fold: [0.70967742 0.64516129 0.67741935 0.67741935 0.67741935]
Mean Recall: 0.6774
Standard Deviation of Recall: 0.0204

F1 Scores for each fold: [0.66666667 0.625      0.65625    0.65625    0.68852459]
Mean F1: 0.6585
Standard Deviation of F1: 0.0205

Roc_auc Scores for each fold: [0.68100358 0.61648746 0.72132616 0.68248848 0.67695853]
Mean Roc_auc: 0.6757
Standard Deviation of Roc_auc: 0.0337

Confusion Matrix (Decision Tree - Cross-Validation):
[[119  59]
 [ 50 105]]


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import os

# Load the data
student_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
df = pd.read_excel(student_file)

# Map dependent variable 'dropped out' to binary
df['dropped out'] = df['dropped out'].map({'no': 0, 'yes': 1})

# Define features and target
features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade', 'education_level']
target = 'dropped out'

X = df[features]
y = df[target]

# Define preprocessing for numerical and categorical features
numerical_features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade']
categorical_features = ['education_level']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),  # Fill NA values with 1
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the pipeline with Decision Tree
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Define the Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for Decision Tree
param_grid = {
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],  # Criteria for splitting
    'classifier__max_depth': [3, 5, 10, None],                # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],              # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],                # Minimum samples required to be at a leaf node
    'classifier__max_features': [None, 'sqrt', 'log2']        # Number of features to consider when splitting
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',  # Use ROC AUC as the primary metric
    cv=cv,
    n_jobs=-1,          # Use all available CPU cores
    verbose=2           # Verbosity level
)

# Perform the grid search
grid_search.fit(X, y)

# Output the best parameters and the best score
print("\nBest Parameters:", grid_search.best_params_)
print("Best ROC AUC Score: {:.4f}".format(grid_search.best_score_))

# Evaluate the best model on additional metrics
best_pipeline = grid_search.best_estimator_
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = {}
for metric in metrics:
    if metric == 'roc_auc':
        scores = cross_val_score(best_pipeline, X, y, cv=cv, scoring='roc_auc')
    else:
        scores = cross_val_score(best_pipeline, X, y, cv=cv, scoring=metric)
        
    results[metric] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }

# Print results for each metric
for metric, values in results.items():
    print(f"\n{metric.capitalize()} Scores for the Best Model: {values['scores']}")
    print(f"Mean {metric.capitalize()}: {values['mean']:.4f}")
    print(f"Standard Deviation of {metric.capitalize()}: {values['std']:.4f}")

# Get cross-validation predictions
y_pred_cv = cross_val_predict(best_pipeline, X, y, cv=cv)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y, y_pred_cv)

# Display confusion matrix
print("\nConfusion Matrix (Decision Tree - Best Model):")
print(conf_matrix)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits

Best Parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 3, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2}
Best ROC AUC Score: 0.7757

Accuracy Scores for the Best Model: [0.73134328 0.68656716 0.74626866 0.84848485 0.8030303 ]
Mean Accuracy: 0.7631
Standard Deviation of Accuracy: 0.0566

Precision Scores for the Best Model: [0.65116279 0.63157895 0.68421053 0.75609756 0.75      ]
Mean Precision: 0.6946
Standard Deviation of Precision: 0.0506

Recall Scores for the Best Model: [0.90322581 0.77419355 0.83870968 1.         0.87096774]
Mean Recall: 0.8774
Standard Deviation of Recall: 0.0747

F1 Scores for the Best Model: [0.75675676 0.69565217 0.75362319 0.86111111 0.80597015]
Mean F1: 0.7746
Standard Deviation of F1: 0.0556

Roc_auc Scores for the Best Model: [0.75044803 0.68145161 0.79793907 0.83041475 0.81843318]
Mean Roc_auc: 0.7757
Standa