# The Decision Tree on the Churn Dataset with Cross Validation

In [None]:
from sys import path
import pandas as pd
from IPython.display import display, HTML
from sklearn import preprocessing
from sklearn.tree import export_text, DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import cross_validate, train_test_split, cross_val_score, GridSearchCV

# Put the parent directory on the module search path.
# NOTE(review): none of the visible cells import from it — presumably kept for
# project-local modules; confirm it is still needed.
path.append("..")

## Read the churn file 

In [None]:
# Load the churn data; the file uses ";" as its field separator.
inputFile = "../data/churn.csv"
df = pd.read_csv(inputFile, delimiter=";")
display(df)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

## Data Preparation
### Transform labels into index

In [None]:
# Separate the predictors from the target column "LEAVE".
df_features = df.drop(columns="LEAVE")
df_labels = df[["LEAVE"]].copy()
display(df_features)
display(df_labels)

# Derive the column groups from the feature frame only, so the label can never
# end up in either list whatever its dtype is. Every numeric dtype (not only
# int64) is scaled; all remaining columns are ordinal-encoded.
num_attributes = df_features.select_dtypes(include="number").columns.tolist()
cat_attributes = df_features.select_dtypes(exclude="number").columns.tolist()
print(num_attributes)
print(cat_attributes)

# Separate encoder instances: one for the categorical features, one for the label.
cat_encoder = OrdinalEncoder().set_output(transform="pandas")
label_encoder = OrdinalEncoder().set_output(transform="pandas")
transform_pipeline = ColumnTransformer(
    [
        ("num", StandardScaler(), num_attributes),
        ("cat", cat_encoder, cat_attributes),
    ]
).set_output(transform="pandas")

# NOTE(review): both transformers are fitted on the complete dataset, i.e.
# before the train/test split performed later, so the fitted statistics also
# include rows that end up in the test set. Consider fitting on the training
# portion only if this preprocessing is reused with a scale-sensitive model.
df_features_prepared = transform_pipeline.fit_transform(df_features)
display(df_features_prepared)
df_labels_prepared = label_encoder.fit_transform(df_labels)
display(df_labels_prepared)

### Splitting the dataset into train and test set

In [None]:
# Hold out 40 % of the rows as the test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prepared,
    df_labels_prepared,
    test_size=0.4,
    random_state=1234,
)
# Show the four resulting frames in the same order they were returned.
for split_part in (X_train, X_test, y_train, y_test):
    display(split_part)

## Build the decision tree model

In [None]:
# Base estimator shared by the cross-validation and the grid searches.
# Seeding it (with the same seed used for the train/test split) makes the
# fitted trees, and therefore all reported scores, reproducible between runs.
dt = DecisionTreeClassifier(random_state=1234)

## Cross Validation 

In [None]:
# Plain 5-fold cross-validation of the untuned tree on the training split,
# keeping the per-fold training scores alongside the validation scores.
cv_results = cross_validate(
    dt,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
print("Cross-validation results:", cv_results)

# Convert each fold's validation score into an error rate (1 - score).
test_errors = 1 - cv_results["test_score"]
print("Test errors for each fold:", test_errors)
print("Mean test error:", test_errors.mean())

## Nested Cross Validation
### Build a hyperparameter grid

In [None]:
# Compact search space: 2 criteria x 3 depths x 3 leaf sizes.
param_grid = [
    {
        "criterion": ["entropy", "gini"],
        "max_depth": [5, 10, None],
        "min_samples_leaf": [1, 5, 10],
    }
]

# Wider search space over the same three hyperparameters.
param_grid_extended = [
    {
        "criterion": ["entropy", "gini"],
        "max_depth": [3, 5, 10, None, 15, 20],
        "min_samples_leaf": [1, 2, 5, 10, 15, 20, 25],
    }
]

### Hyperparameter search

In [None]:
# Settings shared by both searches: 5-fold CV, ranked by accuracy, with the
# training scores recorded as well.
search_options = dict(cv=5, scoring="accuracy", return_train_score=True)

# Run the compact search first, then the extended one, both on the training split.
grid_search = GridSearchCV(dt, param_grid, **search_options)
grid_search.fit(X_train, y_train)

grid_search_extended = GridSearchCV(dt, param_grid_extended, **search_options)
grid_search_extended.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
print("Best hyperparameters:", grid_search.best_params_)
print("Best hyperparameters (extended):", grid_search_extended.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best cross-validation accuracy (extended):", grid_search_extended.best_score_)

# Keep the best estimator of each search for the evaluation on the test set.
best_dt, best_dt_extended = (
    grid_search.best_estimator_,
    grid_search_extended.best_estimator_,
)

### Results of the hyperparameter search

## Test the model 

In [None]:
# Predict the held-out test rows with the best tree from each search.
y_pred = best_dt.predict(X_test)
y_pred_extended = best_dt_extended.predict(X_test)

# Accuracy of both tuned trees on the test set.
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy_extended = accuracy_score(y_test, y_pred_extended)
print("Test set accuracy with best hyperparameters:", test_accuracy)
print("Test set accuracy with best hyperparameters (extended):", test_accuracy_extended)

# Dump both trees as text, labelling the splits with the training column names.
tree_feature_names = list(X_train.columns)
tree_text = export_text(best_dt, feature_names=tree_feature_names)
print("Decision Tree structure:\n", tree_text)
tree_text_extended = export_text(best_dt_extended, feature_names=tree_feature_names)
print("Decision Tree structure (extended):\n", tree_text_extended)