<a href="https://www.kaggle.com/code/funxexcel/completed-p4-dt-hyperparameter-grid?scriptVersionId=108305189" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Part 1 : Decision Trees - Hyperparameter Optimization (Grid Search)
- Why? Decision Trees are known to overfit, so we tune their hyperparameters to reduce overfitting.

## Import Libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

## Import Data

In [2]:
import os

# Print the full path of every file under the Kaggle input directory
# so the dataset location can be copied from the cell output.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv
/kaggle/input/credit-card-approval-clean-data/crx.csv


In [3]:
# Load the cleaned credit-card approval dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv')

### Describe Data

In [4]:
# Inspect column dtypes and non-null counts. The object-dtype columns are
# one-hot encoded with pd.get_dummies further down.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


## Partition Data into X and y

In [5]:
# Split the frame into predictors (X) and the 'Approved' target (y).
X = data.drop(columns='Approved')
y = data['Approved']

## Create Dummy Variables

In [6]:
# One-hot encode the object columns (Industry, Ethnicity, Citizen);
# the numeric columns are kept as they are.
X = pd.get_dummies(X)
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industry_Transport,Industry_Utilities,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White,Citizen_ByBirth,Citizen_ByOtherMeans,Citizen_Temporary
0,1,30.83,0.0,1,1,1.25,1,1,1,0,...,0,0,0,0,0,0,1,1,0,0
1,0,58.67,4.46,1,1,3.04,1,1,6,0,...,0,0,0,1,0,0,0,1,0,0
2,0,24.5,0.5,1,1,1.5,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,1,27.83,1.54,1,1,3.75,1,1,5,1,...,0,0,0,0,0,0,1,1,0,0
4,1,20.17,5.625,1,1,1.71,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


## Train Test Split

In [7]:
# Hold out 33% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Decision Tree with Hyperparameter

### 1/ Instantiate a Model

In [8]:
# Baseline tree: only random_state is set, so every other hyperparameter
# stays at its scikit-learn default.
dt_model = DecisionTreeClassifier(random_state = 42)

In [9]:
dt_model.fit(X_train, y_train)
dt_model.max_features_

# Note: max_features_ is the number of features considered when looking for a split.
# No max_features was passed to the constructor, so it is presumably just the column
# count of X_train and does not by itself show overfitting; the size of the fitted
# tree (e.g. dt_model.get_depth()) is a more direct indicator.

34

### 2/ Create Parameter Grid
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [10]:
# Without grid search, each candidate setting would need its own hand-built model, e.g.:
#dt_model1 = DecisionTreeClassifier(criterion = 'gini', max_depth = 2, random_state = 42)
#dt_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state = 42)
#dt_model3 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 42)
#dt_model4 = DecisionTreeClassifier(criterion = 'gini', max_depth = 5, random_state = 42)
#dt_model5 = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, random_state = 42)

In [11]:
# Search space: both split criteria crossed with tree depths 3 through 10
# (2 x 8 = 16 candidate combinations).
dt_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 11)),
)

### 3/ Create a GridSearchCV instance
- model
- parameter grid
- scoring

In [12]:
# Score every parameter combination by cross-validated accuracy on the data passed to fit().
# cv is not set here, so GridSearchCV's default splitter is used
# (5-fold in recent scikit-learn — confirm for the installed version).
gs_dt = GridSearchCV(dt_model, param_grid = dt_param_grid, scoring = 'accuracy')

### 4/ Fit Grid Search to get Best Estimators

In [13]:
# Run the search on the training split only; the test split is kept for the final evaluation.
gs_dt.fit(X_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='accuracy')

### 5/ Store Best Estimator

In [14]:
# Display the configuration the search selected.
gs_dt.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)

In [15]:
# Keep a handle on the selected model for the prediction and evaluation steps below.
best_dt_estimates = gs_dt.best_estimator_

### 6/ Fit Model with Best Estimator

In [16]:
# NOTE(review): refit is not overridden when GridSearchCV is created, so best_estimator_
# should already be fitted on X_train by the search (refit defaults to True per the
# scikit-learn docs). This call repeats that fit; with random_state fixed it is expected
# to produce the same tree.
best_dt_estimates.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)

### 7/ Make predictions

In [17]:
# Predict on both splits so train and test scores can be compared side by side.
y_train_predicted, y_test_predicted = (
    best_dt_estimates.predict(features) for features in (X_train, X_test)
)

### 8/ Check Performance

#### Check train performance

In [18]:
# Training-set metrics; compare against the test-set report to gauge overfitting.
print(classification_report(y_train, y_train_predicted))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90       258
           1       0.90      0.84      0.87       204

    accuracy                           0.89       462
   macro avg       0.89      0.88      0.89       462
weighted avg       0.89      0.89      0.89       462



#### Check test performance

In [19]:
# Held-out test metrics; the gap to the training report shows how well the tuned tree generalizes.
print(classification_report(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       125
           1       0.88      0.79      0.83       103

    accuracy                           0.86       228
   macro avg       0.86      0.85      0.85       228
weighted avg       0.86      0.86      0.85       228



# Did we improve the performance?