<a href="https://www.kaggle.com/code/funxexcel/completed-p3-dt-train-test-split?scriptVersionId=108305131" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Part 3: Decision Trees - How can we trust our predictions?

## Import Libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

## Import Data

In [2]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv
/kaggle/input/credit-card-approval-clean-data/crx.csv


In [3]:
# Read the cleaned credit-card approval dataset into a DataFrame.
dataset_path = '/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv'
data = pd.read_csv(dataset_path)

### Describe Data

In [4]:
# Preview the first five rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


In [5]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


In [6]:
# List the column names as a plain Python list.
data.columns.tolist()

['Gender',
 'Age',
 'Debt',
 'Married',
 'BankCustomer',
 'Industry',
 'Ethnicity',
 'YearsEmployed',
 'PriorDefault',
 'Employed',
 'CreditScore',
 'DriversLicense',
 'Citizen',
 'ZipCode',
 'Income',
 'Approved']

## Partition Data into X and y

In [7]:
# Separate the label column from the predictors.
target_column = 'Approved'
X = data.drop(columns=[target_column])
y = data[target_column]

In [8]:
# Check y: preview the first 10 values of the target column.
y.head(10)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: Approved, dtype: int64

In [9]:
# Check X: preview the first 10 feature rows.
X.head(10)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0
5,1,32.08,4.0,1,1,CommunicationServices,White,2.5,1,0,0,1,ByBirth,360,0
6,1,33.17,1.04,1,1,Transport,Black,6.5,1,0,0,1,ByBirth,164,31285
7,0,22.92,11.585,1,1,InformationTechnology,White,0.04,1,0,0,0,ByBirth,80,1349
8,1,54.42,0.5,0,0,Financials,Black,3.96,1,0,0,0,ByBirth,180,314
9,1,42.5,4.915,0,0,Industrials,White,3.165,1,0,0,1,ByBirth,52,1442


## Create Dummy Variables

In [10]:
# One-hot encode the text columns (Industry, Ethnicity, Citizen); numeric columns
# pass through unchanged.
# dtype=int pins the indicator columns to 0/1 integers. Without it the dtype depends
# on the pandas version (uint8 before 2.0, bool from 2.0 onward), so the preview
# below would show True/False instead of 0/1 on newer installs.
X = pd.get_dummies(X, dtype=int)
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industry_Transport,Industry_Utilities,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White,Citizen_ByBirth,Citizen_ByOtherMeans,Citizen_Temporary
0,1,30.83,0.0,1,1,1.25,1,1,1,0,...,0,0,0,0,0,0,1,1,0,0
1,0,58.67,4.46,1,1,3.04,1,1,6,0,...,0,0,0,1,0,0,0,1,0,0
2,0,24.5,0.5,1,1,1.5,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,1,27.83,1.54,1,1,3.75,1,1,5,1,...,0,0,0,0,0,0,1,1,0,0
4,1,20.17,5.625,1,1,1.71,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


## Train Test Split

### 1/ import Library
- model_selection - train_test_split
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [11]:
from sklearn.model_selection import train_test_split

### 2/ Split Data into Train and Test Sets

In [12]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Decision Tree

### 1/ Initiate an Instance

In [13]:
# Decision tree with every hyperparameter left at its default; only the seed is pinned.
dt_model = DecisionTreeClassifier(random_state = 42)

### 2/ Fit Model on Train Data

In [14]:
# Fit on the training partition only; the test partition is not passed to the model here.
dt_model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=42)

### 3/ Make Predictions on Train and Test data

In [15]:
# Predict on both partitions (train first, then test) so their scores can be compared.
y_train_predicted, y_test_predicted = (
    dt_model.predict(features) for features in (X_train, X_test)
)

### 4/ Check Performance

#### Check train performance

In [16]:
# Per-class precision, recall and F1 on the training partition.
train_report = classification_report(y_train, y_train_predicted)
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       258
           1       1.00      1.00      1.00       204

    accuracy                           1.00       462
   macro avg       1.00      1.00      1.00       462
weighted avg       1.00      1.00      1.00       462



#### Check test performance

In [17]:
# Per-class precision, recall and F1 on the held-out test partition.
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       125
           1       0.78      0.77      0.77       103

    accuracy                           0.80       228
   macro avg       0.80      0.80      0.80       228
weighted avg       0.80      0.80      0.80       228



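# Is the Performance Good or Bad?

Accuracy is 1.00 on the training data but only 0.80 on the unseen test data; a gap this large suggests the tree has memorised the training rows (overfitting) rather than learned patterns that generalise.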