# Code 7
- Decision Tree HYperparameter Optimization
- Train & Validation

## Import Libraries

In [3]:
import pandas as pd

## Import Data

In [4]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [5]:
train['Loan_Status'] = train['Loan_Status'].apply(lambda x: 1 if x == 'Y' else 0) 

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will do them in bulk

In [6]:
# Identify all Numerical features
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

In [7]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [8]:
num_impute = KNNImputer(n_neighbors=2, weights="uniform")

In [9]:
# Fit for Impute 
num_impute.fit(train[numerical_features])

KNNImputer(n_neighbors=2)

In [10]:
# Impute Train
train[numerical_features]  = num_impute.transform(train[numerical_features])

In [11]:
# Impute Test
test[numerical_features]  = num_impute.transform(test[numerical_features])

### 1.2 Feature Transformation

In [12]:
num_scaler = MaxAbsScaler()

In [13]:
# Fit for Making them Normal
num_scaler.fit(train[numerical_features])

MaxAbsScaler()

In [14]:
# Transform Train
train[numerical_features]  = num_scaler.transform(train[numerical_features])

In [15]:
# Transform Train
test[numerical_features]  = num_scaler.transform(test[numerical_features])

## 2/ Impute Categorical Variables
- Lets do the same for Categorical Variables

In [16]:
# Identify all Categorical Features
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

In [17]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [18]:
cat_impute = SimpleImputer(strategy='most_frequent')

In [19]:
# Fit for Impute 
cat_impute.fit(train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [20]:
# Impute Train
train[categorical_features]  = cat_impute.transform(train[categorical_features])

In [21]:
# Impute Test
test[categorical_features]  = cat_impute.transform(test[categorical_features])

### 2.2 Categorical Encoders

In [22]:
import category_encoders as ce

In [23]:
cat_encoder = ce.TargetEncoder()

In [24]:
cat_encoder.fit(train[categorical_features], train['Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [25]:
cat_encoded = cat_encoder.transform(train[categorical_features])

In [26]:
train[categorical_features] = cat_encoded

### Now lets do the same for test

In [27]:
test_cat_encoded = cat_encoder.transform(test[categorical_features])

In [28]:
test[categorical_features] = test_cat_encoded

## Prepare Data for Model Model

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X = train.drop(['Loan_Status','Loan_ID'], axis =  1)
y = train['Loan_Status']

In [31]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, stratify = y, random_state = 5)

## Build Model

In [32]:
#Start by importing library
from sklearn.tree import DecisionTreeClassifier

In [34]:
#Initialize
clf = DecisionTreeClassifier()

### Find Best Model using GridSearchCV

In [35]:
from sklearn.model_selection import GridSearchCV 

In [38]:
# Hyperparameter 
parameter = {'max_depth' : (2,3,4,5,6)
}

In [48]:
best_clf = GridSearchCV(clf, parameter,cv = 5, n_jobs = -1, verbose = 3)

In [49]:
model = best_clf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.1s finished


In [46]:
best_clf.best_estimator_

DecisionTreeClassifier(max_depth=2)

In [52]:
best_clf.cv_results_

{'mean_fit_time': array([0.00816221, 0.00593176, 0.00602851, 0.00560884, 0.00560551]),
 'std_fit_time': array([0.00256528, 0.00047573, 0.00068374, 0.00154824, 0.00045084]),
 'mean_score_time': array([0.00556216, 0.00390944, 0.00330315, 0.00338526, 0.0032927 ]),
 'std_score_time': array([0.00141812, 0.00072282, 0.00055025, 0.00212681, 0.00070933]),
 'param_max_depth': masked_array(data=[2, 3, 4, 5, 6],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2},
  {'max_depth': 3},
  {'max_depth': 4},
  {'max_depth': 5},
  {'max_depth': 6}],
 'split0_test_score': array([0.82828283, 0.80808081, 0.81818182, 0.82828283, 0.78787879]),
 'split1_test_score': array([0.80612245, 0.79591837, 0.79591837, 0.76530612, 0.75510204]),
 'split2_test_score': array([0.80612245, 0.79591837, 0.79591837, 0.82653061, 0.78571429]),
 'split3_test_score': array([0.79591837, 0.74489796, 0.76530612, 0.7244898 , 0.7244898 ]),
 'split4_tes

In [53]:
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 81.5%
 Validation Accuracy : 80.5%


## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [54]:
X_test = test.drop(['Loan_ID'], axis =  1)

In [55]:
# We will use (Y/N) to match it to the Competition Submission Data
X_test_prep = model.predict(X_test)

## Create Submission File

In [56]:
submission = pd.DataFrame({
    'Loan_ID' : test['Loan_ID'],
    'Loan_Status' : X_test_prep
})

In [57]:
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x == 1 else 0) 

## Export Submission File

In [49]:
submission.to_csv('output/O7_DT_Validation_P3.csv', index = False)

In [None]:
# LB Accuracy : TBD