# Code 7
- Decision Tree Hyperparameter Optimization
- Train & Validation

## Import Libraries

In [1]:
import pandas as pd

## Import Data

In [2]:
# Read the labelled training data and the competition test data from disk.
train, test = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

In [3]:
# Encode the target as an integer flag: 'Y' becomes 1, every other value 0.
train['Loan_Status'] = train['Loan_Status'].map(lambda status: int(status == 'Y'))

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will process them in bulk.

In [4]:
# Columns treated as numeric inputs by the imputation and scaling steps
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [5]:
# Now let's use the proper preprocessing methods
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [6]:
# KNN-based imputer for the numeric columns: 2 neighbours, uniform weights.
num_impute = KNNImputer(weights="uniform", n_neighbors=2)

In [7]:
# Fit the numeric imputer on the training data only; the same fitted
# imputer is then applied to both the train and test frames.
num_impute.fit(train[numerical_features])

KNNImputer(n_neighbors=2)

In [8]:
# Fill missing numeric values in the training data with the fitted imputer
train[numerical_features]  = num_impute.transform(train[numerical_features])

In [9]:
# Fill missing numeric values in the test data, reusing the imputer that
# was fitted on the training data (it is not re-fitted here)
test[numerical_features]  = num_impute.transform(test[numerical_features])

### 1.2 Feature Transformation

In [10]:
# Scaler for the numeric columns. MaxAbsScaler rescales each feature by its
# maximum absolute value; it does not change the shape of the distribution.
num_scaler = MaxAbsScaler()

In [11]:
# Fit the scaler on the (already imputed) numeric training columns so the
# scaling factors come from the training data only
num_scaler.fit(train[numerical_features])

MaxAbsScaler()

In [12]:
# Scale the numeric columns of the training data
train[numerical_features]  = num_scaler.transform(train[numerical_features])

In [13]:
# Scale the numeric columns of the test data with the scaler fitted on train
test[numerical_features]  = num_scaler.transform(test[numerical_features])

## 2/ Impute Categorical Variables
- Let's do the same for the categorical variables

In [14]:
# Columns treated as categorical inputs by the imputation and encoding steps
categorical_features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

In [15]:
# Now let's use the proper preprocessing methods
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [16]:
# Imputer for the categorical columns: missing entries are replaced by the
# most frequent value of the column
cat_impute = SimpleImputer(strategy='most_frequent')

In [17]:
# Fit the categorical imputer on the training data only; the same fitted
# imputer is then applied to both the train and test frames.
cat_impute.fit(train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [18]:
# Fill missing categorical values in the training data
train[categorical_features]  = cat_impute.transform(train[categorical_features])

In [19]:
# Fill missing categorical values in the test data, reusing the imputer
# that was fitted on the training data
test[categorical_features]  = cat_impute.transform(test[categorical_features])

### 2.2 Categorical Encoders

In [20]:
import category_encoders as ce

In [21]:
# Target encoder from category_encoders with default settings; it is fitted
# against Loan_Status in the next cell
cat_encoder = ce.TargetEncoder()

In [22]:
# Fit the encoder on the categorical columns using Loan_Status as the target.
# NOTE(review): this uses every row of `train`, and the train/validation
# split happens later, so the targets of the validation rows contribute to
# the encoding. The validation accuracy reported further down may therefore
# be optimistic; consider fitting after the split.
cat_encoder.fit(train[categorical_features], train['Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [23]:
# Encode the categorical columns of the training data with the fitted encoder
cat_encoded = cat_encoder.transform(train[categorical_features])

In [24]:
# Overwrite the original categorical columns with their encoded values
train[categorical_features] = cat_encoded

### Now let's do the same for test

In [25]:
# Encode the test data with the encoder fitted on train (no re-fitting)
test_cat_encoded = cat_encoder.transform(test[categorical_features])

In [26]:
# Overwrite the original categorical columns of test with their encoded values
test[categorical_features] = test_cat_encoded

## Prepare Data for Model

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Target vector first, then the feature matrix with the target and the
# row identifier removed.
y = train['Loan_Status']
X = train.drop(columns=['Loan_Status', 'Loan_ID'])

In [29]:
# Hold out 20% of the rows for validation, stratified on y so both parts
# keep the same class balance; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
    stratify=y,
)

## Build Model

In [32]:
#Start by importing library
from sklearn.tree import DecisionTreeClassifier

In [39]:
# Decision tree limited to depth 2.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split; without a seed, equally good splits can resolve
# differently between runs, which makes the depth comparison unrepeatable.
clf = DecisionTreeClassifier(max_depth=2, random_state=5)
model = clf.fit(X_train, y_train)
# Accuracy on the rows the tree was fitted on vs. the held-out rows
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 81.5%
 Validation Accuracy : 80.5%


In [40]:
# Decision tree limited to depth 3.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split; without a seed, equally good splits can resolve
# differently between runs, which makes the depth comparison unrepeatable.
clf = DecisionTreeClassifier(max_depth=3, random_state=5)
model = clf.fit(X_train, y_train)
# Accuracy on the rows the tree was fitted on vs. the held-out rows
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 81.9%
 Validation Accuracy : 81.3%


In [41]:
# Decision tree limited to depth 4.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split; without a seed, equally good splits can resolve
# differently between runs, which makes the depth comparison unrepeatable.
clf = DecisionTreeClassifier(max_depth=4, random_state=5)
model = clf.fit(X_train, y_train)
# Accuracy on the rows the tree was fitted on vs. the held-out rows
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 82.3%
 Validation Accuracy : 78.0%


In [42]:
# Decision tree limited to depth 5.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split; without a seed, equally good splits can resolve
# differently between runs, which makes the depth comparison unrepeatable.
clf = DecisionTreeClassifier(max_depth=5, random_state=5)
model = clf.fit(X_train, y_train)
# Accuracy on the rows the tree was fitted on vs. the held-out rows
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 83.7%
 Validation Accuracy : 74.8%


### Make the best Model - max_depth = 3

In [43]:
# Final model: decision tree limited to depth 3. `model` is the fitted
# estimator used for the test-set predictions further down.
# random_state is pinned so this refit is reproducible; without a seed,
# scikit-learn may break ties between equally good splits differently on
# each run and the refitted tree need not match an earlier fit.
clf = DecisionTreeClassifier(max_depth=3, random_state=5)
model = clf.fit(X_train, y_train)
# Accuracy on the rows the tree was fitted on vs. the held-out rows
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 81.9%
 Validation Accuracy : 81.3%


## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [45]:
# Feature matrix for the competition rows: everything except the identifier.
X_test = test.drop(columns=['Loan_ID'])

In [46]:
# Predict the loan status for the test rows. The model was trained on the
# 0/1 encoded target, so these predictions are 0/1 as well; they still need
# to be mapped to the Y/N labels used by the competition before export.
X_test_prep = model.predict(X_test)

## Create Submission File

In [47]:
# Two-column submission frame: the test identifiers alongside the
# model's predictions.
submission = test[['Loan_ID']].assign(Loan_Status=X_test_prep)

In [48]:
# Map the 0/1 predictions back to the Y/N labels used in the training data.
# The negative class must be the string 'N'; the previous version emitted
# the integer 0, leaving a column that mixed 'Y' strings with 0 values.
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x == 1 else 'N')

## Export Submission File

In [49]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='output/O7_DT_Validation_P2.csv', index=False)

In [None]:
# LB Accuracy : TBD