# Code 6
- Decision Tree Base
- Train & Validation

## Import Libraries

In [23]:
import pandas as pd

## Import Data

In [24]:
# Load the training and test sets from the local input directory
train, test = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))

In [25]:
# Encode the target as an integer flag: 1 where Loan_Status is 'Y', 0 for any other value
train['Loan_Status'] = (train['Loan_Status'] == 'Y').astype('int64')

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will do them in bulk

In [26]:
# Columns that are imputed and scaled as numerical features
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [27]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [28]:
# KNN imputer for the numerical columns: 2 neighbours, uniformly weighted
num_impute = KNNImputer(
    n_neighbors=2,
    weights="uniform",
)

In [29]:
# Fit the KNN imputer on the numerical columns of the training data.
# NOTE(review): this runs before the scaling step further down, so neighbour distances
# are computed on the unscaled column values — confirm that is intended.
num_impute.fit(train[numerical_features])

KNNImputer(n_neighbors=2)

In [30]:
# Impute Train: overwrite the numerical columns with the imputer's output
train[numerical_features]  = num_impute.transform(train[numerical_features])

In [31]:
# Impute Test: reuse the imputer that was fitted on train (it is not refitted on test)
test[numerical_features]  = num_impute.transform(test[numerical_features])

### 1.2 Feature Transformation

In [32]:
# Scaler applied to the numerical columns after imputation
num_scaler = MaxAbsScaler()

In [33]:
# Fit the scaler on the (already imputed) numerical columns of train.
# MaxAbsScaler rescales each column by its maximum absolute value; it does not make
# the data normally distributed.
num_scaler.fit(train[numerical_features])

MaxAbsScaler()

In [34]:
# Transform Train: replace the numerical columns with their scaled values
train[numerical_features]  = num_scaler.transform(train[numerical_features])

In [35]:
# Transform Test: scale the test columns with the scaler that was fitted on train
test[numerical_features]  = num_scaler.transform(test[numerical_features])

## 2/ Impute Categorical Variables
- Let's do the same for Categorical Variables

In [36]:
# Columns that are imputed and target-encoded as categorical features
categorical_features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

In [37]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [38]:
# Imputer for the categorical columns, using the 'most_frequent' strategy
cat_impute = SimpleImputer(strategy='most_frequent')

In [39]:
# Fit the categorical imputer on the categorical columns of train
cat_impute.fit(train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [40]:
# Impute Train: overwrite the categorical columns with the imputer's output
train[categorical_features]  = cat_impute.transform(train[categorical_features])

In [41]:
# Impute Test: reuse the imputer that was fitted on train (it is not refitted on test)
test[categorical_features]  = cat_impute.transform(test[categorical_features])

### 2.2 Categorical Encoders

In [42]:
import category_encoders as ce

In [43]:
# Target encoder from category_encoders, created with its default settings
cat_encoder = ce.TargetEncoder()

In [44]:
# Fit the target encoder on the categorical columns of train against the 0/1 target.
# NOTE(review): this fit uses every training row, including the rows that are later held
# out by train_test_split, so target information can leak into the validation score —
# consider splitting first and fitting on the training part only.
cat_encoder.fit(train[categorical_features], train['Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [45]:
# Encode the categorical columns of train with the fitted target encoder
cat_encoded = cat_encoder.transform(train[categorical_features])

In [46]:
# Overwrite the categorical columns of train with their encoded values
train[categorical_features] = cat_encoded

### Now let's do the same for test

In [47]:
# Encode the categorical columns of test with the encoder that was fitted on train
test_cat_encoded = cat_encoder.transform(test[categorical_features])

In [48]:
# Overwrite the categorical columns of test with their encoded values
test[categorical_features] = test_cat_encoded

## Prepare Data for Model

In [49]:
from sklearn.model_selection import train_test_split

In [51]:
# Features: every column except the target and the row identifier; target: Loan_Status
X = train.drop(columns=['Loan_Status', 'Loan_ID'])
y = train['Loan_Status']

In [52]:
# Stratified 80/20 split into training and validation parts, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

## Build Model

In [53]:
# Start by importing the library
from sklearn.tree import DecisionTreeClassifier

# Initialize the baseline tree with default hyperparameters.
# random_state is fixed (same seed as the train/validation split) because the tree breaks
# ties between equally good splits at random; without a seed, re-running the notebook can
# give a different tree and different scores.
clf = DecisionTreeClassifier(random_state=5)

In [54]:
# Train the tree on the training part; fit() returns the fitted estimator, which is bound to `model`
model = clf.fit(X_train, y_train)

## Check Accuracy of Model on Train and Validation Data

### We can use Model Score which is faster 

In [68]:
# Report accuracy on the training and validation parts, one line each
print(
    f" Train Accuracy : {model.score(X_train, y_train):0.1%}",
    f" Validation Accuracy : {model.score(X_val, y_val):0.1%}",
    sep="\n",
)

 Train Accuracy : 100.0%
 Validation Accuracy : 65.9%


#### At this stage we need to look at variance and bias to identify the best model
- We also need to tune the hyperparameters of the Decision Tree

## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [69]:
# Feature matrix for the competition test set: everything except the identifier column
X_test = test.drop(columns=['Loan_ID'])

In [70]:
# Predict the loan status for the test features. The model was trained on the 1/0
# encoded target, so these predictions are 1/0 as well; they are converted to the
# competition's label format after the submission frame is built.
X_test_prep = model.predict(X_test)

## Create Submission File

In [72]:
# Submission frame: Loan_ID from the test set alongside the predicted Loan_Status
submission = test[['Loan_ID']].assign(Loan_Status=X_test_prep)

In [73]:
# Map the numeric predictions back to the competition labels: 1 -> 'Y', anything else -> 'N'.
# The fallback must be the string 'N' (not the integer 0) so the column holds only Y/N labels.
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x == 1 else 'N')

## Export Submission File

In [74]:
# Write the submission to disk without the DataFrame index
submission.to_csv(path_or_buf='output/O6_DT_Validation_P1.csv', index=False)

In [None]:
# LB Accuracy : TBD