# Code 6
- Decision Tree Base

## Import Libraries

In [1]:
import pandas as pd

## Import Data

In [2]:
# Load the training and test splits (train first, then test).
train, test = (pd.read_csv(path) for path in ('input/train.csv', 'input/test.csv'))

In [7]:
# Encode the target as 1 ('Y') / 0 (anything else). Later cells rely on a
# numeric label: the target encoder is fitted against train['Loan_Status'],
# and the submission step maps the prediction 1 back to 'Y'.
# Values that are already 1 stay 1, so re-running this cell is harmless
# (the plain `x == 'Y'` form would turn an already-encoded column into all 0).
train['Loan_Status'] = train['Loan_Status'].apply(lambda x: 1 if x in ('Y', 1) else 0)

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will do them in bulk

In [8]:
# Columns treated as numerical throughout the preprocessing steps below.
# Kept as a list so it can be used directly as a DataFrame column selector.
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [9]:
# Now let's use the proper preprocessing tools from scikit-learn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [10]:
# KNN-based imputer for the numerical columns: 2 neighbours, uniform weights.
num_impute = KNNImputer(n_neighbors=2, weights="uniform")

In [11]:
# Fit the numerical imputer on the training rows only
num_impute.fit(train.loc[:, numerical_features])

KNNImputer(n_neighbors=2)

In [12]:
# Fill missing numerical values in train with the fitted imputer
train[numerical_features] = num_impute.transform(train.loc[:, numerical_features])

In [13]:
# Fill missing numerical values in test, reusing the imputer fitted on train
test[numerical_features] = num_impute.transform(test.loc[:, numerical_features])

### 1.2 Feature Transformation

In [14]:
# Scaler applied to the numerical columns after imputation.
num_scaler = MaxAbsScaler()

In [15]:
# Fit the scaler on the training rows only. MaxAbsScaler rescales each
# feature by its maximum absolute value; it does not make the data
# normally distributed.
num_scaler.fit(train.loc[:, numerical_features])

MaxAbsScaler()

In [16]:
# Scale the numerical columns of train with the fitted scaler
train[numerical_features] = num_scaler.transform(train.loc[:, numerical_features])

In [17]:
# Scale the numerical columns of test, reusing the scaler fitted on train
test[numerical_features] = num_scaler.transform(test.loc[:, numerical_features])

## 2/ Impute Categorical Variables
- Let's do the same for the categorical variables

In [18]:
# Columns treated as categorical throughout the preprocessing steps below.
# Kept as a list so it can be used directly as a DataFrame column selector.
categorical_features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

In [19]:
# Now let's use the proper preprocessing tools from scikit-learn
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [20]:
# Imputer for the categorical columns, using the 'most_frequent' strategy.
cat_impute = SimpleImputer(strategy='most_frequent')

In [21]:
# Fit the categorical imputer on the training rows only
cat_impute.fit(train.loc[:, categorical_features])

SimpleImputer(strategy='most_frequent')

In [22]:
# Fill missing categorical values in train with the fitted imputer
train[categorical_features] = cat_impute.transform(train.loc[:, categorical_features])

In [23]:
# Fill missing categorical values in test, reusing the imputer fitted on train
test[categorical_features] = cat_impute.transform(test.loc[:, categorical_features])

### 2.2 Categorical Encoders

In [24]:
import category_encoders as ce

- encoder = ce.BackwardDifferenceEncoder(cols=[...])
- encoder = ce.BaseNEncoder(cols=[...])
- encoder = ce.BinaryEncoder(cols=[...])
- encoder = ce.CatBoostEncoder(cols=[...])
- encoder = ce.CountEncoder(cols=[...])
- encoder = ce.GLMMEncoder(cols=[...])
- encoder = ce.HashingEncoder(cols=[...])
- encoder = ce.HelmertEncoder(cols=[...])
- encoder = ce.JamesSteinEncoder(cols=[...])
- encoder = ce.LeaveOneOutEncoder(cols=[...])
- encoder = ce.MEstimateEncoder(cols=[...])
- encoder = ce.OneHotEncoder(cols=[...])
- encoder = ce.OrdinalEncoder(cols=[...])
- encoder = ce.SumEncoder(cols=[...])
- encoder = ce.PolynomialEncoder(cols=[...])
- encoder = ce.TargetEncoder(cols=[...])
- encoder = ce.WOEEncoder(cols=[...])

- encoder.fit(X, y)
- X_cleaned = encoder.transform(X_dirty)

In [25]:
# Target encoder for the categorical columns. No `cols` argument is passed;
# the fitted encoder's repr lists all six categorical features.
cat_encoder = ce.TargetEncoder()

In [26]:
# Fit the encoder on the training categoricals against the Loan_Status target
cat_encoder.fit(train.loc[:, categorical_features], train.loc[:, 'Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [27]:
# Encoded version of the training categoricals (assigned back in the next cell)
cat_encoded = cat_encoder.transform(train.loc[:, categorical_features])

In [29]:
# Overwrite the raw categorical columns in train with their encoded values.
train[categorical_features] = cat_encoded

### Now let's do the same for test

In [30]:
# Encode the test categoricals with the encoder fitted on train
test_cat_encoded = cat_encoder.transform(test.loc[:, categorical_features])

In [31]:
# Overwrite the raw categorical columns in test with their encoded values.
test[categorical_features] = test_cat_encoded

## Prepare Data for Model

In [33]:
# Target vector: the Loan_Status column of train
y = train['Loan_Status']
# Feature matrix: every train column except the target and the row identifier
X = train.drop(columns=['Loan_Status','Loan_ID'])

## Build Model

In [35]:
# Start by importing the library
from sklearn.tree import DecisionTreeClassifier

# Initialize the baseline tree with default hyper-parameters.
# A fixed random_state makes the fitted tree, and therefore the exported
# submission file, reproducible from one run of the notebook to the next.
clf = DecisionTreeClassifier(random_state=42)

In [36]:
# Fit the decision tree on all training rows (X and y come from the full
# train frame; no hold-out split is made in the cells shown here).
model = clf.fit(X, y)

## Check Accuracy of Model on Train Data

In [37]:
# Predict on Train Data (the same rows the model was fitted on)
y_pred = model.predict(X)

### We can use the accuracy function from sklearn.metrics

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
# Ground-truth labels for the training rows (the same column used as y).
y_true = train['Loan_Status']

In [40]:
# Report accuracy on the training data, formatted as a percentage with one decimal.
# NOTE(review): this is measured on the rows used for fitting, so it likely
# overstates how the model will do on unseen data.
print(f" Train Accuracy : {accuracy_score(y_true, y_pred):0.1%}")

 Train Accuracy : 100.0%


## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [41]:
# Test features: every test column except the row identifier
X_test = test.drop(columns=['Loan_ID'])

In [42]:
# Save the preprocessed test features (Loan_ID removed) to disk, without the index.
X_test.to_csv('X_test.csv', index = False)

In [43]:
# Predict the label for every test row.
# NOTE: despite its name, X_test_prep holds the model's predictions, not
# features. The competition expects Y/N labels; that conversion is done on
# the submission frame, not here.
X_test_prep = model.predict(X_test)

## Create Submission File

In [44]:
# Load the sample submission and preview its first rows to see the expected layout.
sample_submission = pd.read_csv('input/sample_submission.csv')
sample_submission.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N


In [45]:
# Build the submission frame: the test identifiers plus the predicted label.
submission = test[['Loan_ID']].assign(Loan_Status=X_test_prep)

In [48]:
# Convert the predictions to the competition's 'Y'/'N' labels.
# Both label encodings are accepted: 1 or 'Y' becomes 'Y', anything else
# becomes 'N'. The previous version wrote the integer 0 for the negative
# class and never matched string predictions, so the file did not follow
# the Y/N format of the sample submission.
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x in (1, 'Y') else 'N')

## Export Submission File

In [49]:
# Write the submission file without the DataFrame index.
submission.to_csv('output/O6_DT_Base.csv', index = False)

In [None]:
# LB Accuracy : TBD