# Code 8
- Random Forest HYperparameter Optimization
- Train & Validation

## Import Libraries

In [2]:
import pandas as pd

## Import Data

In [3]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [4]:
train['Loan_Status'] = train['Loan_Status'].apply(lambda x: 1 if x == 'Y' else 0) 

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will do them in bulk

In [5]:
# Identify all Numerical features
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

In [6]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [7]:
num_impute = KNNImputer(n_neighbors=2, weights="uniform")

In [8]:
# Fit for Impute 
num_impute.fit(train[numerical_features])

KNNImputer(n_neighbors=2)

In [9]:
# Impute Train
train[numerical_features]  = num_impute.transform(train[numerical_features])

In [10]:
# Impute Test
test[numerical_features]  = num_impute.transform(test[numerical_features])

### 1.2 Feature Transformation

In [11]:
num_scaler = MaxAbsScaler()

In [12]:
# Fit for Making them Normal
num_scaler.fit(train[numerical_features])

MaxAbsScaler()

In [13]:
# Transform Train
train[numerical_features]  = num_scaler.transform(train[numerical_features])

In [14]:
# Transform Train
test[numerical_features]  = num_scaler.transform(test[numerical_features])

## 2/ Impute Categorical Variables
- Lets do the same for Categorical Variables

In [15]:
# Identify all Categorical Features
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

In [16]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [17]:
cat_impute = SimpleImputer(strategy='most_frequent')

In [18]:
# Fit for Impute 
cat_impute.fit(train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [19]:
# Impute Train
train[categorical_features]  = cat_impute.transform(train[categorical_features])

In [20]:
# Impute Test
test[categorical_features]  = cat_impute.transform(test[categorical_features])

### 2.2 Categorical Encoders

In [21]:
import category_encoders as ce

In [22]:
cat_encoder = ce.TargetEncoder()

In [23]:
cat_encoder.fit(train[categorical_features], train['Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [24]:
cat_encoded = cat_encoder.transform(train[categorical_features])

In [25]:
train[categorical_features] = cat_encoded

### Now lets do the same for test

In [26]:
test_cat_encoded = cat_encoder.transform(test[categorical_features])

In [27]:
test[categorical_features] = test_cat_encoded

## Prepare Data for Model Model

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X = train.drop(['Loan_Status','Loan_ID'], axis =  1)
y = train['Loan_Status']

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, stratify = y, random_state = 5)

## Build Model

In [31]:
#Start by importing library
from sklearn.ensemble import RandomForestClassifier

In [32]:
#Initialize
clf = RandomForestClassifier()

### Find Best Model using GridSearchCV

In [33]:
from sklearn.model_selection import GridSearchCV 

In [34]:
# Hyperparameter 
parameter = {'n_estimators' : (5, 10, 15, 20, 25, 30)
              , 'max_depth' : (2,3,4)
              , 'criterion' : ('gini', 'entropy')
              , 'max_depth' : (3,5,7,9,10)
              , 'max_features' : ('auto', 'sqrt', 'log2')
              , 'min_samples_split' : (2,4,6)
              #, 'min_impurity_decrease' : 0.3
}

In [35]:
best_clf = GridSearchCV(clf, parameter,cv = 5, n_jobs = -1, verbose = 3)

In [36]:
model = best_clf.fit(X_train, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 2685 out of 2700 | elapsed:   34.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:   34.7s finished


In [37]:
best_clf.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=3, max_features='sqrt',
                       n_estimators=5)

In [41]:
best_clf.best_estimator_.feature_importances_

array([0.00404749, 0.0025236 , 0.08068238, 0.04993604, 0.        ,
       0.08109426, 0.12213609, 0.07007792, 0.04023254, 0.41721992,
       0.13204976])

In [43]:
imp_feat=pd.Series(best_clf.best_estimator_.feature_importances_,index=X_train.columns.tolist())

In [44]:
imp_feat.sort_values(ascending=False)

Credit_History       0.417220
Property_Area        0.132050
CoapplicantIncome    0.122136
ApplicantIncome      0.081094
Dependents           0.080682
LoanAmount           0.070078
Education            0.049936
Loan_Amount_Term     0.040233
Gender               0.004047
Married              0.002524
Self_Employed        0.000000
dtype: float64

In [45]:
print(f" Train Accuracy : {model.score(X_train, y_train):0.1%}")
print(f" Validation Accuracy : {model.score(X_val, y_val):0.1%}")

 Train Accuracy : 81.9%
 Validation Accuracy : 80.5%


## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [46]:
X_test = test.drop(['Loan_ID'], axis =  1)

In [47]:
# We will use (Y/N) to match it to the Competition Submission Data
X_test_prep = model.predict(X_test)

## Create Submission File

In [48]:
submission = pd.DataFrame({
    'Loan_ID' : test['Loan_ID'],
    'Loan_Status' : X_test_prep
})

In [49]:
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x == 1 else 0) 

## Export Submission File

In [50]:
submission.to_csv('output/O8_RF_P1.csv', index = False)

In [None]:
# LB Accuracy : TBD