# Code 9
- Random Forest
- Hyperparameter Optimization
- GridsearchCV
- Best Features
- **Feature Engineering**

## 1/ Import Libraries

In [1]:
#CodeSection1
from google.colab import drive
drive.mount('/mntDrive') 

Mounted at /mntDrive


In [2]:
#CodeSection2
import pandas as pd
import numpy as np

## 2/ Import Data

In [3]:
#CodeSection3
train = pd.read_csv('/mntDrive/MyDrive/Project MasterMind/1 - Planning Stage/2- Research/6- New Teaching Method/Loan Prediction (Webinar Flow)/Solution Codes/input/train.csv')
test = pd.read_csv('/mntDrive/MyDrive/Project MasterMind/1 - Planning Stage/2- Research/6- New Teaching Method/Loan Prediction (Webinar Flow)/Solution Codes/input/test.csv')

## 3/ Create 2 New Features


In [4]:
#CodeSection4
all_data = [train, test]

In [9]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Loan_by_Income
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0.021015
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0.022
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0.024287
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0.0235


### 3.1 Create First Feature

In [6]:
#CodeSection5
# Total Income is Sum of Applicant Income and Co-applicant Income
for dataset in all_data:
  dataset['Total_Income'] = dataset['ApplicantIncome'] + dataset['CoapplicantIncome']

### 3.2 Create Second Feature

In [8]:
#CodeSection6
# Loan by Income is Loan Amount divided by Total Income
for dataset in all_data:
  dataset['Loan_by_Income'] = dataset['LoanAmount']/dataset['Total_Income']

## 4/ Preprocessing Improvement

### 4.1/ Identify Numerical and Categorical Features

In [10]:
#CodeSection7
# Identify all Numerical and Categorical features
numerical_features = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Total_Income', 'Loan_by_Income']
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

### 4.2/ Outlier Strategy

In [11]:
#CodeSection8

# Write a loop to do the same
for num_var in numerical_features:
  Q1 = train[num_var].quantile(0.25)
  Q3 = train[num_var].quantile(0.75)

  IQR = Q3-Q1

  Lower_Whisker = Q1 - 1.5*IQR
  Upper_Whisker = Q3 + 1.5*IQR

  train[num_var] = train[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)
  test[num_var] = test[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)

### 4.3/ Import Libraries
- Missing Value Imputation - SimpleImputer
- Preprocessing - StandardScaler, OrdinalEncoder
- Pipeline - make_pipeline, make_column_transformer
- Model - Decision Tree

In [12]:
#CodeSection9

# Import SimpleImputer
from sklearn.impute import SimpleImputer

# Import StandardScaler
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Import Decision tree
from sklearn.ensemble import RandomForestClassifier

### 4.4/ Build Pipeline

In [13]:
#CodeSection10
# Create Preprocessor Pipeline
preprocessor = make_column_transformer(
    
    (make_pipeline(
    SimpleImputer(strategy = 'median'), 
    StandardScaler()), numerical_features),
    
    (make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    OrdinalEncoder(categories = 'auto')), categorical_features),
)

### 4.5/ Divide Data into X and y

In [14]:
#CodeSection11
X = train.drop(['Loan_Status','Loan_ID','ApplicantIncome', 'CoapplicantIncome'], axis =  1)
y = train['Loan_Status']

### 4.6/ Create Train and Validation Data

In [15]:
#CodeSection12
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 5) # also see stratify

## 5/ Build Model and Fit

In [16]:
#CodeSection13
# Create Parameter Grid
parameter = {'randomforestclassifier__n_estimators' : (22, 23)
              , 'randomforestclassifier__max_depth' : (2,3,4,5,6,7)
              , 'randomforestclassifier__criterion' : ('gini', 'entropy')
              , 'randomforestclassifier__max_features' : ('auto', 'sqrt', 'log2')
}

In [17]:
#CodeSection14
# Create Model Pipeline and Initiate Model
# Change max_depth to find which one gives the best accuracy
model = make_pipeline(preprocessor, RandomForestClassifier())

In [18]:
#CodeSection15
# Instead of Fit we so Grid Search
from sklearn.model_selection import GridSearchCV
model_search = GridSearchCV(model, param_grid = parameter, verbose = True, n_jobs = -1)

In [19]:
#CodeSection16
# Fit Gridsearch
model_search.fit(X_train,y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   20.4s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
               

In [20]:
#CodeSection17
# Get best estimator from the Girdsearch
model_search.best_estimator_.named_steps.randomforestclassifier

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=22,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## 6/ Check Best Features

In [21]:
#CodeSection18
# Get Feature Importance Score
feat_imp = model_search.best_estimator_.named_steps.randomforestclassifier.feature_importances_
feat_imp

array([0.11143295, 0.03143405, 0.4492852 , 0.1027587 , 0.16287235,
       0.01178928, 0.01649178, 0.03703063, 0.01628811, 0.01594482,
       0.04467213])

In [22]:
#CodeSection19
# Convert to Series with Feature Names
imp_feat=pd.Series(feat_imp,index=X_train.columns.tolist())

In [23]:
#CodeSection20
# List 
imp_feat.sort_values(ascending=False) # You also plot the same

Dependents          0.449285
Self_Employed       0.162872
Gender              0.111433
Education           0.102759
Loan_by_Income      0.044672
Credit_History      0.037031
Married             0.031434
Loan_Amount_Term    0.016492
Property_Area       0.016288
Total_Income        0.015945
LoanAmount          0.011789
dtype: float64

## 6/ Check Accuracy of Model on Train Data

In [24]:
#CodeSection21
# Predict on Train Data
y_train_pred = model_search.predict(X_train)
y_val_pred = model_search.predict(X_val)

### We can use a Accuracy Function from Metrics
- Check Train Accuracy

In [25]:
#CodeSection22
# Import metrics library
from sklearn.metrics import accuracy_score

In [26]:
#CodeSection23
# Print Train Accuracy
print(f" Train Accuracy : {accuracy_score(y_train, y_train_pred):0.1%}")
print(f" Validation Accuracy : {accuracy_score(y_val, y_val_pred):0.1%}")

 Train Accuracy : 86.8%
 Validation Accuracy : 77.2%


## 7/ Predict and Submission

### Predict on "Test Data"

In [27]:
#CodeSection24
# Get all the X Variables from the Test Dataset
X_test = test.drop(['Loan_ID'], axis =  1)

# Predict on X_test Data ("X_test_prep")
X_test_prep = model_search.predict(X_test)



### Create Submission File

In [28]:
#CodeSection25
submission = pd.DataFrame({
    'Loan_ID' : test['Loan_ID'],
    'Loan_Status' : X_test_prep
})

### Export Submission File

In [29]:
#CodeSection26
submission.to_csv('/mntDrive/MyDrive/Project MasterMind/1 - Planning Stage/2- Research/6- New Teaching Method/Loan Prediction (Webinar Flow)/Solution Codes/output/O9_Feature_Engineering.csv', index = False)

In [None]:
# LB Accuracy : 0.7916 (Validation Accuracy : 81.3%)