## Load Data and dependencies

In [3]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier



In [41]:
# read the training CSV; its first column becomes the row index
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists
train_csv_path = r"C:\Users\Morinyo Baddestman\Documents\playground-series-s4e10\train.csv"
train = pd.read_csv(train_csv_path, index_col=0)

In [42]:
# preview the first rows to sanity-check the load (column names, index, value formats)
train.head()

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [43]:
# check shape: (number of rows, number of columns) of the loaded frame
print('train dataset shape:', train.shape)

train dataset shape: (58645, 12)


## Data cleaning 
    - drop duplicate rows from the dataset first
    - check the data types and correct any errors in their representation
    - check for null values and impute them using a mean/median strategy where appropriate
    - check for outliers in the dataset 
        draw boxplots of the numerical features to identify features with extreme outliers
        clamp the outliers

In [44]:
# drop fully duplicated rows, then print the shape again so it can be
# compared with the shape printed right after loading
train = train.drop_duplicates()
print('train dataset shape:', train.shape)

train dataset shape: (58645, 12)


In [45]:
# list every column with its dtype and non-null count, to spot wrongly
# typed columns and missing values in one view
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  object 
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  object 
 5   loan_grade                  58645 non-null  object 
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  object 
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 5.8+ MB


In [46]:
# look at the first ten rows to eyeball the values behind the dtypes reported above
train.head(10)

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0
5,27,45000,RENT,2.0,VENTURE,A,9000,8.94,0.2,N,5,0
6,25,45000,MORTGAGE,9.0,EDUCATION,A,12000,6.54,0.27,N,3,0
7,21,20000,RENT,0.0,PERSONAL,C,2500,13.49,0.13,Y,3,0
8,37,69600,RENT,11.0,EDUCATION,D,5000,14.84,0.07,Y,11,0
9,35,110000,MORTGAGE,0.0,DEBTCONSOLIDATION,C,15000,12.98,0.14,Y,6,0


In [None]:
# box plot for features 

In [None]:
# clamp outliers to the IQR fences (Tukey's rule): extreme values are clipped, not dropped
def clamp_outliers(data, factor=1.5, exclude=()):
    """Return a copy of ``data`` with numeric outliers clipped to the IQR fences.

    For every numeric column the first and third quartiles (q1, q3) are
    computed and values are clipped to the interval
    ``[q1 - factor * iqr, q3 + factor * iqr]`` where ``iqr = q3 - q1``.
    Non-numeric columns are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; a clipped copy is returned.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences.
    exclude : str or iterable of column labels, default ()
        Columns that must never be clipped (for example a target column
        such as ``'loan_status'``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the selected numeric columns clipped.
    """
    # a bare string would otherwise be split into single characters by set()
    if isinstance(exclude, str):
        exclude = [exclude]
    skipped = set(exclude)

    clamped = data.copy()
    # quantiles are only defined for numeric data, so object columns are left alone
    for col in clamped.select_dtypes(include="number").columns:
        if col in skipped:
            continue
        q1 = clamped[col].quantile(0.25)
        q3 = clamped[col].quantile(0.75)
        iqr = q3 - q1
        # a zero (or NaN) IQR puts both fences on the same value, which would
        # flatten the whole column (e.g. a mostly-zero binary flag) — skip it
        if not iqr > 0:
            continue
        clamped[col] = clamped[col].clip(
            lower=q1 - factor * iqr,
            upper=q3 + factor * iqr,
        )
    return clamped


## Exploratory Analysis
    - summary statistics of the dataset
    - value counts distribution for object features
    - distribution of numerical features

In [47]:
# descriptive statistics (count, mean, std, min, quartiles, max), rounded to 2 decimals
train.describe().round(2)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,27.55,64046.17,4.7,9217.56,10.68,0.16,5.81,0.14
std,6.03,37931.11,3.96,5563.81,3.03,0.09,4.03,0.35
min,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


# Feature Engineering

## Model training 

1. RandomForestRegressor
    - check for correlation
    - 

2. RandomForestClassifier