## Load Data and dependencies

In [3]:
# import libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier



In [None]:
# load the datasets
# The data directory can be overridden with the LOAN_DATA_DIR environment
# variable so the notebook is not tied to one machine's folder layout; when
# the variable is unset the original location is used.
DATA_DIR = Path(
    os.environ.get(
        "LOAN_DATA_DIR",
        r"C:\Users\Morinyo Baddestman\Documents\playground-series-s4e10",
    )
)
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
# preview the first five rows of the training data
train.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [8]:
# preview the first five rows of the test data
test.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [10]:
# report the (rows, columns) shape of each dataset
for label, frame in (('train dataset shape:', train), ('test dataset shape:', test)):
    print(label, frame.shape)

train dataset shape: (58645, 13)
test dataset shape: (39098, 12)


In [None]:
# collect the column names of both datasets and print them so the two
# schemas can be compared side by side
col_train = np.array(list(train.columns))
col_test = np.array(list(test.columns))

print(col_train)
print(col_test)

['id' 'person_age' 'person_income' 'person_home_ownership'
 'person_emp_length' 'loan_intent' 'loan_grade' 'loan_amnt'
 'loan_int_rate' 'loan_percent_income' 'cb_person_default_on_file'
 'cb_person_cred_hist_length' 'loan_status']
['id' 'person_age' 'person_income' 'person_home_ownership'
 'person_emp_length' 'loan_intent' 'loan_grade' 'loan_amnt'
 'loan_int_rate' 'loan_percent_income' 'cb_person_default_on_file'
 'cb_person_cred_hist_length']


## Data cleaning 

In [15]:
# remove fully duplicated rows from both datasets
# NOTE(review): every column, including `id`, takes part in the comparison,
# so rows that differ only in `id` are kept -- confirm that is intended.
train, test = (frame.drop_duplicates() for frame in (train, test))


In [19]:
# show the dtype of every column in the training data
train.dtypes

id                              int64
person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
loan_status                     int64
dtype: object

In [24]:
# check for null values: .info() prints, per column, the non-null count and
# dtype; a non-null count below the number of entries would indicate
# missing values in that column
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [None]:
# check for and clamp outliers using the IQR rule
# (strategy="clip" limits them to the fences, strategy="mean" replaces them
# with the mean of the non-outlying values)
def clamp_outliers(data, columns=None, factor=1.5, strategy="clip"):
    """Return a copy of ``data`` with outliers in the selected columns tamed.

    For each processed column the quartiles Q1 and Q3 are computed and any
    value outside ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (with
    ``IQR = Q3 - Q1``) is treated as an outlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    columns : iterable of str, optional
        Names of the columns to process. Defaults to every numeric column
        of ``data``.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the lower/upper fences.
    strategy : {"clip", "mean"}, default "clip"
        ``"clip"`` replaces each outlier with the nearest fence value;
        ``"mean"`` replaces each outlier with the mean of the column's
        non-outlying values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the selected columns adjusted.

    Raises
    ------
    ValueError
        If ``strategy`` is not ``"clip"`` or ``"mean"``, or ``factor`` is
        negative.
    """
    if strategy not in ("clip", "mean"):
        raise ValueError(f"strategy must be 'clip' or 'mean', got {strategy!r}")
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    if columns is None:
        # Quantiles are only meaningful for numeric data.
        columns = data.select_dtypes(include="number").columns

    clamped = data.copy()
    for col in columns:
        series = clamped[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        # A zero IQR would place both fences on the same value and collapse
        # the whole column onto it (e.g. a mostly-zero 0/1 flag); a NaN IQR
        # means there was no data to compute quartiles from. Skip both.
        if not iqr > 0:
            continue

        lower = q1 - factor * iqr
        upper = q3 + factor * iqr

        if strategy == "clip":
            clamped[col] = series.clip(lower=lower, upper=upper)
        else:
            inside = series.between(lower, upper)
            # Missing values are left as they are rather than imputed here.
            clamped[col] = series.where(inside | series.isna(), series[inside].mean())

    return clamped


## Exploratory Data Analysis

In [23]:
# descriptive statistics of the numeric columns, rounded to two decimals
train.describe().round(2)

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.55,64046.17,4.7,9217.56,10.68,0.16,5.81,0.14
std,16929.5,6.03,37931.11,3.96,5563.81,3.03,0.09,4.03,0.35
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0
