In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Lift pandas' display limits so cell output shows every row and every column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the raw Bondora CSV (path is relative to the notebook) into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../dataset/Bondora_raw.csv',
    low_memory=False,
)

In [4]:
# sort data features/columns alphabetically (by column label)
data = data.sort_index(axis='columns')

In [None]:
# show data dimensions as (number of rows, number of columns)
print((len(data.index), len(data.columns)))

In [None]:
# show data info (columns' names, data types,...)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appended a stray "None" line.
data.info(verbose=True)

In [None]:
# show first 5 rows of data
# Left as the cell's last expression so Jupyter renders the frame as a table;
# print() would emit line-wrapped plain text, which is hard to read for a frame
# this wide now that display.max_columns is unlimited.
data.head()

In [None]:
# show data description (count, mean, std, min, max,...) for numerical features
# Left as the cell's last expression so Jupyter renders the summary as a table
# instead of the line-wrapped plain text that print() produces.
data.describe()

In [None]:
# show, for each column, how many distinct values it holds
print(data.nunique(axis=0))

In [11]:
# for every column with at most 40 distinct values, print that count, the
# column name, and the distinct values themselves
for column, unique_count in data.nunique().items():
    if unique_count > 40:
        continue
    print(unique_count, ' ', column)
    print(data[column].unique())

9   ActiveLateCategory
[nan '180+' '16-30' '1-7' '31-60' '8-15' '121-150' '91-120' '151-180'
 '61-90']
9   ActiveLateLastPaymentCategory
[nan '180+' '151-180' '31-60' '8-15' '1-7' '91-120' '16-30' '121-150'
 '61-90']
2   ActiveScheduleFirstPaymentReached
[ True False]
24   ApplicationSignedHour
[17 20 12 10 16  9 18 22 11 15  0 13 23 19 14  8 21  7  1  2  6  3  5  4]
7   ApplicationSignedWeekday
[5 4 6 7 1 3 2]
4   Country
['EE' 'FI' 'ES' 'SK']
7   CreditScoreEeMini
[  nan 1000.  700.  800.  600.  900.  500.    0.]
6   CreditScoreEsEquifaxRisk
[nan 'A' 'AA' 'B' 'C' 'AAA' 'D']
11   CreditScoreEsMicroL
[nan 'M3' 'M5' 'M1' 'M9' 'M2' 'M6' 'M4' 'M8' 'M7' 'M10' 'M']
14   CreditScoreFiAsiakasTietoRiskGrade
[nan 'RL2' 'RL1' 'RL4' 'RL3' 'RL0' 'RL5' '2' '1' '3' '4' '5' '6' '7' '8']
7   Education
[ 3.  5.  4.  2.  1.  0. nan -1.]
9   EmploymentDurationCurrentEmployer
['UpTo3Years' 'MoreThan5Years' 'UpTo4Years' 'UpTo2Years' 'UpTo1Year' nan
 'UpTo5Years' 'TrialPeriod' 'Other' 'Retiree']
7   Employm

In [None]:
# show the number of null values in each column
print(data.isna().sum())

In [14]:
# list the columns whose share of null values exceeds 75% (dtype, percentage,
# name), then print how many such columns there are
count = 0
for column in data:
    nulls_percentage = (data[column].isnull().sum() / len(data)) * 100
    if not nulls_percentage > 75:
        continue
    print(data[column].dtype,' ', nulls_percentage,'%',' ' ,column)
    count += 1
print(count)

object   90.91720000892 %   CreditScoreEsEquifaxRisk
object   75.82826007775274 %   CreditScoreFiAsiakasTietoRiskGrade
float64   96.60370626407689 %   EL_V0
float64   90.39463610076638 %   EL_V1
object   86.0342379709951 %   GracePeriodEnd
object   86.0342379709951 %   GracePeriodStart
object   96.60370626407689 %   Rating_V0
object   90.39463610076638 %   Rating_V1
object   81.31034944138439 %   Rating_V2
9


In [13]:
# collect every feature column whose share of null values exceeds 75% ...
dropped_nulls_features = [
    column
    for column in data
    if (data[column].isnull().sum() / len(data)) * 100 > 75
]

# ... and build new_data as a copy of data without those columns
new_data = data.drop(columns=dropped_nulls_features)

In [6]:
# drop features that are unique for each row
dropped_unique_features = ['LoanId', 'LoanNumber']
new_data = new_data.drop(columns=dropped_unique_features)

In [7]:
# drop the feature that has only one value
new_data = new_data.drop(columns='ReportAsOfEOD')

In [8]:
# Bucket the columns of new_data by their dtype.
column_dtypes = new_data.dtypes

# numerical features: float64 or int64 columns
numerical_columns = [
    name for name, kind in column_dtypes.items()
    if kind == 'float64' or kind == 'int64'
]

# categorical features: object columns
categorical_columns = [name for name, kind in column_dtypes.items() if kind == 'object']

# boolean features: bool columns
boolean_columns = [name for name, kind in column_dtypes.items() if kind == 'bool']

In [None]:
# in every numerical column, turn the value -1 into NaN so that it is handled
# as a missing value by the imputation step
for num_column in numerical_columns:
    new_data[num_column] = new_data[num_column].replace(to_replace=-1., value=np.nan)

In [12]:
# handle missing values
# The result of fillna() is assigned back to the column rather than calling
# fillna(inplace=True) on new_data[col]: the in-place form operates on an
# intermediate object (chained assignment), which pandas warns about and which
# leaves new_data unchanged once Copy-on-Write is in effect.

# numerical features: fill with the column mean
for num_column in numerical_columns:
    mean_value = new_data[num_column].mean()
    new_data[num_column] = new_data[num_column].fillna(mean_value)

# categorical features: fill with the most frequent value (first mode)
for cat_column in categorical_columns:
    mode_value = new_data[cat_column].mode().values[0]
    new_data[cat_column] = new_data[cat_column].fillna(mode_value)

# boolean features: fill with the most frequent value (first mode)
for bool_column in boolean_columns:
    mode_value = new_data[bool_column].mode().values[0]
    new_data[bool_column] = new_data[bool_column].fillna(mode_value)

In [None]:
# cast every boolean feature to int in a single astype() call
new_data = new_data.astype({bool_column: int for bool_column in boolean_columns})

In [None]:
# for features whose values contain date + time, keep only the first
# whitespace-separated token (the date part)
date_time_features = [
    'BiddingStartedOn',
    'LoanApplicationStartedDate',
    'ListedOnUTC',
    'StageActiveSince',
]
for col in date_time_features:
    tokens = new_data[col].str.split()
    new_data[col] = tokens.str.get(0)

# number of distinct dates left in each of those features
print(new_data[date_time_features].nunique())

In [None]:
# encode the late-category labels as the integers 1..9, in the order listed
late_buckets = ['1-7', '8-15', '16-30', '31-60', '61-90', '91-120', '121-150', '151-180', '180+']
mapping = {bucket: rank for rank, bucket in enumerate(late_buckets, start=1)}

features = ['ActiveLateCategory', 'ActiveLateLastPaymentCategory', 'WorseLateCategory']
new_data[features] = new_data[features].replace(to_replace=mapping)

# show the values each encoded feature now holds
for col in features:
    print(new_data[col].unique())

In [None]:
# encode the WorkExperience labels as the integers 1..6, in the order listed
experience_levels = [
    'LessThan2Years',
    '2To5Years',
    '5To10Years',
    '10To15Years',
    '15To25Years',
    'MoreThan25Years',
]
mapping = dict(zip(experience_levels, range(1, len(experience_levels) + 1)))

new_data['WorkExperience'] = new_data['WorkExperience'].replace(to_replace=mapping)

# show the values the encoded feature now holds
print(new_data['WorkExperience'].unique())

In [None]:
# re-derive the feature lists from the current dtypes of new_data

# numerical features: float64 or int64 columns
numerical_columns = [
    column for column in new_data
    if new_data[column].dtype == 'float64' or new_data[column].dtype == 'int64'
]

# categorical features: object columns
categorical_columns = [column for column in new_data if new_data[column].dtype == 'object']

In [None]:
def outliers_handling(column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * (Q3 - Q1)`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * (Q3 - Q1)`` by that lower fence; everything else
    is left as it is.

    Parameters
    ----------
    column : pandas.Series
        Numeric values (must provide ``quantile``).

    Returns
    -------
    numpy.ndarray
        The capped values, in the same order as ``column``.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    lower_range = q1 - 1.5 * (q3 - q1)
    upper_range = q3 + 1.5 * (q3 - q1)
    # cap the high side first, then the low side, exactly one pass each
    capped_high = np.where(column > upper_range, upper_range, column)
    return np.where(capped_high < lower_range, lower_range, capped_high)

# Cap outliers in every numerical feature, except the 0/1 flags that were cast
# from the boolean columns. For a two-valued column in which one value covers at
# least 75% of the rows, Q1 == Q3, both IQR fences equal that value, and capping
# would overwrite the whole column with a single constant.
for num_column in numerical_columns:
    if num_column in boolean_columns:
        continue
    new_data[num_column] = outliers_handling(new_data[num_column])

In [None]:
# Fill values that are still missing and print the remaining null count per column.
# fillna() results are assigned back to the column: calling fillna(inplace=True)
# on new_data[col] is chained assignment, which pandas warns about and which
# leaves new_data unchanged once Copy-on-Write is in effect.

# numerical features: fill with the column mean
for num_column in numerical_columns:
    new_data[num_column] = new_data[num_column].fillna(new_data[num_column].mean())
    print(num_column,' ' , new_data[num_column].isnull().sum())

# categorical features: fill with the most frequent value (first mode)
for cat_column in categorical_columns:
    new_data[cat_column] = new_data[cat_column].fillna(new_data[cat_column].mode().values[0])
    print(cat_column,' ' , new_data[cat_column].isnull().sum())

In [None]:
# for every column of new_data with at most 40 distinct values, print that
# count, the column name, and the distinct values themselves
for column, unique_count in new_data.nunique().items():
    if unique_count > 40:
        continue
    print(unique_count, ' ', column)
    print(new_data[column].unique())