# Loading Packages and Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# to show the whole output result
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [30]:
# load data
data = pd.read_csv('../dataset/Bondora_raw.csv', low_memory=False)

In [31]:
# sort data features/columns alphabitcally
data = data.sort_index(axis=1)

# Data Exploration

In [5]:
# show data dimensions
print(data.shape)

(134529, 112)


In [6]:
# show data info (columns' names, data types,...)
print(data.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134529 entries, 0 to 134528
Data columns (total 112 columns):
 #    Column                                  Dtype  
---   ------                                  -----  
 0    ActiveLateCategory                      object 
 1    ActiveLateLastPaymentCategory           object 
 2    ActiveScheduleFirstPaymentReached       bool   
 3    Age                                     int64  
 4    Amount                                  float64
 5    AmountOfPreviousLoansBeforeLoan         float64
 6    ApplicationSignedHour                   int64  
 7    ApplicationSignedWeekday                int64  
 8    AppliedAmount                           float64
 9    BiddingStartedOn                        object 
 10   BidsApi                                 int64  
 11   BidsManual                              float64
 12   BidsPortfolioManager                    int64  
 13   City                                    object 
 14   ContractEndDate   

In [7]:
# show first 5 rows of data
print(data.head())

  ActiveLateCategory ActiveLateLastPaymentCategory  \
0                NaN                           NaN   
1                NaN                           NaN   
2               180+                          180+   
3                NaN                           NaN   
4                NaN                          180+   

   ActiveScheduleFirstPaymentReached  Age    Amount  \
0                               True   61  115.0408   
1                               True   48  140.6057   
2                               True   58  319.5409   
3                               True   23   57.5205   
4                               True   25  319.5436   

   AmountOfPreviousLoansBeforeLoan  ApplicationSignedHour  \
0                          83.0852                     17   
1                         255.6467                     20   
2                           0.0000                     20   
3                         134.2144                     12   
4                         146.9966     

In [8]:
# show data description (count, mean, std, min, max,...) for numerical features
print(data.describe())

                 Age         Amount  AmountOfPreviousLoansBeforeLoan  \
count  134529.000000  134529.000000                    134529.000000   
mean       40.819295    2543.872472                      2868.652401   
std        12.348693    2170.128183                      4507.046575   
min         0.000000       6.390000                         0.000000   
25%        31.000000     744.000000                         0.000000   
50%        40.000000    2125.000000                       396.354100   
75%        50.000000    3600.000000                      4250.000000   
max        77.000000   10632.000000                     53762.000000   

       ApplicationSignedHour  ApplicationSignedWeekday  AppliedAmount  \
count          134529.000000             134529.000000  134529.000000   
mean               13.374640                  3.907908    2727.947540   
std                 4.992375                  1.726192    2374.439168   
min                 0.000000                  1.000000     

In [9]:
# show data unique values counts
print(data.nunique())

ActiveLateCategory                             9
ActiveLateLastPaymentCategory                  9
ActiveScheduleFirstPaymentReached              2
Age                                           62
Amount                                      5223
AmountOfPreviousLoansBeforeLoan            12005
ApplicationSignedHour                         24
ApplicationSignedWeekday                       7
AppliedAmount                                593
BiddingStartedOn                          134135
BidsApi                                     1002
BidsManual                                  4308
BidsPortfolioManager                        5730
City                                        7733
ContractEndDate                             3944
Country                                        4
County                                      1010
CreditScoreEeMini                              7
CreditScoreEsEquifaxRisk                       6
CreditScoreEsMicroL                           11
CreditScoreFiAsiakas

In [10]:
# show data unique values if they are less than 50 values
for column in data:
    unique_count = data[column].nunique()
    if(unique_count <= 50):
        print(unique_count, ' ', column)
        print(data[column].unique())

9   ActiveLateCategory
[nan '180+' '16-30' '1-7' '31-60' '8-15' '121-150' '91-120' '151-180'
 '61-90']
9   ActiveLateLastPaymentCategory
[nan '180+' '151-180' '31-60' '8-15' '1-7' '91-120' '16-30' '121-150'
 '61-90']
2   ActiveScheduleFirstPaymentReached
[ True False]
24   ApplicationSignedHour
[17 20 12 10 16  9 18 22 11 15  0 13 23 19 14  8 21  7  1  2  6  3  5  4]
7   ApplicationSignedWeekday
[5 4 6 7 1 3 2]
4   Country
['EE' 'FI' 'ES' 'SK']
7   CreditScoreEeMini
[  nan 1000.  700.  800.  600.  900.  500.    0.]
6   CreditScoreEsEquifaxRisk
[nan 'A' 'AA' 'B' 'C' 'AAA' 'D']
11   CreditScoreEsMicroL
[nan 'M3' 'M5' 'M1' 'M9' 'M2' 'M6' 'M4' 'M8' 'M7' 'M10' 'M']
14   CreditScoreFiAsiakasTietoRiskGrade
[nan 'RL2' 'RL1' 'RL4' 'RL3' 'RL0' 'RL5' '2' '1' '3' '4' '5' '6' '7' '8']
7   Education
[ 3.  5.  4.  2.  1.  0. nan -1.]
9   EmploymentDurationCurrentEmployer
['UpTo3Years' 'MoreThan5Years' 'UpTo4Years' 'UpTo2Years' 'UpTo1Year' nan
 'UpTo5Years' 'TrialPeriod' 'Other' 'Retiree']
7   Employm

In [11]:
# show data null counts
print(data.isnull().sum())

ActiveLateCategory                         86011
ActiveLateLastPaymentCategory              82279
ActiveScheduleFirstPaymentReached              0
Age                                            0
Amount                                         0
AmountOfPreviousLoansBeforeLoan                0
ApplicationSignedHour                          0
ApplicationSignedWeekday                       0
AppliedAmount                                  0
BiddingStartedOn                               0
BidsApi                                        0
BidsManual                                     0
BidsPortfolioManager                           0
City                                        9794
ContractEndDate                            75546
Country                                        0
County                                     36840
CreditScoreEeMini                          62807
CreditScoreEsEquifaxRisk                  122310
CreditScoreEsMicroL                        29574
CreditScoreFiAsiakas

In [12]:
# show data null counts with percentage more than 75%
count=0 
for column in data:
    nulls_percentage = (data[column].isnull().sum()/data.shape[0])*100
    if(nulls_percentage > 50 ):
        print(data[column].dtype,' ', nulls_percentage,'%',' ' ,column)
        count=count +1
print(count) 

object   63.934913661738356 %   ActiveLateCategory
object   61.16079061020301 %   ActiveLateLastPaymentCategory
object   56.15592177151395 %   ContractEndDate
object   90.91720000892 %   CreditScoreEsEquifaxRisk
object   75.82826007775274 %   CreditScoreFiAsiakasTietoRiskGrade
float64   63.934913661738356 %   CurrentDebtDaysPrimary
float64   61.16079061020301 %   CurrentDebtDaysSecondary
object   63.934913661738356 %   DebtOccuredOn
object   61.16079061020301 %   DebtOccuredOnForSecondary
object   68.0998149097964 %   DefaultDate
float64   68.0998149097964 %   EAD1
float64   68.0998149097964 %   EAD2
float64   96.60370626407689 %   EL_V0
float64   90.39463610076638 %   EL_V1
object   73.17307049037754 %   EmploymentPosition
object   86.0342379709951 %   GracePeriodEnd
object   86.0342379709951 %   GracePeriodStart
float64   56.04739498546781 %   InterestAndPenaltyDebtServicingCost
float64   56.04739498546781 %   InterestAndPenaltyWriteOffs
float64   68.0998149097964 %   InterestRecover

# Data PreProcessing

## - Drop some features

In [32]:
# drop feature column which has null counts with percentage more than 50%
dropped_nulls_features=[]
for column in data:
    nulls_percentage = (data[column].isnull().sum()/data.shape[0])*100
    if(nulls_percentage > 50):
        dropped_nulls_features.append(column) 
        
new_data = data.drop(dropped_nulls_features, axis=1)

In [33]:
print(new_data.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134529 entries, 0 to 134528
Data columns (total 78 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ActiveScheduleFirstPaymentReached       134529 non-null  bool   
 1   Age                                     134529 non-null  int64  
 2   Amount                                  134529 non-null  float64
 3   AmountOfPreviousLoansBeforeLoan         134529 non-null  float64
 4   ApplicationSignedHour                   134529 non-null  int64  
 5   ApplicationSignedWeekday                134529 non-null  int64  
 6   AppliedAmount                           134529 non-null  float64
 7   BiddingStartedOn                        134529 non-null  object 
 8   BidsApi                                 134529 non-null  int64  
 9   BidsManual                              134529 non-null  float64
 10  BidsPortfolioManager                    1345

In [34]:
# drop feature that are unique for each row 
dropped_unique_features=['LoanId','LoanNumber']
new_data = new_data.drop(dropped_unique_features, axis=1)

In [35]:
# drop feature that has only one value 
new_data = new_data.drop('ReportAsOfEOD', axis=1)

In [36]:
# drop features that won't affect related to borrower
dropped_non_needed_features=['City','County','DateOfBirth','UserName']
new_data = new_data.drop(dropped_non_needed_features, axis=1)

In [38]:
# drop separated income features as we have a feature for total income
dropped_separated_income_features=['IncomeFromChildSupport','IncomeFromFamilyAllowance','IncomeFromLeavePay','IncomeFromPension','IncomeFromPrincipalEmployer','IncomeFromSocialWelfare','IncomeOther']
new_data = new_data.drop(dropped_separated_income_features, axis=1)

In [37]:
# drop features with date values
dropped_date_features=['BiddingStartedOn','FirstPaymentDate','LastPaymentOn','ListedOnUTC','LoanApplicationStartedDate','LoanDate','MaturityDate_Last','MaturityDate_Original','StageActiveSince']
new_data = new_data.drop(dropped_date_features, axis=1)

In [40]:
# drop features that won't affect
dropped_non_needed_features=['ApplicationSignedHour','ApplicationSignedWeekday','MonthlyPaymentDay']
new_data = new_data.drop(dropped_non_needed_features, axis=1)

In [41]:
print(new_data.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134529 entries, 0 to 134528
Data columns (total 52 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ActiveScheduleFirstPaymentReached       134529 non-null  bool   
 1   Age                                     134529 non-null  int64  
 2   Amount                                  134529 non-null  float64
 3   AmountOfPreviousLoansBeforeLoan         134529 non-null  float64
 4   AppliedAmount                           134529 non-null  float64
 5   BidsApi                                 134529 non-null  int64  
 6   BidsManual                              134529 non-null  float64
 7   BidsPortfolioManager                    134529 non-null  int64  
 8   Country                                 134529 non-null  object 
 9   CreditScoreEeMini                       71722 non-null   float64
 10  CreditScoreEsMicroL                     1049

In [22]:
# get numerical features
numerical_columns = []
for column in new_data:
    if(new_data[column].dtype == 'float64' or new_data[column].dtype == 'int64'):
        numerical_columns.append(column)

# get categorical features
categorical_columns = []
for column in new_data:
    if(new_data[column].dtype == 'object'):
        categorical_columns.append(column)  
        
# get boolean features
boolean_columns = []
for column in new_data:
    if(new_data[column].dtype == 'bool'):
        boolean_columns.append(column) 

## - Handle missing values

In [23]:
for column in boolean_columns:
    if(new_data[column].isnull().sum()):
        print(column) 
    else:
        print ('no nulls')
       

no nulls
no nulls
no nulls


In [24]:
# replace -1 with null to be handled as missing value
new_data = new_data.replace({-1.:np.nan,-1:np.nan})

In [25]:
# handle missing values in categorical features with numerical values by replacing with mode 
categorical_with_numbers_features= ['CreditScoreEeMini','Education','EmploymentStatus','Gender','HomeOwnershipType','MaritalStatus','ModelVersion','OccupationArea',
                                    'RecoveryStage','UseOfLoan','VerificationType']

for col in categorical_with_numbers_features:
    mode_value = new_data[col].mode().values[0]
    new_data[col].fillna(mode_value,inplace=True)
    
for column in categorical_with_numbers_features:
    unique_count = new_data[column].nunique()
    if(unique_count <= 40):
        print(unique_count, ' ', column)
        print(new_data[column].unique())

7   CreditScoreEeMini
[1000.  700.  800.  600.  900.  500.    0.]
6   Education
[3. 5. 4. 2. 1. 0.]
6   EmploymentStatus
[3. 2. 4. 5. 6. 0.]
3   Gender
[1. 0. 2.]
11   HomeOwnershipType
[ 1.  0.  4.  2.  3.  5.  8.  6.  7.  9. 10.]
6   MaritalStatus
[1. 4. 3. 2. 5. 0.]
7   ModelVersion
[6. 0. 1. 2. 3. 4. 5.]
20   OccupationArea
[ 7. 16.  9.  1. 10. 12.  8.  6. 11. 17.  5.  4.  3. 19.  0. 18. 15. 13.
 14.  2.]
2   RecoveryStage
[1. 2.]
16   UseOfLoan
[  7.   2.   0.   6.   8.   3.   5.   4.   1. 110. 101. 102. 104. 108.
 106. 107.]
5   VerificationType
[2. 4. 1. 3. 0.]


In [26]:
# handle missing values for numerical features by replacing with mean 
for num_column in numerical_columns:
    mean_value = new_data[num_column].mean()
    new_data[num_column].fillna(mean_value,inplace=True)
    
# handle missing values for categorical features by replacing with mode 
for cat_column in categorical_columns:
    mode_value = new_data[cat_column].mode().values[0]
    new_data[cat_column].fillna(mode_value,inplace=True)

for column in new_data:
    unique_count = new_data[column].nunique()
    if(unique_count <= 40):
        print(unique_count, ' ', column)
        print(new_data[column].unique())

2   ActiveScheduleFirstPaymentReached
[ True False]
4   Country
['EE' 'FI' 'ES' 'SK']
7   CreditScoreEeMini
[1000.  700.  800.  600.  900.  500.    0.]
11   CreditScoreEsMicroL
['M' 'M3' 'M5' 'M1' 'M9' 'M2' 'M6' 'M4' 'M8' 'M7' 'M10']
6   Education
[3. 5. 4. 2. 1. 0.]
9   EmploymentDurationCurrentEmployer
['UpTo3Years' 'MoreThan5Years' 'UpTo4Years' 'UpTo2Years' 'UpTo1Year'
 'UpTo5Years' 'TrialPeriod' 'Other' 'Retiree']
6   EmploymentStatus
[3. 2. 4. 5. 6. 0.]
39   ExistingLiabilities
[ 0  6  4  1  8  3  2  5  7  9 10 12 11 15 14 17 13 16 18 24 19 26 23 20
 21 22 25 27 36 30 28 35 29 31 32 33 34 39 40]
3   Gender
[1. 0. 2.]
11   HomeOwnershipType
[ 1.  0.  4.  2.  3.  5.  8.  6.  7.  9. 10.]
13   LanguageCode
[ 1  3  2  4  6 22 15  9  5 10 13  7 21]
31   LoanDuration
[12  1 20 15 24  6  2  5  3 10 18  4 17  9 13  7 14  8 22 16 11 19 60 21
 36 48 30 42 27 52 38]
6   MaritalStatus
[1. 4. 3. 2. 5. 0.]
7   ModelVersion
[6. 0. 1. 2. 3. 4. 5.]
2   NewCreditCustomer
[ True False]
26   NoOfPrevi

In [27]:
print(new_data.isnull().sum())

ActiveScheduleFirstPaymentReached         0
Age                                       0
Amount                                    0
AmountOfPreviousLoansBeforeLoan           0
AppliedAmount                             0
BidsApi                                   0
BidsManual                                0
BidsPortfolioManager                      0
Country                                   0
CreditScoreEeMini                         0
CreditScoreEsMicroL                       0
DebtToIncome                              0
Education                                 0
EmploymentDurationCurrentEmployer         0
EmploymentStatus                          0
ExistingLiabilities                       0
ExpectedLoss                              0
ExpectedReturn                            0
FreeCash                                  0
Gender                                    0
HomeOwnershipType                         0
IncomeTotal                               0
Interest                        

## - Encoding Categorical Variables

In [28]:
# boolean to int
for bool_column in boolean_columns:
    new_data[bool_column] = new_data[bool_column].astype(int)

In [29]:
# convert features with values that contain date + time to contain only date
date_time_features = ['BiddingStartedOn','LoanApplicationStartedDate','ListedOnUTC','StageActiveSince']
for col in date_time_features:
    new_data[col] = new_data[col].str.split().str[0]
    
print(new_data[date_time_features].nunique())

KeyError: 'BiddingStartedOn'

In [None]:
mapping = {'1-7': 1, '8-15': 2, '16-30': 3, '31-60': 4, '61-90': 5, '91-120': 6, '121-150': 7, '151-180': 8, '180+': 9}

features = ['ActiveLateCategory','ActiveLateLastPaymentCategory','WorseLateCategory']
new_data[features] = new_data[features].replace(mapping)

for col in features:
    print(new_data[col].unique())

In [None]:
mapping = { 'LessThan2Years':1,'2To5Years':2,'5To10Years':3,'10To15Years':4,'15To25Years':5,'MoreThan25Years':6}

new_data['WorkExperience'] = new_data['WorkExperience'].replace(mapping)

print(new_data['WorkExperience'].unique())

In [None]:
new_data['CreditScoreEsMicroL'] = new_data['CreditScoreEsMicroL'].str[1:].astype(int)

print(new_data['CreditScoreEsMicroL'].unique())

In [None]:
mapping = {'TrialPeriod':0,'UpTo1Year':1,'UpTo2Years':2,'UpTo3Years':3,'UpTo4Years':4,'UpTo5Years':5,'MoreThan5Years':6,'Retiree':7,'Other':8}

new_data['EmploymentDurationCurrentEmployer'] = new_data['EmploymentDurationCurrentEmployer'].replace(mapping)

print(new_data['EmploymentDurationCurrentEmployer'].unique())

In [None]:
new_data['NrOfDependants'] = new_data['NrOfDependants'].str[:2].astype(int)

print(new_data['NrOfDependants'].unique())

In [None]:
mapping = {'AA':1,'A':2,'B':3,'C':4,'D':5,'E':6,'F':7,'HR':8  }

new_data['Rating'] = new_data['Rating'].replace(mapping)

print(new_data['Rating'].unique())

In [None]:
mapping = {'EE':1,'FI':2,'ES':3,'SK':4}

new_data['Country'] = new_data['Country'].replace(mapping)

print(new_data['Country'].unique())

## - Handle outliers

In [None]:
def outliers_handling(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    lower_range = Q1 - 1.5 * (Q3 - Q1)
    upper_range = Q3 + 1.5 * (Q3 - Q1)
    column = np.where(column > upper_range, upper_range, column)
    column = np.where(column < lower_range, lower_range, column)
    return column

for num_column in numerical_columns:
    new_data[num_column] = outliers_handling(new_data[num_column])

In [None]:
for num_column in numerical_columns:
    new_data[num_column].fillna(new_data[num_column].mean(),inplace=True)
    print(num_column,' ' , new_data[num_column].isnull().sum())
    
for cat_column in categorical_columns:
#     print(cat_column,' ' , new_data[cat_column].isnull().sum())
    new_data[cat_column].fillna(new_data[cat_column].mode().values[0],inplace=True)
    print(cat_column,' ' , new_data[cat_column].isnull().sum())