In [None]:
# Load the raw Bondora CSV. low_memory=False turns off pandas' chunked
# parsing, which can otherwise infer mixed dtypes within a single column.
data = pd.read_csv('../dataset/Bondora_raw.csv', low_memory=False)

In [None]:
def drop_nulls_columns(data, threshold=40):
    """Return ``data`` without the columns that are mostly null.

    A column is removed when the percentage of null values it contains is
    strictly greater than ``threshold``.

    Args:
        data: DataFrame to filter. It is not modified.
        threshold: Highest tolerated percentage (0-100) of nulls per column.

    Returns:
        A new DataFrame containing only the columns whose null percentage
        is at or below ``threshold``.
    """
    # isnull().mean() gives the fraction of nulls per column in one pass.
    nulls_percentage = data.isnull().mean() * 100
    dropped_nulls_features = nulls_percentage[nulls_percentage > threshold].index.tolist()

    # DataFrame.drop is not in-place, so the result has to be returned.
    return data.drop(columns=dropped_nulls_features)

In [None]:
def drop_unnecessary_columns(data):
    """Return ``data`` without a fixed list of columns.

    Args:
        data: DataFrame to filter. It is not modified.

    Returns:
        A new DataFrame without the columns named in ``dropped_features``.
        Listed columns that are not present in ``data`` are skipped.
    """
    dropped_features = [
        'LoanId', 'LoanNumber', 'ReportAsOfEOD', 'City', 'County',
        'DateOfBirth', 'UserName',
        'IncomeFromChildSupport', 'IncomeFromFamilyAllowance',
        'IncomeFromLeavePay', 'IncomeFromPension',
        'IncomeFromPrincipalEmployer', 'IncomeFromSocialWelfare',
        'IncomeOther',
        'BiddingStartedOn', 'FirstPaymentDate', 'LastPaymentOn',
        'ListedOnUTC', 'LoanApplicationStartedDate', 'LoanDate',
        'MaturityDate_Last', 'MaturityDate_Original', 'StageActiveSince',
        'ApplicationSignedHour', 'ApplicationSignedWeekday',
        'MonthlyPaymentDay', 'ModelVersion',
    ]
    # errors='ignore': an earlier step (e.g. the null-percentage filter) may
    # already have removed some of these columns; that should not raise.
    # DataFrame.drop is not in-place, so the result has to be returned.
    return data.drop(columns=dropped_features, errors='ignore')

In [None]:
def handle_status_column(data):
    """Turn ``Status`` into a binary column derived from ``DefaultDate``.

    Rows whose ``Status`` equals ``'Current'`` are removed. For the remaining
    rows ``Status`` becomes 1 when ``DefaultDate`` is set and 0 otherwise,
    and ``DefaultDate`` itself is dropped.

    Args:
        data: DataFrame with ``Status`` and ``DefaultDate`` columns. It is
            not modified.

    Returns:
        A new DataFrame with the filtered rows, the binary ``Status`` column
        and no ``DefaultDate`` column.
    """
    # Explicit copy: assigning a column on a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    finished = data[data.Status != 'Current'].copy()
    finished['Status'] = finished['DefaultDate'].notnull().astype(int)
    return finished.drop('DefaultDate', axis=1)

In [None]:
def convert_categorical_column(data):
    """Replace the coded values of the categorical columns with text labels.

    Each listed column is cast to ``str`` and its codes are replaced through
    a fixed mapping. For columns that define a missing-value label, values
    that are null after the replacement receive that label. Nulls in
    ``EmploymentDurationCurrentEmployer`` are filled with ``'Other'``.

    Args:
        data: DataFrame containing the columns handled below. It is modified
            in place.

    Returns:
        The same ``data`` object, so the call can be used in an assignment.
    """
    # (column, code -> label mapping, label for missing values or None)
    decoders = [
        ('Education',
         {'-1.0': 'Unknown', '0.0': 'Unknown', '1.0': 'Primary education',
          '2.0': 'Basic education', '3.0': 'Vocational education',
          '4.0': 'Secondary education', '5.0': 'Higher education',
          'nan': np.nan},
         'Unknown'),
        ('EmploymentStatus',
         {'-1.0': 'Unknown', '0.0': 'Unknown', '1.0': 'Unemployed',
          '2.0': 'Partially employed', '3.0': 'Fully employed',
          '4.0': 'Self-employed', '5.0': 'Entrepreneur', '6.0': 'Retiree',
          'nan': np.nan},
         'Unknown'),
        ('Gender',
         {'0.0': 'Male', '1.0': 'Female', '2.0': 'Undefined', 'nan': np.nan},
         'Undefined'),
        ('HomeOwnershipType',
         {'-1.0': 'Unknown', '0.0': 'Homeless', '1.0': 'Owner',
          '2.0': 'Living with parents',
          '3.0': 'Tenant, pre-furnished property',
          '4.0': 'Tenant, unfurnished property', '5.0': 'Council house',
          '6.0': 'Joint tenant', '7.0': 'Joint ownership', '8.0': 'Mortgage',
          '9.0': 'Owner with encumbrance', '10.0': 'Other', 'nan': np.nan},
         'Unknown'),
        ('LanguageCode',
         {'1': 'Estonian', '2': 'English', '3': 'Russian', '4': 'Finnish',
          '5': 'German', '6': 'Spanish', '9': 'Slovakian', '7': 'Other',
          '22': 'Other', '15': 'Other', '10': 'Other', '13': 'Other',
          '21': 'Other'},
         None),
        ('MaritalStatus',
         {'-1.0': 'Unknown', '0.0': 'Unknown', '1.0': 'Married',
          '2.0': 'Cohabitant', '3.0': 'Single', '4.0': 'Divorced',
          '5.0': 'Widow', 'nan': np.nan},
         'Unknown'),
        ('OccupationArea',
         {'-1.0': 'Unknown', '0.0': 'Unknown', '1.0': 'Other',
          '2.0': 'Mining', '3.0': 'Processing', '4.0': 'Energy',
          '5.0': 'Utilities', '6.0': 'Construction',
          '7.0': 'Retail and wholesale', '8.0': 'Transport and warehousing',
          '9.0': 'Hospitality and catering', '10.0': 'Info and telecom',
          '11.0': 'Finance and insurance', '12.0': 'Real-estate',
          '13.0': 'Research', '14.0': 'Administrative',
          '15.0': 'Civil service & military', '16.0': 'Education',
          '17.0': 'Healthcare and social help',
          '18.0': 'Art and entertainment',
          '19.0': 'Agriculture, forestry and fishing', 'nan': np.nan},
         'Unknown'),
        ('VerificationType',
         {'0.0': 'Not set', '1.0': 'Income unverified',
          '2.0': 'Income unverified, cross-referenced by phone',
          '3.0': 'Income verified', '4.0': 'Income and expenses verified',
          'nan': np.nan},
         'Not set'),
        ('UseOfLoan',
         {'-1': 'Unknown', '0': 'Loan consolidation', '1': 'Real estate',
          '2': 'Home improvement', '3': 'Business', '4': 'Education',
          '5': 'Travel', '6': 'Vehicle', '7': 'Other', '8': 'Health',
          '101': 'Working capital financing',
          '102': 'Purchase of machinery equipment',
          '103': 'Renovation of real estate',
          '104': 'Accounts receivable financing',
          '105': 'Acquisition of means of transport',
          '106': 'Construction finance', '107': 'Acquisition of stocks',
          '108': 'Acquisition of real estate',
          '109': 'Guaranteeing obligation', '110': 'Other business'},
         None),
    ]

    for column, mapping, missing_label in decoders:
        decoded = data[column].astype(str).replace(mapping)
        if missing_label is not None:
            decoded = decoded.fillna(missing_label)
        # Assign the result back instead of calling fillna(inplace=True) on
        # data[column]: that chained in-place call acts on a temporary and
        # does not update the frame under pandas Copy-on-Write.
        data[column] = decoded

    data['EmploymentDurationCurrentEmployer'] = (
        data['EmploymentDurationCurrentEmployer'].fillna('Other')
    )
    return data

In [None]:
# Partition the column names of new_data by dtype. Columns whose dtype is
# none of float64/int64/object/bool end up in no list.

# get numerical features
numerical_columns = [
    name for name, dtype in new_data.dtypes.items()
    if dtype == 'float64' or dtype == 'int64'
]

# get categorical features
categorical_columns = [
    name for name, dtype in new_data.dtypes.items() if dtype == 'object'
]

# get boolean features
boolean_columns = [
    name for name, dtype in new_data.dtypes.items() if dtype == 'bool'
]

In [None]:
# Cast each column listed in boolean_columns to int (False -> 0, True -> 1).
for bool_column in boolean_columns:
    new_data[bool_column] = new_data[bool_column].astype(int)

In [None]:
# handle outliers using IQR for numerical columns
def outliers_handling(column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by that lower fence; everything else is
    left unchanged.

    Args:
        column: Object providing ``quantile`` and elementwise comparison,
            e.g. a numeric pandas Series.

    Returns:
        The capped values as returned by ``np.where`` (a NumPy array).
    """
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_range = first_quartile - whisker
    upper_range = third_quartile + whisker

    # Cap the high side first, then the low side, on the already-capped values.
    capped_high = np.where(column > upper_range, upper_range, column)
    return np.where(capped_high < lower_range, lower_range, capped_high)

# Overwrite every column listed in numerical_columns with its capped values.
# NOTE(review): numerical_columns is built from all float64/int64 columns, so
# an integer target such as `Status` would be capped as well if it is part of
# new_data — confirm that this is intended.
for num_column in numerical_columns:
    new_data[num_column] = outliers_handling(new_data[num_column])