# Inspired by Kaggle : AMOL DESHMUKH

In [117]:
#### Dealing with Warnings ####
# Silence every warning raised from here on.
# NOTE(review): this blanket filter presumably also hides pandas warnings
# (e.g. about chained assignment) - consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [118]:
#### Dependencies ####
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

In [119]:
# Load both splits; the 'Id' column is dropped as soon as each file is read.
train_loan_data = pd.read_csv('Data/train.csv').drop(columns='Id')
test_loan_data = pd.read_csv('Data/test.csv').drop(columns='Id')

In [120]:
# Print the dtype and non-null count of every column of the training frame.
train_loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [121]:
#### Report on Missing Values ####
def missing_values_report(data, train=False):
    """Print how many entries are missing in each column of `data`.

    A header line is printed first (training header when `train` is truthy,
    test header otherwise), followed by one `name : count` line for every
    column that has at least one missing entry. Null values always count as
    missing; in columns that are not int64/float64, empty strings count too.
    Nothing is returned.
    """
    header = '----Missing Values in Training Data----' if train else '----Missing Values in Test Data----'
    print(header)
    for feature in data.columns:
        column = data[feature]
        n_missing = column.isnull().sum()
        if column.dtype not in ['int64','float64']:
            # dealing with str input: blank strings are missing values as well
            n_missing = n_missing + (column == "").sum()
        if n_missing > 0:
            print(f'{feature} : {n_missing}')
        
# Report the missing values of both splits.
missing_values_report(train_loan_data,train=True)
missing_values_report(test_loan_data)

----Missing Values in Training Data----
Annual Income : 1557
Years in current job : 371
Months since last delinquent : 4081
Bankruptcies : 14
Credit Score : 1557
----Missing Values in Test Data----
Annual Income : 513
Years in current job : 86
Months since last delinquent : 1358
Bankruptcies : 3
Credit Score : 513


# Dealing with missing values :
    - Years in current job : if nan then -1, if "< 1 year" then 0, if "10+ years" then 10
    - Annual Income : if nan, then 0 when years in current job is unknown (-1), otherwise the median income
    - Months since last delinquent : if nan then median
    - Bankruptcies : if nan then the median (rounded up) among people with the same number of credit problems
    - Credit Score : if nan then median

In [122]:
### Years in current job ###
train_loan_data['Years in current job'].unique()  # inspection only: the result is not used

# Tenure label -> number of years; the '-1' key is the placeholder given to missing labels.
years_dict = {
    '-1': -1,
    '< 1 year': 0,
    '1 year': 1,
    **{f'{n} years': n for n in range(2, 10)},
    '10+ years': 10,
}

# Swap NaN for the '-1' placeholder, then translate every label to its number.
for loan_frame in (train_loan_data, test_loan_data):
    loan_frame['Years in current job'] = loan_frame['Years in current job'].fillna('-1').map(years_dict)

In [123]:
### Annual Income ###
def _impute_annual_income(data):
    """Fill the missing 'Annual Income' values of `data` in place.

    Rows whose 'Years in current job' equals -1 get an income of 0. Every
    other missing income gets the column median, which is computed after
    those zeros have been written (so they take part in the median).

    Returns the median that was used.
    """
    tenure_unknown = data['Years in current job'] == -1
    income_missing = data['Annual Income'].isna()
    # One .loc call writes into the frame itself. Chained indexing
    # (data[col][mask] = ...) can assign to a temporary copy instead and
    # silently leave the frame unchanged.
    data.loc[tenure_unknown & income_missing, 'Annual Income'] = 0
    median_income = data['Annual Income'].median()  # NaN entries are skipped
    # Whatever is still missing belongs to rows with a tenure other than -1.
    data['Annual Income'] = data['Annual Income'].fillna(median_income)
    return median_income


# Each frame is imputed with its own median; income_med ends up holding the test-set value.
income_med = _impute_annual_income(train_loan_data)
income_med = _impute_annual_income(test_loan_data)

In [124]:
### Months Since Last Delinquent ###
# Each frame is filled with its own median; delinquent_med keeps the last one computed (test).
for loan_frame in (train_loan_data, test_loan_data):
    delinquent_med = loan_frame['Months since last delinquent'].median()
    loan_frame['Months since last delinquent'] = loan_frame['Months since last delinquent'].fillna(delinquent_med)

In [125]:
# Correlation between the number of credit problems and bankruptcies in the training frame.
train_loan_data['Number of Credit Problems'].corr(train_loan_data['Bankruptcies'])

0.730750619475721

High correlation (about 0.73) between the number of credit problems and bankruptcies, so the number of credit problems is used to fill in missing bankruptcies.

In [126]:
### Bankruptcies ###
def _median_bankruptcies_by_credit_problems(data):
    """Map each 'Number of Credit Problems' value found in `data` to the
    rounded-up median 'Bankruptcies' of the rows sharing that value.

    Values whose rows have no known 'Bankruptcies' at all are left out of the
    mapping, so no list of valid values has to be hard-coded and ceil() is
    never called on NaN.
    """
    medians = data.groupby('Number of Credit Problems')['Bankruptcies'].median().dropna()
    return {problems: ceil(median) for problems, median in medians.items()}


# Each frame is imputed from its own per-group medians; the dict ends up holding the test-set mapping.
credit_pbs_to_bankrupt_dict = _median_bankruptcies_by_credit_problems(train_loan_data)
train_loan_data['Bankruptcies'] = train_loan_data['Bankruptcies'].fillna(train_loan_data['Number of Credit Problems'].map(credit_pbs_to_bankrupt_dict))

credit_pbs_to_bankrupt_dict = _median_bankruptcies_by_credit_problems(test_loan_data)
test_loan_data['Bankruptcies'] = test_loan_data['Bankruptcies'].fillna(test_loan_data['Number of Credit Problems'].map(credit_pbs_to_bankrupt_dict))

In [127]:
### Credit Score ###
# Each frame is filled with its own median; credit_med keeps the last one computed (test).
for loan_frame in (train_loan_data, test_loan_data):
    credit_med = loan_frame['Credit Score'].median()
    loan_frame['Credit Score'] = loan_frame['Credit Score'].fillna(credit_med)

In [129]:
### Testing results ###
# Run the report again on both splits; a column is listed only if it still has missing values.
missing_values_report(train_loan_data,True)
missing_values_report(test_loan_data)

----Missing Values in Training Data----
----Missing Values in Test Data----
