In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Apply matplotlib's 'dark_background' style sheet to figures drawn after this point.
plt.style.use('dark_background')

In [114]:
# initial data
def data_init(name):
    """Read ``data/<name>_data.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        Prefix of the file name, e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that file.
    """
    # The path is relative, so it resolves against the kernel's working directory.
    return pd.read_csv(f'data/{name}_data.csv')


df_train = data_init('train')
df_test = data_init('test')

In [115]:
# Preview the first five rows of the training frame.
df_train.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [116]:
# Print each column's dtype and non-null count for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [117]:
# Count the missing values in every column of the training frame.
# Five columns contain NaN: [
#   'Annual Income',
#   'Years in current job',
#   'Months since last delinquent',
#   'Bankruptcies',
#   'Credit Score'
# ]
df_train.isna().sum()

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [130]:
def change_on_median(feature, df=None):
    """Fill the missing values of a column with that column's median.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df_train`` is
        used, which matches the one-argument call ``change_on_median(col)``.
    """
    if df is None:
        # Resolved at call time, so the helper follows whatever ``df_train``
        # is currently bound to in the notebook.
        df = df_train
    missing = df[feature].isna()
    # Series.median() skips NaN by default, so the fill value is computed
    # from the observed entries only.
    df.loc[missing, feature] = df[feature].median()

def change_on_frequent(feature, df=None):
    """Fill the missing values of a column with its most frequent value.

    The frame is modified in place and nothing is returned. A column with no
    non-null value is left untouched, because there is nothing to fill with.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df_train`` is
        used, which matches the one-argument call ``change_on_frequent(col)``.
    """
    if df is None:
        # Resolved at call time, so the helper follows whatever ``df_train``
        # is currently bound to in the notebook.
        df = df_train
    # value_counts() drops NaN and sorts by descending count, so the first
    # index entry is the most frequent observed value.
    counts = df[feature].value_counts()
    if counts.empty:
        # No observed value at all: indexing [0] would raise IndexError.
        return
    df.loc[df[feature].isna(), feature] = counts.index[0]

#### 1. Column: Annual Income  (numeric)

In [119]:
# Show the first six rows so that the first missing value (row 5) is visible.
df_train['Annual Income'].head(6)

0     482087.0
1    1025487.0
2     751412.0
3     805068.0
4     776264.0
5          NaN
Name: Annual Income, dtype: float64

In [129]:
# Impute missing 'Annual Income' values with the column median, then confirm
# that no NaN remain (the displayed count should be 0).
change_on_median('Annual Income')
df_train['Annual Income'].isna().sum()

0

#### 2. Column: Years in current job  (categorical)

In [121]:
# Frequency of each category before imputation; value_counts() leaves NaN out by default.
df_train['Years in current job'].value_counts()

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64

In [122]:
# Replace missing 'Years in current job' values with the most frequent category,
# then re-check the category counts.
change_on_frequent('Years in current job')
df_train['Years in current job'].value_counts()

10+ years    2703
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64

In [123]:
#### 3. Column: Months since last delinquent (numeric)
# Preview the first rows of the column; all five shown are NaN.
df_train['Months since last delinquent'].head()

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: Months since last delinquent, dtype: float64

In [132]:
# Impute missing 'Months since last delinquent' values with the column median
# and confirm that no NaN remain.
# NOTE(review): 4081 of the 7500 rows were missing here, so more than half of
# the column becomes the median. A missing value may mean "never delinquent"
# rather than "unknown" — confirm before relying on this feature.
change_on_median('Months since last delinquent')
df_train['Months since last delinquent'].isna().sum()

0

#### 4. Column: Bankruptcies  (categorical)

In [125]:
# Frequency of each value before imputation; value_counts() leaves NaN out by default.
df_train['Bankruptcies'].value_counts()

0.0    6660
1.0     786
2.0      31
3.0       7
4.0       2
Name: Bankruptcies, dtype: int64

In [131]:
# Replace missing 'Bankruptcies' values with the most frequent value,
# then re-check the value counts.
change_on_frequent('Bankruptcies')
df_train['Bankruptcies'].value_counts()

0.0    6674
1.0     786
2.0      31
3.0       7
4.0       2
Name: Bankruptcies, dtype: int64

#### 5. Column: Credit Score  (numeric)

In [127]:
# Show the first six rows so that the first missing value (row 5) is visible.
df_train['Credit Score'].head(6)

0    749.0
1    737.0
2    742.0
3    694.0
4    719.0
5      NaN
Name: Credit Score, dtype: float64

In [133]:
# Impute missing 'Credit Score' values with the column median, then confirm
# that no NaN remain (the displayed count should be 0).
change_on_median('Credit Score')
df_train['Credit Score'].isna().sum()

0