In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_rows = 100
%matplotlib inline

# Load Dataset

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

print('Train:')
print(train.info(verbose=False), '\n')
print('Test:')
print(test.info(verbose=False))

Train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722143 entries, 0 to 722142
Columns: 70 entries, loan_amnt to target
dtypes: float64(14), int64(38), object(18)
memory usage: 385.7+ MB
None 

Test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98727 entries, 0 to 98726
Columns: 70 entries, loan_amnt to target
dtypes: float64(14), int64(38), object(18)
memory usage: 52.7+ MB
None


# Data Basic Information.

In [3]:
# imbalanced dataset
target1 = train['target'].sum()
target0 = (1 - train['target']).sum()
print('Target 1:\t', np.round(target1 / len(train), 4))
print('Target 0:\t', np.round(target0 / len(train), 4))
print('Ratio:\t\t', np.round(target0 / target1, 4))

Target 1:	 0.2384
Target 0:	 0.7616
Ratio:		 3.1938


In [4]:
# get missing information
train_null = train.isnull().sum() / len(train)
train_null = train_null.reset_index()
train_null = train_null.rename(columns={'index': 'feature', 0: 'train miss'})

test_null = test.isnull().sum() / len(test)
test_null = test_null.reset_index()
test_null = test_null.rename(columns={'index': 'feature', 0: 'test miss'})

# merge train and test null information
data_null = pd.merge(left=train_null, right=test_null, how='outer')
data_null = data_null.fillna(value=0)

feature_list = data_null['feature']
null_rate = []
not_null_rate = []
category = []
train_unique = []
test_unique = []
for feature in feature_list:
    null_rate.append(train[train[feature].isnull()]['target'].mean())
    not_null_rate.append(train[~train[feature].isnull()]['target'].mean())
    category.append(train[feature].dtype)
    train_unique.append(len(train[feature].unique()))
    test_unique.append(len(test[feature].unique()))

data_null['dtype'] = category    
data_null['null mean'] = null_rate
data_null['not null mean'] = not_null_rate
data_null['train unique'] = train_unique
data_null['test unique'] = test_unique

data_null = data_null.reset_index(drop=True)

data_null[['feature', 'train miss', 'test miss', 'dtype', 'null mean', 
           'not null mean', 'train unique', 'test unique']]

Unnamed: 0,feature,train miss,test miss,dtype,null mean,not null mean,train unique,test unique
0,loan_amnt,0.0,0.0,int64,,0.238444,1482,1471
1,term,0.0,0.0,object,,0.238444,2,2
2,int_rate,0.0,0.0,object,,0.238444,211,92
3,installment,0.0,0.0,float64,,0.238444,62435,17469
4,grade,0.0,0.0,object,,0.238444,7,7
5,sub_grade,0.0,0.0,object,,0.238444,35,35
6,emp_title,0.058919,0.073901,object,0.313223,0.233763,184018,35913
7,emp_length,0.057162,0.072807,object,0.316263,0.233727,12,12
8,home_ownership,0.0,0.0,object,,0.238444,4,5
9,annual_inc,0.0,0.0,float64,,0.238444,41183,8508


# Invalid Data

In [5]:
# there are some problem with `int_rate` and `revol_util`
print(train['int_rate'].values[:3], '\t', test['int_rate'].values[:3])
print(train['revol_util'].values[:3], '\t', train['revol_util'].values[:3])

['6.99%' '12.39%' '13.66%'] 	 ['6.99%' '12.74%' '6.99%']
['31.60%' '29%' '59.40%'] 	 ['31.60%' '29%' '59.40%']


In [6]:
def str_parser(x):
    """ function to parse `int_rate` and `revol_util` """
    if '%' in str(x):
        return float(str(x).replace('%', ''))
    return x

In [7]:
train['int_rate'] = train['int_rate'].apply(str_parser)
train['revol_util'] = train['revol_util'].apply(str_parser)
test['int_rate'] = test['int_rate'].apply(str_parser)
test['revol_util'] = test['revol_util'].apply(str_parser)

# Missing Values

### 1. Categorical Variable

In [8]:
# `emp_title` contains too many categories, one choice is to delete it
train = train.drop(labels='emp_title', axis=1)
test = test.drop(labels='emp_title', axis=1)

In [9]:
# for `emp_length`, since the order matters, we will fill the missing value with mode
emp_length_mode = train['emp_length'].mode()[0]
train['emp_length'] = train['emp_length'].fillna(value=emp_length_mode)
test['emp_length'] = test['emp_length'].fillna(value=emp_length_mode)

In [10]:
# for `title`, we will fill the missing value with `missing`
train['title'] = train['title'].fillna(value='missing')
test['title'] = test['title'].fillna(value='missing')

### 2. Numerical Variable

In [11]:
# for `dti`, there is one value with `dti`= -1, only in training data, just remove it
train = train[train['dti'] != -1]

# fill the missing value with the median to make it more robust
dti_median = train['dti'].median()
train['dti'] = train['dti'].fillna(value=dti_median)
test['dti'] = test['dti'].fillna(value=dti_median)

In [12]:
# for `inq_last_6mths`, fill the missing value with the mode
inq_last_6mths_mode = train['inq_last_6mths'].mode()[0]
train['inq_last_6mths'] = train['inq_last_6mths'].fillna(value=inq_last_6mths_mode)
test['inq_last_6mths'] = test['inq_last_6mths'].fillna(value=inq_last_6mths_mode)

In [13]:
# for `revol_util`, fill the missing value with the median to make it more robust
revol_util_median = train['revol_util'].median()
train['revol_util'] = train['revol_util'].fillna(value=revol_util_median)
test['revol_util'] = test['revol_util'].fillna(value=revol_util_median)

In [14]:
# for `avg_cur_bal`, fill the missing value with the median to make it more robust
avg_cur_bal_median = train['avg_cur_bal'].median()
train['avg_cur_bal'] = train['avg_cur_bal'].fillna(value=avg_cur_bal_median)
test['avg_cur_bal'] = test['avg_cur_bal'].fillna(value=avg_cur_bal_median)

In [15]:
# for `bc_open_to_buy`, fill the missing value with the median to make it more robust
bc_open_to_buy_median = train['bc_open_to_buy'].median()
train['bc_open_to_buy'] = train['bc_open_to_buy'].fillna(value=bc_open_to_buy_median)
test['bc_open_to_buy'] = test['bc_open_to_buy'].fillna(value=bc_open_to_buy_median)

In [16]:
# for `bc_util`, fill the missing value with the median to make it more robust
bc_util_median = train['bc_util'].median()
train['bc_util'] = train['bc_util'].fillna(value=bc_util_median)
test['bc_util'] = test['bc_util'].fillna(value=bc_util_median)

In [17]:
# for `mo_sin_old_il_acct`, fill the missing value with the median to make it more robust
mo_sin_old_il_acct_median = train['mo_sin_old_il_acct'].median()
train['mo_sin_old_il_acct'] = train['mo_sin_old_il_acct'].fillna(value=mo_sin_old_il_acct_median)
test['mo_sin_old_il_acct'] = test['mo_sin_old_il_acct'].fillna(value=mo_sin_old_il_acct_median)

In [18]:
# for `mths_since_recent_bc`, fill the missing value with the median to make it more robust
mths_since_recent_bc_median = train['mths_since_recent_bc'].median()
train['mths_since_recent_bc'] = train['mths_since_recent_bc'].fillna(value=mths_since_recent_bc_median)
test['mths_since_recent_bc'] = test['mths_since_recent_bc'].fillna(value=mths_since_recent_bc_median)

In [19]:
# for `mths_since_recent_inq`, fill the missing value with the median to make it more robust
mths_since_recent_inq_median = train['mths_since_recent_inq'].median()
train['mths_since_recent_inq'] = train['mths_since_recent_inq'].fillna(value=mths_since_recent_inq_median)
test['mths_since_recent_inq'] = test['mths_since_recent_inq'].fillna(value=mths_since_recent_inq_median)

In [20]:
# for `num_rev_accts`, fill the missing value with 0
train['num_rev_accts'] = train['num_rev_accts'].fillna(value=0)
test['num_rev_accts'] = test['num_rev_accts'].fillna(value=0)

In [21]:
# for `num_tl_120dpd_2m`, fill the missing value with 0
train['num_tl_120dpd_2m'] = train['num_tl_120dpd_2m'].fillna(value=0)
test['num_tl_120dpd_2m'] = test['num_tl_120dpd_2m'].fillna(value=0)

In [22]:
# for `percent_bc_gt_75`, fill the missing value with the median to make it more robust
percent_bc_gt_75_median = train['percent_bc_gt_75'].median()
train['percent_bc_gt_75'] = train['percent_bc_gt_75'].fillna(value=percent_bc_gt_75_median)
test['percent_bc_gt_75'] = test['percent_bc_gt_75'].fillna(value=percent_bc_gt_75_median)

# Feature Parsing

In [23]:
data_null[data_null['dtype'] == object]

Unnamed: 0,feature,train miss,test miss,dtype,null mean,not null mean,train unique,test unique
1,term,0.0,0.0,object,,0.238444,2,2
2,int_rate,0.0,0.0,object,,0.238444,211,92
4,grade,0.0,0.0,object,,0.238444,7,7
5,sub_grade,0.0,0.0,object,,0.238444,35,35
6,emp_title,0.058919,0.073901,object,0.313223,0.233763,184018,35913
7,emp_length,0.057162,0.072807,object,0.316263,0.233727,12,12
8,home_ownership,0.0,0.0,object,,0.238444,4,5
10,verification_status,0.0,0.0,object,,0.238444,3,3
11,purpose,0.0,0.0,object,,0.238444,14,13
12,title,0.015882,0.0,object,0.372482,0.236281,1970,12


### 1. `home_ownership`

for `home_ownership`, test data have 5 categories while training data only have 4

In [24]:
test['home_ownership'].value_counts()

MORTGAGE    49505
RENT        36736
OWN         12359
ANY           125
NONE            2
Name: home_ownership, dtype: int64

In [25]:
train['home_ownership'].value_counts()

MORTGAGE    355645
RENT        287768
OWN          78689
ANY             40
Name: home_ownership, dtype: int64

In [26]:
# we change the ANY and NONE type into Other
train.loc[train['home_ownership'].isin(['ANY', 'NONE']), 'home_ownership'] = 'Other'
test.loc[test['home_ownership'].isin(['ANY', 'NONE']), 'home_ownership'] = 'Other'

### 2. `grade` and `sub_grade`

Since for `grade` and `sub_grade`, the order matters, we will user label-encoder to encode it

In [27]:
grade_encoder = LabelEncoder()
grade_encoder = grade_encoder.fit(train['grade'])

train['grade'] = grade_encoder.transform(train['grade'])
test['grade'] = grade_encoder.transform(test['grade'])

In [28]:
sub_grade_encoder = LabelEncoder()
sub_grade_encoder = grade_encoder.fit(train['sub_grade'])

train['sub_grade'] = grade_encoder.transform(train['sub_grade'])
test['sub_grade'] = grade_encoder.transform(test['sub_grade'])

### 3. `emp_length`, 
 For `emp_length`, there are some inherent order

In [29]:
# let's manually endoce it
emp_length_map = {'< 1 year': 0, 
                  '1 year': 1, 
                  '2 years': 2, 
                  '3 years': 3, 
                  '4 years': 4, 
                  '5 years': 5, 
                  '6 years': 6, 
                  '7 years': 7, 
                  '8 years': 9, 
                  '9 years': 9, 
                  '10+ years': 10}

In [30]:
train['emp_length'] = train['emp_length'].apply(lambda x: emp_length_map[x])
test['emp_length'] = test['emp_length'].apply(lambda x: emp_length_map[x])

### 4. `title`
For title, we need to manually encode them to reduce the categorical level

In [31]:
# change into lower case
train['title'] = train['title'].apply(str.lower)
test['title'] = test['title'].apply(str.lower)

In [32]:
lists = ['debt consolidation', 'credit card refinancing', 'business', 'vacation', 
         'home improvement', 'major purchase', 'medical expenses', 'car financing', 
         'moving and relocation', 'home buying', 'green loan', 'consolidation']

train.loc[~train['title'].isin(lists), 'title'] = 'other'
test.loc[~test['title'].isin(lists), 'title'] = 'other'

### 5. `earliest_cr_line`
We can transform `earliest_cr_line` into the variable called `credit_length`

In [33]:
# extract `earliest_year` information
train['earliest_year'] = train['earliest_cr_line'].apply(lambda x: x.split('-')[1])
test['earliest_year'] = test['earliest_cr_line'].apply(lambda x: x.split('-')[1])

In [34]:
# format `earliest_year` information
def parse_earliest_year(x):
    if int(x) < 20:
        return int('20' + x)
    return int('19' + x)

train['earliest_year'] = train['earliest_year'].apply(parse_earliest_year)
test['earliest_year'] = test['earliest_year'].apply(parse_earliest_year)

In [35]:
# create `credit_length` variable
train['credit_length'] = train['year'] - train['earliest_year']
test['credit_length'] = test['year'] - test['earliest_year']

In [36]:
# remove `year`, `earliest_cr_line` and `earliest_year`
train = train.drop(labels=['year', 'earliest_cr_line', 'earliest_year'], axis=1)
test = test.drop(labels=['year', 'earliest_cr_line', 'earliest_year'], axis=1)

### 6. `zip_code`
For `zip_code`, we can try to add latitude, longitude, local population and local economic data

# Final Note
Let's check the cleaned dataset again

In [37]:
# get missing information
train_null = train.isnull().sum() / len(train)
train_null = train_null.reset_index()
train_null = train_null.rename(columns={'index': 'feature', 0: 'train miss'})

test_null = test.isnull().sum() / len(test)
test_null = test_null.reset_index()
test_null = test_null.rename(columns={'index': 'feature', 0: 'test miss'})

# merge train and test null information
data_null = pd.merge(left=train_null, right=test_null, how='outer')
data_null = data_null.fillna(value=0)

feature_list = data_null['feature']
null_rate = []
not_null_rate = []
category = []
train_unique = []
test_unique = []
for feature in feature_list:
    null_rate.append(train[train[feature].isnull()]['target'].mean())
    not_null_rate.append(train[~train[feature].isnull()]['target'].mean())
    category.append(train[feature].dtype)
    train_unique.append(len(train[feature].unique()))
    test_unique.append(len(test[feature].unique()))

data_null['dtype'] = category    
data_null['null mean'] = null_rate
data_null['not null mean'] = not_null_rate
data_null['train unique'] = train_unique
data_null['test unique'] = test_unique

data_null = data_null.reset_index(drop=True)

data_null[['feature', 'train miss', 'test miss', 'dtype', 'null mean', 
           'not null mean', 'train unique', 'test unique']]

Unnamed: 0,feature,train miss,test miss,dtype,null mean,not null mean,train unique,test unique
0,loan_amnt,0.0,0.0,int64,,0.238445,1482,1471
1,term,0.0,0.0,object,,0.238445,2,2
2,int_rate,0.0,0.0,float64,,0.238445,211,92
3,installment,0.0,0.0,float64,,0.238445,62435,17469
4,grade,0.0,0.0,int64,,0.238445,7,7
5,sub_grade,0.0,0.0,int64,,0.238445,35,35
6,emp_length,0.0,0.0,int64,,0.238445,10,10
7,home_ownership,0.0,0.0,object,,0.238445,4,4
8,annual_inc,0.0,0.0,float64,,0.238445,41183,8508
9,verification_status,0.0,0.0,object,,0.238445,3,3


In [38]:
# save to disk
train.to_csv('./data/train_clean.csv', index=False)
test.to_csv('./data/test_clean.csv', index=False)

print('Training set:\t', train.shape)
print('Test set:\t', test.shape)

Training set:	 (722142, 68)
Test set:	 (98727, 68)
