In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

import warnings
# Blanket filter: with no category or message given, every warning raised after this
# point is suppressed (including deprecation and convergence warnings).
warnings.filterwarnings('ignore')

### Import data from 'credit_scoring_sample.csv'

In [3]:
# Read the semicolon-delimited credit-scoring sample and preview its first rows.
data = pd.read_csv(
    '../mlcourse.ai/data/credit_scoring_sample.csv',
    sep=';',
)
data.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.00019,0,0,10500.0,2.0
4,1,49,0,0.27182,0,0,400.0,0.0


First, let's inspect the data type of each column in the dataframe.

In [9]:
# `dtypes` returns a Series, so further pandas Series methods can be chained onto it.
data.dtypes

SeriousDlqin2yrs                          int64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
NumberOfTimes90DaysLate                   int64
NumberOfTime60-89DaysPastDueNotWorse      int64
MonthlyIncome                           float64
NumberOfDependents                      float64
dtype: object

In [10]:
# Tally how many columns share each dtype.
data.dtypes.value_counts()

int64      5
float64    3
dtype: int64

Let's search for `NaN` values, identify which columns contain them, and replace them with the `mean` of the corresponding column.

In [37]:
# Map each column name to a flag that is True when the column holds at least one missing value.
has_nan = {name: data[name].isnull().any() for name in data.columns}
print(has_nan)

{'SeriousDlqin2yrs': False, 'age': False, 'NumberOfTime30-59DaysPastDueNotWorse': False, 'DebtRatio': False, 'NumberOfTimes90DaysLate': False, 'NumberOfTime60-89DaysPastDueNotWorse': False, 'MonthlyIncome': True, 'NumberOfDependents': True}


In [47]:
# Imputer configured to treat np.nan as missing and fill it with the column mean learned at fit time.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")

In [54]:
# Fit the imputer on `data` and fill its missing entries in a single step.
# NOTE(review): the fit covers every column (presumably including the target
# `SeriousDlqin2yrs`) and all rows, before any train/test split — confirm this is intended.
imputed_data = imp.fit_transform(data)

In [56]:
# Wrap the imputed values in a DataFrame again, reusing both the column names and the
# row index of `data` so every row stays aligned with the frame it came from
# (omitting `index` would silently reset the labels to 0..n-1).
new_data = pd.DataFrame(imputed_data, columns=data.columns, index=data.index)

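After imputation, every column of `new_data` is `float64`, but most of them hold integer-valued data, so we convert those columns back to `int64`. Note that `astype('int64')` truncates any fractional (imputed) value toward zero.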

In [67]:
# Every column except the two continuous features is to be stored as an integer.
int_cols = [
    name
    for name in new_data.columns
    if name not in ('DebtRatio', 'MonthlyIncome')
]

In [70]:
# Cast all integer-valued columns in one call via a column -> dtype mapping,
# then show the resulting dtypes to confirm the conversion.
new_data = new_data.astype(dict.fromkeys(int_cols, 'int64'))

new_data.dtypes

SeriousDlqin2yrs                          int64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
NumberOfTimes90DaysLate                   int64
NumberOfTime60-89DaysPastDueNotWorse      int64
MonthlyIncome                           float64
NumberOfDependents                        int64
dtype: object

In [71]:
new_data.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0
1,0,58,0,3870.0,0,0,6452.263399,0
2,0,41,0,0.456127,0,0,6666.0,0
3,0,43,0,0.00019,0,0,10500.0,2
4,1,49,0,0.27182,0,0,400.0,0
