In [2]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

In [3]:
# Read the modelling dataset from CSV; the file's first column becomes the row index
df = pd.read_csv(filepath_or_buffer='cat_model_data.csv', index_col=0)

In [4]:
# Work on an independent deep copy so the loaded frame `df` is not modified
# by the encoding and imputation steps applied to `df_iter_impute`
df_iter_impute = df.copy(deep=True)

In [5]:
# Count the missing values in each column to see which features need imputing
df_iter_impute.isna().sum()

MIS_Status                    0
City                          0
State                         0
Zip                           0
Bank                          0
BankState                     0
NAICS                         0
Term                          0
NoEmp                         0
NewExist                   1162
CreateJob                     0
RetainedJob                   0
FranchiseCode                 0
UrbanRural               322826
RevLineCr                277255
LowDoc                        0
GrAppv                        0
NAICS_class_code              0
Industry                      0
FranchiseCode_Encoded         0
RealEstate_Backed             0
Region                        0
TermDays                      0
Recession                     0
dtype: int64

#### Dealing with missing values

- Create 'missing' variables for columns with NaNs

In [6]:
# Show the column labels of the working frame
df_iter_impute.columns

Index(['MIS_Status', 'City', 'State', 'Zip', 'Bank', 'BankState', 'NAICS',
       'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
       'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'GrAppv',
       'NAICS_class_code', 'Industry', 'FranchiseCode_Encoded',
       'RealEstate_Backed', 'Region', 'TermDays', 'Recession'],
      dtype='object')

In [7]:
# Lift pandas' display limits (rows, columns, cell width) for the whole session
# so wide frames are rendered without truncation
for option_name in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Preview the first rows of the working frame
df_iter_impute.head()

Unnamed: 0,MIS_Status,City,State,Zip,Bank,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,GrAppv,NAICS_class_code,Industry,FranchiseCode_Encoded,RealEstate_Backed,Region,TermDays,Recession
0,PIF,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,84,4,2.0,0,0,1,,N,Y,60000.0,45,Retail trade,No,No,Eastern,2520,0
1,PIF,NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,60,2,2.0,0,0,1,,N,Y,40000.0,72,Accommodation and food services,No,No,Eastern,1800,0
2,PIF,BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,180,7,1.0,0,0,1,,N,N,287000.0,62,Health care and social assistance,No,No,Eastern,5400,0
3,PIF,BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,60,2,1.0,0,0,1,,N,Y,35000.0,81,Other services,No,No,Eastern,1800,0
4,PIF,ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,240,14,1.0,7,7,1,,N,N,229000.0,81,Other services,No,Yes,Eastern,7200,0


In [8]:
# Create or encode features.
#
# The frame is rebuilt from the untouched raw frame `df` in a single step, so
# this cell is idempotent. Mutating `df_iter_impute` in place meant a second run
# silently corrupted the already-encoded columns: the 'Yes'/'No' flags became
# all 0 and the 'Y'/'N' flags became all NaN.
#
# `assign` keeps existing columns in their original position and appends new
# ones in keyword order, so the resulting column order is:
#   <original columns>, MIS_Status_Encoded, CreateJob_Encoded, NoEmp_Encoded
df_iter_impute = df.assign(
    # Loan outcome label: paid in full -> 1, charged off -> 0 (other values -> NaN)
    MIS_Status_Encoded=df['MIS_Status'].map({"PIF": 1, "CHGOFF": 0}),
    # 1 when the count is strictly positive, otherwise 0 (NaN compares False -> 0)
    CreateJob_Encoded=df['CreateJob'].gt(0).astype('int64'),
    # 'Yes' -> 1, anything else (including NaN) -> 0
    FranchiseCode_Encoded=df['FranchiseCode_Encoded'].eq('Yes').astype('int64'),
    # Keep only the codes 1 and 2; any other value becomes NaN, NaN stays NaN
    NewExist=df['NewExist'].map({1.0: 1, 2.0: 2}, na_action='ignore'),
    UrbanRural=df['UrbanRural'].map({1.0: 1, 2.0: 2}, na_action='ignore'),
    # 'Yes' -> 1, anything else (including NaN) -> 0
    RealEstate_Backed=df['RealEstate_Backed'].eq('Yes').astype('int64'),
    # 'Y' -> 1, 'N' -> 0; any other value becomes NaN, NaN stays NaN
    LowDoc=df['LowDoc'].map({'Y': 1, 'N': 0}, na_action='ignore'),
    RevLineCr=df['RevLineCr'].map({'Y': 1, 'N': 0}, na_action='ignore'),
    # 1 when the count is strictly positive, otherwise 0
    NoEmp_Encoded=df['NoEmp'].gt(0).astype('int64'),
)

In [9]:
# Count the missing values per column again, now covering the encoded columns as well
df_iter_impute.isna().sum()

MIS_Status                    0
City                          0
State                         0
Zip                           0
Bank                          0
BankState                     0
NAICS                         0
Term                          0
NoEmp                         0
NewExist                   1162
CreateJob                     0
RetainedJob                   0
FranchiseCode                 0
UrbanRural               322826
RevLineCr                277255
LowDoc                        0
GrAppv                        0
NAICS_class_code              0
Industry                      0
FranchiseCode_Encoded         0
RealEstate_Backed             0
Region                        0
TermDays                      0
Recession                     0
MIS_Status_Encoded            0
CreateJob_Encoded             0
NoEmp_Encoded                 0
dtype: int64

In [10]:
# Show the column labels of the working frame, including any newly added columns
df_iter_impute.columns

Index(['MIS_Status', 'City', 'State', 'Zip', 'Bank', 'BankState', 'NAICS',
       'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
       'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'GrAppv',
       'NAICS_class_code', 'Industry', 'FranchiseCode_Encoded',
       'RealEstate_Backed', 'Region', 'TermDays', 'Recession',
       'MIS_Status_Encoded', 'CreateJob_Encoded', 'NoEmp_Encoded'],
      dtype='object')

- Iteratively impute the missing values in `NewExist`, `UrbanRural` and `RevLineCr` (the only columns that contain NaNs)

In [None]:
# `enable_iterative_imputer` must be imported before `IterativeImputer`:
# the estimator is still experimental and is only exposed after this import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

# Select the columns handed to the imputer (column selection returns a new frame).
# NOTE(review): MIS_Status_Encoded looks like the modelling label; using it as a
# predictor while imputing features can leak label information into the
# features - confirm this is intended before training on `df_imputed`.
df_target = df_iter_impute[['MIS_Status_Encoded', 'Term', 'LowDoc', 'GrAppv',
       'NAICS_class_code', 'RealEstate_Backed', 'Recession',
        'UrbanRural', 'RevLineCr', 'NewExist']]

# Initialize the IterativeImputer.
# - skip_complete=True: only fit an estimator for columns that actually contain
#   NaNs. With the default (False) a classifier is also fitted every round for
#   the complete columns (e.g. GrAppv, Term), which is very expensive and fails
#   outright if such a column holds non-integral floats.
# - The forest gets its own random_state: the imputer's random_state does not
#   seed the wrapped estimator, so without it results differ between runs.
imputer = IterativeImputer(
    estimator=RandomForestClassifier(random_state=42),
    random_state=42,
    n_nearest_features=4,
    skip_complete=True,
)

# Fit and transform the selected columns; the result is a plain NumPy array
imputed_values = imputer.fit_transform(df_target)

# Wrap the array back into a DataFrame, restoring both the column labels and the
# original row index so the result stays aligned with `df_iter_impute`
df_imputed = pd.DataFrame(imputed_values, columns=df_target.columns, index=df_target.index)