In [22]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [23]:
def load(path='../data/credir_risk_reto.xlsx'):
    """Read the credit-risk workbook and print a quick overview of it.

    Prints the first rows, the dtype / non-null summary and the number of
    missing values per column, then returns the loaded frame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Excel file to read. The default is the path that
        used to be hard-coded, so existing ``load()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The data as returned by ``pd.read_excel``.
    """
    data = pd.read_excel(path)

    print("DATA HEAD")
    print()
    print(data.head())

    print("DATA TYPES AND FEATURES")
    print()
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() appended a stray "None" line to the output.
    data.info()
    print("-"*100)
    print()
    print("MISSING VALUES")
    print()
    print(data.isna().sum())  # number of missing values per feature
    print("-"*100)
    print()

    return data

In [29]:
def impute(data, output_path="../data/credit_risk_imputed.xlsx"):
    """Fill missing categorical values with MICE and save the result to Excel.

    The categorical columns are ordinal-encoded (only so the imputer can work
    on numbers), missing entries are estimated with ``IterativeImputer``, and
    the imputed codes are rounded, clamped to the codes produced by the
    encoder and decoded back to their original labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'Sex', 'Housing', 'Saving accounts',
        'Checking account' and 'Purpose'; every other column must be numeric.
        The frame is not modified.
    output_path : str or path-like, optional
        Where the imputed frame is written. The default is the path that
        used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The imputed frame that was written to ``output_path``.
    """
    # Missing values were observed in 'Saving accounts' (183) and
    # 'Checking account' (394), both categorical. MICE (a sequential regressor
    # for non-monotone missingness) is used to fill them.
    categorical_features = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

    # Encode on a copy so the caller's frame keeps its labels and NaNs.
    encoded = data.copy()
    ordinal_encoder = OrdinalEncoder()
    encoded[categorical_features] = ordinal_encoder.fit_transform(encoded[categorical_features])

    # Apply MICE imputation; keep the original index and column names.
    imputer = IterativeImputer(random_state=42)
    data_imputed = pd.DataFrame(
        imputer.fit_transform(encoded),
        columns=encoded.columns,
        index=encoded.index,
    )

    for column in categorical_features:
        # The regressor is unbounded, so a rounded prediction can fall outside
        # the encoder's codes. Such a code would decode to NaN / the wrong
        # label or raise, so clamp to the range actually produced by encoding.
        highest_code = encoded[column].max()
        data_imputed[column] = (
            data_imputed[column].round().clip(lower=0, upper=highest_code).astype(int)
        )

    # Decode the integer codes back to the original categorical labels.
    data_imputed[categorical_features] = ordinal_encoder.inverse_transform(
        data_imputed[categorical_features]
    )

    # The imputer returns floats for every column; columns that had nothing to
    # impute get their original dtype back (e.g. integers stay integers).
    for column in data.columns:
        if column not in categorical_features and not data[column].isna().any():
            data_imputed[column] = data_imputed[column].astype(data[column].dtype)

    data_imputed.to_excel(output_path, index=False)
    return data_imputed

In [28]:
def main():
    """Run the pipeline: load the raw data, then impute it."""
    impute(load())

In [27]:
# Entry point: runs load() and passes the loaded data to impute().
main()

DATA HEAD

   Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0   67    male    2     own             NaN           little           1169   
1   22  female    2     own          little         moderate           5951   
2   49    male    1     own          little              NaN           2096   
3   45    male    2    free          little           little           7882   
4   53    male    2    free          little           little           4870   

   Duration              Purpose  
0         6             radio/TV  
1        48             radio/TV  
2        12            education  
3        42  furniture/equipment  
4        24                  car  
DATA TYPES AND FEATURES

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null