In [1]:
import pandas as pd
import polars as pl
from sklearn.preprocessing import LabelEncoder
import numpy as np

ROOT_DIR = './../home-credit-credit-risk-model-stability/'

In [2]:
# Utils to format the tables
def set_table_dtypes(df):
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

def inspect_columns(df):
    df = pd.DataFrame(df)
    result = pd.DataFrame({
        'unique': df.nunique() == len(df),
        'cardinality': df.nunique(),
        'with_null': df.isna().any(),
        'null_pct': round((df.isnull().sum() / len(df)) * 100, 2),
        '1st_row': df.iloc[0],
        'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
        'last_row': df.iloc[-1],
        'dtype': df.dtypes
    })
    return result

In [3]:
train_appl_prev_depth2 = pd.read_csv(ROOT_DIR + "csv_files/train/train_applprev_2.csv").pipe(set_table_dtypes)

In [4]:
inspect_columns(train_appl_prev_depth2)

Unnamed: 0,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
case_id,False,1221522,False,0.0,2,2536086,2703454,int64
cacccardblochreas_147M,False,9,True,0.78,,a55475b1,a55475b1,object
conts_type_509L,False,9,True,17.01,EMPLOYMENT_PHONE,EMPLOYMENT_PHONE,HOME_PHONE,object
credacc_cards_status_52L,False,6,True,97.57,,,,object
num_group1,False,20,False,0.0,1,3,1,int64
num_group2,False,12,False,0.0,1,0,1,int64


### Clean up the data

In [6]:
def appl_prev_depth2_transform(df):
    df.drop('credacc_cards_status_52L', axis=1, inplace=True) # Drop Card status of the previous credit account:  Because 98% is 0
    df.drop('cacccardblochreas_147M', axis=1, inplace=True)   # Drop Card block reason: because 99% of data has value a55475b1.

    df['conts_type_509L'] = LabelEncoder().fit_transform(df['conts_type_509L']) # Encode ordinal value

appl_prev_depth2_transform(train_appl_prev_depth2)

In [7]:
inspect_columns(train_appl_prev_depth2)

Unnamed: 0,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
case_id,False,1221522,False,0.0,2,2672730,2703454,int64
conts_type_509L,False,10,False,0.0,1,2,2,int32
num_group1,False,20,False,0.0,1,0,1,int64
num_group2,False,12,False,0.0,1,1,1,int64


In [8]:
train_appl_prev_depth2

Unnamed: 0,case_id,conts_type_509L,num_group1,num_group2
0,2,1,1,1
1,2,1,0,1
2,2,5,0,0
3,2,5,1,0
4,3,5,0,1
...,...,...,...,...
14075482,2703454,5,0,0
14075483,2703454,9,1,3
14075484,2703454,9,0,1
14075485,2703454,5,1,0


In [9]:
train_appl_prev_depth2.to_csv('./../home-credit-credit-risk-model-stability/normalized_csv/train/train_appl_prev_depth2.csv', index=True)