### Importing Packages

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 

import os
from loguru import logger

import warnings
warnings.filterwarnings('ignore')
import gc

In [2]:
mkdir train

### Common functions

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
#         print(f'{col}:{col_type}')
        
        if col_type not in [object,'category']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### Importing Data

##### Importing train base data

In [4]:
input_path = "/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/"
output_path = "/kaggle/working/train/"

In [5]:
for dirname, _, filenames in os.walk('/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/'):
    for filename in filenames:
        logger.info(f"Filename is:{filename}")

[32m2024-03-03 13:24:47.428[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_credit_bureau_a_1_3.csv[0m
[32m2024-03-03 13:24:47.430[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_static_cb_0.csv[0m
[32m2024-03-03 13:24:47.432[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_applprev_1_0.csv[0m
[32m2024-03-03 13:24:47.433[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_person_2.csv[0m
[32m2024-03-03 13:24:47.434[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_base.csv[0m
[32m2024-03-03 13:24:47.436[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_tax_registry_a_1.csv[0m
[32m2024-03-03 13:24:47.437[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFilename is:train_s

## Base data

In [6]:
df_base = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_base.csv")

In [7]:
df_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   case_id        1526659 non-null  int64 
 1   date_decision  1526659 non-null  object
 2   MONTH          1526659 non-null  int64 
 3   WEEK_NUM       1526659 non-null  int64 
 4   target         1526659 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 58.2+ MB


In [8]:
logger.info(f"Number of unique records: {df_base['case_id'].nunique()}")

[32m2024-03-03 13:24:49.329[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mNumber of unique records: 1526659[0m


In [9]:
df_base = reduce_mem_usage(df_base)

Memory usage of dataframe is 58.24 MB
Memory usage after optimization is: 17.49 MB
Decreased by 70.0%


### Datasets with depth 0 

#### Static data

In [10]:
df_static_0 = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_static_0_0.csv")
df_static_1 = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_static_0_1.csv")

In [11]:
df_static_0 = reduce_mem_usage(df_static_0)
df_static_1 = reduce_mem_usage(df_static_1)

Memory usage of dataframe is 1279.85 MB
Memory usage after optimization is: 361.68 MB
Decreased by 71.7%
Memory usage of dataframe is 666.73 MB
Memory usage after optimization is: 190.29 MB
Decreased by 71.5%


In [12]:
logger.info(f"Number of unique records in 0: {df_static_0['case_id'].nunique()}")
logger.info(f"Number of unique records in 1: {df_static_1['case_id'].nunique()}")

[32m2024-03-03 13:25:57.143[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mNumber of unique records in 0: 1003757[0m
[32m2024-03-03 13:25:57.159[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of unique records in 1: 522902[0m


In [13]:
## Combining the datasets
df_static = pd.concat([df_static_0,df_static_1],ignore_index=True)

In [14]:
df_static = reduce_mem_usage(df_static)

Memory usage of dataframe is 768.74 MB
Memory usage after optimization is: 552.16 MB
Decreased by 28.2%


In [15]:
logger.info(f"Total records in static dataset: {df_static['case_id'].nunique()}")

[32m2024-03-03 13:26:07.737[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mTotal records in static dataset: 1526659[0m


#### Static CB

In [16]:
df_static_cb = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_static_cb_0.csv")

In [17]:
logger.info(f"Number of unique records in 0: {df_static_cb['case_id'].nunique()}")

[32m2024-03-03 13:26:19.829[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mNumber of unique records in 0: 1500476[0m


In [18]:
df_static_cb = reduce_mem_usage(df_static_cb)
df_static_cb.info()

Memory usage of dataframe is 606.73 MB
Memory usage after optimization is: 159.58 MB
Decreased by 73.7%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500476 entries, 0 to 1500475
Data columns (total 53 columns):
 #   Column                   Non-Null Count    Dtype   
---  ------                   --------------    -----   
 0   case_id                  1500476 non-null  int32   
 1   assignmentdate_238D      136996 non-null   category
 2   assignmentdate_4527235D  114978 non-null   category
 3   assignmentdate_4955616D  71633 non-null    category
 4   birthdate_574D           607871 non-null   category
 5   contractssum_5085716L    157329 non-null   float32 
 6   dateofbirth_337D         1385691 non-null  category
 7   dateofbirth_342D         36500 non-null    category
 8   days120_123L             1385691 non-null  float16 
 9   days180_256L             1385691 non-null  float16 
 10  days30_165L              1385691 non-null  float16 
 11  days360_512L             1385691 non-

In [19]:
gc.collect()

0

##### Saving the new files 

In [20]:
!pwd

/kaggle/working


In [21]:
mkdir train

mkdir: cannot create directory 'train': File exists


In [22]:
df_base.to_parquet('train/train_base.parquet')
df_static.to_parquet('train/train_static.parquet')
df_static_cb.to_parquet('train/train_static_cb.parquet')

### Datasets with Depth 1

In [23]:
df_applprev_1_0 = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_0.csv")

df_applprev_1_1 = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_1.csv")

df_applprev = pd.concat([df_applprev_1_0,df_applprev_1_1],ignore_index=True)

In [24]:
df_applprev = reduce_mem_usage(df_applprev)

Memory usage of dataframe is 2041.36 MB
Memory usage after optimization is: 592.79 MB
Decreased by 71.0%


In [25]:
df_applprev.to_parquet('/kaggle/working/train/train_applprev.parquet')

In [26]:
df_other = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_other_1.csv")

In [27]:
df_other = reduce_mem_usage(df_other)

Memory usage of dataframe is 2.73 MB
Memory usage after optimization is: 1.22 MB
Decreased by 55.4%


In [28]:
df_other.to_parquet('/kaggle/working/train/train_other.parquet')

In [29]:
df_tax = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_tax_registry_a_1.csv")

In [30]:
df_tax = reduce_mem_usage(df_tax)

Memory usage of dataframe is 124.96 MB
Memory usage after optimization is: 52.03 MB
Decreased by 58.4%


In [31]:
df_tax[df_tax['case_id'] == 28631]

Unnamed: 0,case_id,amount_4527230A,name_4527232M,num_group1,recorddate_4527225D
0,28631,1946.0,f980a1ea,2,2019-09-13
1,28631,711.0,f980a1ea,3,2019-09-13
2,28631,3616.400146,f980a1ea,0,2019-09-13
3,28631,2600.0,f980a1ea,1,2019-09-13


In [32]:
df_tax.to_parquet('/kaggle/working/train/train_tax.parquet')

In [33]:
df_tax_registry = pd.read_csv(input_path+'train_tax_registry_a_1.csv')

In [34]:
df_tax_registry = reduce_mem_usage(df_tax_registry)

Memory usage of dataframe is 124.96 MB
Memory usage after optimization is: 52.03 MB
Decreased by 58.4%


In [35]:
df_tax_registry.to_parquet(output_path+"train_tax_registry.parquet")

In [36]:
df_tax_registry_b = pd.read_csv(input_path+"train_tax_registry_b_1.csv")

df_tax_registry_b = reduce_mem_usage(df_tax_registry_b)

df_tax_registry_b.to_parquet(output_path+"train_tax_registry_b.parquet")

Memory usage of dataframe is 42.26 MB
Memory usage after optimization is: 18.30 MB
Decreased by 56.7%


In [37]:
df_tax_registry_c = pd.read_csv(input_path+"train_tax_registry_c_1.csv")

df_tax_registry_c = reduce_mem_usage(df_tax_registry_c)

df_tax_registry_c.to_parquet(output_path+"train_tax_registry_c.parquet")

Memory usage of dataframe is 127.56 MB
Memory usage after optimization is: 53.04 MB
Decreased by 58.4%


In [38]:
df_train_credit_bureau_a_1_0 = pd.read_csv(input_path+"train_credit_bureau_a_1_0.csv")
df_train_credit_bureau_a_1_0 = reduce_mem_usage(df_train_credit_bureau_a_1_0)
df_train_credit_bureau_a_1_0.to_parquet(output_path+"train_credit_bureau_a_1_0.parquet")

df_train_credit_bureau_a_1_1 = pd.read_csv(input_path+"train_credit_bureau_a_1_1.csv")
df_train_credit_bureau_a_1_1 = reduce_mem_usage(df_train_credit_bureau_a_1_1)
df_train_credit_bureau_a_1_1.to_parquet(output_path+"train_credit_bureau_a_1_1.parquet")

df_train_credit_bureau_a_1_2 = pd.read_csv(input_path+"train_credit_bureau_a_1_2.csv")
df_train_credit_bureau_a_1_2 = reduce_mem_usage(df_train_credit_bureau_a_1_2)
df_train_credit_bureau_a_1_2.to_parquet(output_path+"train_credit_bureau_a_1_2.parquet")

df_train_credit_bureau_a_1_3 = pd.read_csv(input_path+"train_credit_bureau_a_1_3.csv")
df_train_credit_bureau_a_1_3 = reduce_mem_usage(df_train_credit_bureau_a_1_3)
df_train_credit_bureau_a_1_3.to_parquet(output_path+"train_credit_bureau_a_1_3.parquet")

Memory usage of dataframe is 2476.11 MB
Memory usage after optimization is: 836.31 MB
Decreased by 66.2%
Memory usage of dataframe is 3621.87 MB
Memory usage after optimization is: 1256.91 MB
Decreased by 65.3%
Memory usage of dataframe is 2256.48 MB
Memory usage after optimization is: 762.46 MB
Decreased by 66.2%
Memory usage of dataframe is 1253.25 MB
Memory usage after optimization is: 416.13 MB
Decreased by 66.8%


In [39]:
df_train_credit_bureau_b = pd.read_csv(input_path+"train_credit_bureau_b_1.csv")
df_train_credit_bureau_b = reduce_mem_usage(df_train_credit_bureau_b)
df_train_credit_bureau_b.to_parquet(output_path+"train_credit_bureau_b.parquet")

Memory usage of dataframe is 29.45 MB
Memory usage after optimization is: 10.33 MB
Decreased by 64.9%


In [40]:
df_train_deposit = pd.read_csv(input_path+"train_deposit_1.csv")
df_train_deposit = reduce_mem_usage(df_train_deposit)
df_train_deposit.to_parquet(output_path+"train_deposit.parquet")

Memory usage of dataframe is 5.53 MB
Memory usage after optimization is: 1.92 MB
Decreased by 65.4%


In [41]:
df_train_person = pd.read_csv(input_path+"train_person_1.csv")
df_train_person = reduce_mem_usage(df_train_person)
df_train_person.to_parquet(output_path+"train_person.parquet")

Memory usage of dataframe is 839.52 MB
Memory usage after optimization is: 159.73 MB
Decreased by 81.0%


In [42]:
df_train_debitcard = pd.read_csv(input_path+"train_debitcard_1.csv")
df_train_debitcard = reduce_mem_usage(df_train_debitcard)
df_train_debitcard.to_parquet(output_path+"train_debitcard_1.parquet")

Memory usage of dataframe is 7.20 MB
Memory usage after optimization is: 2.93 MB
Decreased by 59.4%


In [43]:
df_train_applprev_2 = pd.read_csv(input_path+"train_applprev_2.csv")
df_train_applprev_2 = reduce_mem_usage(df_train_applprev_2)
df_train_applprev_2.to_parquet(output_path+"train_applprev_2.parquet")

Memory usage of dataframe is 644.32 MB
Memory usage after optimization is: 120.81 MB
Decreased by 81.2%


In [44]:
df_train_person_2 = pd.read_csv(input_path+"train_person_2.csv")
df_train_person_2 = reduce_mem_usage(df_train_person_2)
df_train_person_2.to_parquet(output_path+"train_person_2.parquet")

Memory usage of dataframe is 137.92 MB
Memory usage after optimization is: 28.65 MB
Decreased by 79.2%


In [45]:
df_train_credit_bureau_a_2 = pd.read_csv(input_path+"train_credit_bureau_a_2_0.csv")
df_train_credit_bureau_a_2 = reduce_mem_usage(df_train_credit_bureau_a_2)
df_train_credit_bureau_a_2.to_parquet(output_path+"train_credit_bureau_a_2_0.parquet")

df_train_credit_bureau_a_2_1 = pd.read_csv(input_path+"train_credit_bureau_a_2_1.csv")
df_train_credit_bureau_a_2_1 = reduce_mem_usage(df_train_credit_bureau_a_2_1)
df_train_credit_bureau_a_2_1.to_parquet(output_path+"train_credit_bureau_a_2_1.parquet")

df_train_credit_bureau_a_2_2 = pd.read_csv(input_path+"train_credit_bureau_a_2_2.csv")
df_train_credit_bureau_a_2_2 = reduce_mem_usage(df_train_credit_bureau_a_2_2)
df_train_credit_bureau_a_2_2.to_parquet(output_path+"train_credit_bureau_a_2_2.parquet")

df_train_credit_bureau_a_2_3 = pd.read_csv(input_path+"train_credit_bureau_a_2_3.csv")
df_train_credit_bureau_a_2_3 = reduce_mem_usage(df_train_credit_bureau_a_2_3)
df_train_credit_bureau_a_2_3.to_parquet(output_path+"train_credit_bureau_a_2_3.parquet")

df_train_credit_bureau_a_2_4 = pd.read_csv(input_path+"train_credit_bureau_a_2_4.csv")
df_train_credit_bureau_a_2_4 = reduce_mem_usage(df_train_credit_bureau_a_2_4)
df_train_credit_bureau_a_2_4.to_parquet(output_path+"train_credit_bureau_a_2_4.parquet")

df_train_credit_bureau_a_2_5 = pd.read_csv(input_path+"train_credit_bureau_a_2_5.csv")
df_train_credit_bureau_a_2_5 = reduce_mem_usage(df_train_credit_bureau_a_2_5)
df_train_credit_bureau_a_2_5.to_parquet(output_path+"train_credit_bureau_a_2_5.parquet")

df_train_credit_bureau_a_2_6 = pd.read_csv(input_path+"train_credit_bureau_a_2_6.csv")
df_train_credit_bureau_a_2_6 = reduce_mem_usage(df_train_credit_bureau_a_2_6)
df_train_credit_bureau_a_2_6.to_parquet(output_path+"train_credit_bureau_a_2_6.parquet")

df_train_credit_bureau_a_2_7 = pd.read_csv(input_path+"train_credit_bureau_a_2_7.csv")
df_train_credit_bureau_a_2_7 = reduce_mem_usage(df_train_credit_bureau_a_2_7)
df_train_credit_bureau_a_2_7.to_parquet(output_path+"train_credit_bureau_a_2_7.parquet")

df_train_credit_bureau_a_2_8 = pd.read_csv(input_path+"train_credit_bureau_a_2_8.csv")
df_train_credit_bureau_a_2_8 = reduce_mem_usage(df_train_credit_bureau_a_2_8)
df_train_credit_bureau_a_2_8.to_parquet(output_path+"train_credit_bureau_a_2_8.parquet")

df_train_credit_bureau_a_2_9 = pd.read_csv(input_path+"train_credit_bureau_a_2_9.csv")
df_train_credit_bureau_a_2_9 = reduce_mem_usage(df_train_credit_bureau_a_2_9)
df_train_credit_bureau_a_2_9.to_parquet(output_path+"train_credit_bureau_a_2_9.parquet")

df_train_credit_bureau_a_2_10 = pd.read_csv(input_path+"train_credit_bureau_a_2_10.csv")
df_train_credit_bureau_a_2_10 = reduce_mem_usage(df_train_credit_bureau_a_2_10)
df_train_credit_bureau_a_2_10.to_parquet(output_path+"train_credit_bureau_a_2_10.parquet")

Memory usage of dataframe is 767.70 MB
Memory usage after optimization is: 333.35 MB
Decreased by 56.6%
Memory usage of dataframe is 1139.64 MB
Memory usage after optimization is: 307.40 MB
Decreased by 73.0%
Memory usage of dataframe is 2593.82 MB
Memory usage after optimization is: 733.78 MB
Decreased by 71.7%
Memory usage of dataframe is 3850.66 MB
Memory usage after optimization is: 1089.33 MB
Decreased by 71.7%
Memory usage of dataframe is 3917.61 MB
Memory usage after optimization is: 1108.27 MB
Decreased by 71.7%
Memory usage of dataframe is 4791.42 MB
Memory usage after optimization is: 1355.47 MB
Decreased by 71.7%
Memory usage of dataframe is 3698.08 MB
Memory usage after optimization is: 1046.17 MB
Decreased by 71.7%
Memory usage of dataframe is 1167.78 MB
Memory usage after optimization is: 330.36 MB
Decreased by 71.7%
Memory usage of dataframe is 2018.85 MB
Memory usage after optimization is: 571.12 MB
Decreased by 71.7%
Memory usage of dataframe is 2714.09 MB
Memory usage

In [46]:
df_train_credit_bureau_b_2 = pd.read_csv(input_path+"train_credit_bureau_b_2.csv")
df_train_credit_bureau_b_2 = reduce_mem_usage(df_train_credit_bureau_b_2)
df_train_credit_bureau_b_2.to_parquet(output_path+"train_credit_bureau_b_2.parquet")

Memory usage of dataframe is 58.90 MB
Memory usage after optimization is: 18.41 MB
Decreased by 68.7%
