In [64]:
#%pip install -r requirements.txt

In [65]:
import numpy as np 
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import pickle as pkl
import  gc
import glob
from tqdm import tqdm 
from dateutil.relativedelta import relativedelta
import os

In [66]:
class CFG:
    home_directory = os.path.expanduser('~/kaggle_HomeCredit/parquet_files/')
    train_data_path = os.path.join(home_directory, 'train/')
    test_data_path = os.path.join(home_directory, 'test/')
    
    train_applprev_path = 'train_applprev_*.parquet'
    train_base_path =  'train_base_*.parquet'
    train_credit_path = 'train_credit_bureau_*.parquet'
    train_debitcard_path = 'train_debitcard_1.parquet'
    train_deposit_path = 'train_deposit_1.parquet'
    train_other_path = 'train_other_1.parquet'
    train_person_path = 'train_person_*.parquet'
    train_static_path = 'train_static_*.parquet'
    train_tax__path = 'train_tax_registry_*.parquet'

In [67]:
#Helper functions
def reduce_mem_usage(df, int_cast=True, obj_to_category=False, subset=None):
    """
    データフレームの初期メモリ使用量を計算します。
    全ての列をイテレートし、データ型を確認します。
    数値型の列については、その範囲に基づいて、可能な限り小さい整数型または浮動小数点型に変換します。
    オブジェクト型の列（文字列など）は、必要に応じてカテゴリ型に変換されます。これにより、特にカテゴリの数が少ない場合にメモリ使用量が削減されます。
    変換後のデータフレームのメモリ使用量を再計算し、削減された割合を表示します。
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    gc.collect()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
#     cols_none = subset if subset is  None else df.columns.tolist()
#     for col_non in tqdm(cols_none):
#         df[col_non] = df[col_non].fillna(-888)
    
    cols = subset if subset is not None else df.columns.tolist()

    for col in tqdm(cols):
        col_type = df[col].dtype

        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
#             df[col] = df[col].fillna(-888)
            c_min = df[col].min()
            c_max = df[col].max()

#             # test if column can be converted to an integer
#             treat_as_int = str(col_type)[:3] == 'int'
#             if int_cast and not treat_as_int:
#                 treat_as_int = check_if_integer(df[col])
                
            treat_as_int = True
            if treat_as_int:
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8, errors='ignore')
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8, errors='ignore')
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16, errors='ignore')
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16, errors='ignore')
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32, errors='ignore')
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32, errors='ignore')
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64, errors='ignore')
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64, errors='ignore')
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16, errors='ignore')
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32, errors='ignore')
                else:
                    df[col] = df[col].astype(np.float64, errors='ignore')
        elif 'datetime' not in col_type.name and obj_to_category:
            df[col] = df[col].fillna('Mis')
            df[col] = df[col].astype('category')
    gc.collect()
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def date_column_depth_0(df):
    """
    特定の命名規則に基づいて日付列を特定し、それらを日付型に変換します。
    'date_decision'列と他の日付列との差分（日数）を計算し、新しい列としてデータフレームに追加します。
    """
    date_columns = ['date_decision'] + [x for x in df.columns if x[-1] == 'D'] 
    df[date_columns] = df[date_columns].apply(pd.to_datetime, errors='coerce')
    df_diff = df[date_columns].apply(lambda col: (df['date_decision'] - col).dt.days)
    df_diff.columns = [f'Diff_{col}' for col in df_diff.columns]
    df = pd.concat([df, df_diff], axis=1)
    return df


def union_parquest(list_parq):
    """
    指定されたParquetファイルのリストからデータフレームを読み込みます。
    各データフレームにreduce_mem_usage関数を適用し、メモリ使用量を削減します。
    これらのデータフレームを結合し、再度メモリ使用量の削減を試みます。
    """
    df_list = [reduce_mem_usage(pd.read_parquet(i)) for i in list_parq]
    union_df = pd.concat(df_list)
    union_df = reduce_mem_usage(union_df)
    return union_df   

def gini(x):
    """
    配列内の各値に対して、他の全ての値との差の絶対値を計算し、これらの差の総和を求めます。
    この総和を配列の長さの2乗と配列の平均値の積で正規化し、Gini係数を計算します。
    """
    total = 0
    for i, xi in enumerate(x[:-1], 1):
        total += np.sum(np.abs(xi - x[i:]))
    return total / (len(x)**2 * np.mean(x))

In [68]:
def multi_merge(base_data,train_vs_test,data_type):
    if train_vs_test ==  'train':
        file_path = CFG.train_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    elif train_vs_test ==  'test':
        file_path = CFG.test_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    df_i_merged = pd.DataFrame()
    
    for i in list_parq:
        print(i)
        df_i = pd.read_parquet(i)
        df_i = reduce_mem_usage(df_i)
        if 'num_group1' in df_i.columns: 
            df_i = df_i[df_i['num_group1'] == 0 ]
            df_i = df_i.drop(columns = 'num_group1')
    #         df_i_merged = df_i_merged.merge(df_i,how = 'left',on = 'case_id')
        df_i_merged = pd.concat([df_i_merged,df_i])
        del df_i
        gc.collect()
    return df_i_merged        

In [69]:
def multi_merge_v2(base_data,train_vs_test,data_type):
    if train_vs_test ==  'train':
        file_path = CFG.train_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    elif train_vs_test ==  'test':
        file_path = CFG.test_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    df_i_merged = pd.DataFrame()
    
    for i in list_parq:
        print(i)
        df_i = pd.read_parquet(i)
        df_i = reduce_mem_usage(df_i)
        if 'num_group1' in df_i.columns: 
#             df_i = df_i[df_i['num_group1'] == 0 ]
            df_i = df_i.drop(columns = 'num_group1')
    #         df_i_merged = df_i_merged.merge(df_i,how = 'left',on = 'case_id')
        df_i_merged = pd.concat([df_i_merged,df_i])
        del df_i
        gc.collect()
    return df_i_merged        

In [70]:
def multi_merge_depth_2(base_data,train_vs_test,data_type):
    if train_vs_test ==  'train':
        file_path = CFG.train_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    elif train_vs_test ==  'test':
        file_path = CFG.test_data_path
        list_parq =  [file_path + '/' + i for i in os.listdir(file_path) if data_type in i ] 
        
    df_i_merged = pd.DataFrame()
    
    for i in list_parq:
        print(i)
        df_i = pd.read_parquet(i)
        df_i = reduce_mem_usage(df_i)
        if 'num_group1' in df_i.columns: 
            df_i = df_i[df_i['num_group1'] == 0 ]
            df_i = df_i.select_dtypes(exclude=['object'])
            df_i = pd.pivot_table(df_i,index = 'case_id', aggfunc= {'max','min'})
            df_i.columns = [f'{j}_{i}' if j != '' else f'{i}' for i,j in df_i.columns]
            df_i = df_i.drop(columns = [ i for i in df_i.columns if 'num_group' in i])
        df_i_merged = pd.concat([df_i_merged,df_i])
        del df_i
        gc.collect()
    return df_i_merged        

In [71]:
#Train
train_base_df = pd.read_parquet(CFG.train_data_path + 'train_base.parquet')
train_base_df = reduce_mem_usage(train_base_df)

Memory usage of dataframe is 58.24 MB


100%|██████████| 5/5 [00:00<00:00, 298.38it/s]

Memory usage after optimization is: 26.207 MB
Decreased by 55.0%





In [72]:
df_merged = train_base_df[['case_id']]
variable_type_list = ['train_static_0',
                      'train_static_cb_0',
                      'train_applprev_1',
                      'train_credit_bureau_a_1',
                     'train_credit_bureau_b_1',
                     'train_debitcard_1',
                     'train_deposit_1',
                     'train_person_1',
                     'train_tax_registry_a_1',
                     'train_tax_registry_b_1',
                     'train_tax_registry_c_1']
for k in variable_type_list:
    df_k = multi_merge(train_base_df,'train',k)
    df_merged = df_merged.merge(df_k,how = 'outer',on = 'case_id')
    del df_k
    gc.collect()
    
    
#Merge with Base
df_merged_train_depth_1_0 = train_base_df.merge(df_merged,how = 'left',on = 'case_id')
del df_merged
#Convert date columns to difference
date_columns_train_depth_1_0 = [x for x in df_merged_train_depth_1_0.columns if x[-1] == 'D']
df_merged_train_depth_1_0 = date_column_depth_0(df_merged_train_depth_1_0)


df_merged_train_depth_1_0 = df_merged_train_depth_1_0.drop(columns = date_columns_train_depth_1_0)
gc.collect()

# df_merged_train_depth_1_0.to_parquet('/kaggle/working/df_merged_train_depth_1_0.parquet')
del df_merged_train_depth_1_0
gc.collect()

/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_static_0_0.parquet
Memory usage of dataframe is 1279.85 MB


100%|██████████| 168/168 [00:00<00:00, 183.78it/s]


Memory usage after optimization is: 1045.325 MB
Decreased by 18.3%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_static_0_1.parquet
Memory usage of dataframe is 666.73 MB


100%|██████████| 168/168 [00:00<00:00, 392.61it/s]


Memory usage after optimization is: 547.050 MB
Decreased by 18.0%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_static_cb_0.parquet
Memory usage of dataframe is 606.73 MB


100%|██████████| 53/53 [00:00<00:00, 182.07it/s]


Memory usage after optimization is: 601.006 MB
Decreased by 0.9%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_applprev_1_0.parquet
Memory usage of dataframe is 1216.09 MB


100%|██████████| 41/41 [00:00<00:00, 61.73it/s]


Memory usage after optimization is: 1175.304 MB
Decreased by 3.4%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_applprev_1_1.parquet
Memory usage of dataframe is 825.27 MB


100%|██████████| 41/41 [00:00<00:00, 108.00it/s]


Memory usage after optimization is: 797.596 MB
Decreased by 3.4%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_1_0.parquet
Memory usage of dataframe is 2476.11 MB


100%|██████████| 79/79 [00:01<00:00, 44.13it/s]


Memory usage after optimization is: 2436.932 MB
Decreased by 1.6%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_1_1.parquet
Memory usage of dataframe is 3621.87 MB


100%|██████████| 79/79 [00:02<00:00, 26.84it/s]


Memory usage after optimization is: 3564.565 MB
Decreased by 1.6%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_1_3.parquet
Memory usage of dataframe is 1253.25 MB


100%|██████████| 79/79 [00:01<00:00, 73.87it/s] 


Memory usage after optimization is: 1233.424 MB
Decreased by 1.6%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_1_2.parquet
Memory usage of dataframe is 2256.48 MB


100%|██████████| 79/79 [00:01<00:00, 43.14it/s]


Memory usage after optimization is: 2220.774 MB
Decreased by 1.6%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_b_1.parquet
Memory usage of dataframe is 29.45 MB


100%|██████████| 45/45 [00:00<00:00, 1865.37it/s]

Memory usage after optimization is: 28.554 MB
Decreased by 3.1%





/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_debitcard_1.parquet
Memory usage of dataframe is 7.20 MB


100%|██████████| 6/6 [00:00<00:00, 796.79it/s]

Memory usage after optimization is: 5.551 MB
Decreased by 22.9%





/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_deposit_1.parquet
Memory usage of dataframe is 5.53 MB


100%|██████████| 5/5 [00:00<00:00, 1589.84it/s]

Memory usage after optimization is: 3.459 MB
Decreased by 37.5%





/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_person_1.parquet
Memory usage of dataframe is 839.52 MB


100%|██████████| 37/37 [00:00<00:00, 290.07it/s]

Memory usage after optimization is: 808.322 MB
Decreased by 3.7%





/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_tax_registry_a_1.parquet
Memory usage of dataframe is 124.96 MB


100%|██████████| 5/5 [00:00<00:00, 246.19it/s]

Memory usage after optimization is: 78.101 MB
Decreased by 37.5%





/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_tax_registry_b_1.parquet
Memory usage of dataframe is 42.26 MB


100%|██████████| 5/5 [00:00<00:00, 702.85it/s]


Memory usage after optimization is: 26.415 MB
Decreased by 37.5%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_tax_registry_c_1.parquet
Memory usage of dataframe is 127.56 MB


100%|██████████| 5/5 [00:00<00:00, 259.43it/s]

Memory usage after optimization is: 79.723 MB
Decreased by 37.5%





0

In [73]:
df_merged = train_base_df[['case_id']]
df_k = multi_merge(df_merged,'train','train_credit_bureau_a_2')


train_bureau = glob.glob(os.path.join(CFG.train_data_path,CFG.train_credit_path))
credit_bureau_b_2 = pd.read_parquet(train_bureau)
# credit_bureau_b_2 = credit_bureau_b_2[credit_bureau_b_2['num_group1'] == 0 ]
credit_bureau_b_2 = credit_bureau_b_2.rename(columns = {'pmts_date_1107D':'record_date','pmts_dpdvalue_108P':'max_dpd'})
credit_bureau_b_2 = credit_bureau_b_2.dropna(subset = 'record_date')
credit_bureau_b_2 = credit_bureau_b_2[['case_id','record_date','max_dpd']]
credit_bureau_b_2['record_date'] = pd.to_datetime(credit_bureau_b_2['record_date'])



# df_k = df_k[df_k['num_group1'] == 0 ]
df_k = df_k.select_dtypes(exclude=['object'])
#Get max record date
df_k['record_date'] = pd.to_datetime(df_k[['pmts_year_1139T', 'pmts_year_507T']].max(axis = 1).astype('Int64').astype(str) +  '-'  +df_k[['pmts_month_158T', 'pmts_month_706T']].max(axis = 1).astype('Int64').astype(str) +  '-' +  '1',errors= 'coerce')
#Get max dpd
df_k['max_dpd'] = df_k[['pmts_dpd_1073P', 'pmts_dpd_303P']].max(axis = 1)
df_k = df_k[['case_id','record_date','max_dpd']]
df_k = pd.concat([df_k,credit_bureau_b_2],axis = 0)
#Merge with base
df_k_merged = train_base_df[['case_id','date_decision']].merge(df_k[['case_id','record_date','max_dpd']], how = 'inner', on ='case_id')
#Delete df_k
del df_k
gc.collect()


df_k_merged['date_decision'] = pd.to_datetime(df_k_merged['date_decision'])
df_k_merged = df_k_merged.assign(
    time_diff=
    (df_k_merged.date_decision.dt.year - df_k_merged.record_date.dt.year) * 12 +
    (df_k_merged.date_decision.dt.month - df_k_merged.record_date.dt.month)
)
df_k_merged = df_k_merged[df_k_merged['time_diff'] >= 0]
df_k_merged['max_dpd'] = df_k_merged['max_dpd'].fillna(0)
# df_k_merged.loc[(df_k_merged['time_diff'] > 0) & (df_k_merged['time_diff'] <= 3),'time_diff_cat'] = '0_3_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 3) & (df_k_merged['time_diff'] <= 6),'time_diff_cat'] = '3_6_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 6) & (df_k_merged['time_diff'] <= 9),'time_diff_cat'] = '6_9_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 9) & (df_k_merged['time_diff'] <= 12),'time_diff_cat'] = '9_12_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 12) & (df_k_merged['time_diff'] <= 18),'time_diff_cat'] = '12_18_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 18) & (df_k_merged['time_diff'] <= 24),'time_diff_cat'] = '18_24_months'

# df_k_merged.loc[(df_k_merged['time_diff'] > 0) & (df_k_merged['time_diff'] <= 3),'time_diff_cat'] = '0_3_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 3) & (df_k_merged['time_diff'] <= 6),'time_diff_cat'] = '3_6_months'
# df_k_merged.loc[(df_k_merged['time_diff'] > 6) & (df_k_merged['time_diff'] <= 9),'time_diff_cat'] = '6_9_months'
df_k_merged.loc[(df_k_merged['time_diff'] > 0) & (df_k_merged['time_diff'] <= 6),'time_diff_cat'] = '0_6_months'
df_k_merged.loc[(df_k_merged['time_diff'] > 0) & (df_k_merged['time_diff'] <= 12),'time_diff_cat'] = '0_12_months'
df_k_merged.loc[(df_k_merged['time_diff'] > 0) & (df_k_merged['time_diff'] <= 24),'time_diff_cat'] = '0_24_months'
df_k_merged.loc[(df_k_merged['time_diff'] > 24),'time_diff_cat'] = '24_months'
df_k_pivot = pd.pivot_table(df_k_merged,index = 'case_id', columns = 'time_diff_cat',values = 'max_dpd',aggfunc= {'max','min','mean','median'})
del df_k_merged
gc.collect()

df_k_pivot.columns = [f'{i}_{j}' if j != '' else f'{i}' for i,j in df_k_pivot.columns]
df_k_pivot = df_k_pivot.fillna(0)


/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_4.parquet
Memory usage of dataframe is 3917.61 MB


100%|██████████| 19/19 [00:02<00:00,  8.93it/s]


Memory usage after optimization is: 3479.457 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_5.parquet
Memory usage of dataframe is 4791.42 MB


100%|██████████| 19/19 [00:02<00:00,  6.98it/s]


Memory usage after optimization is: 4255.541 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_10.parquet
Memory usage of dataframe is 635.80 MB


100%|██████████| 19/19 [00:00<00:00, 69.68it/s]


Memory usage after optimization is: 560.505 MB
Decreased by 11.8%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_7.parquet
Memory usage of dataframe is 1167.78 MB


100%|██████████| 19/19 [00:00<00:00, 37.74it/s]


Memory usage after optimization is: 1037.176 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_6.parquet
Memory usage of dataframe is 3698.08 MB


100%|██████████| 19/19 [00:02<00:00,  8.33it/s]


Memory usage after optimization is: 3284.483 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_3.parquet
Memory usage of dataframe is 3850.66 MB


100%|██████████| 19/19 [00:02<00:00,  8.51it/s]


Memory usage after optimization is: 3419.997 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_2.parquet
Memory usage of dataframe is 2593.82 MB


100%|██████████| 19/19 [00:01<00:00, 14.65it/s]


Memory usage after optimization is: 2303.722 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_9.parquet
Memory usage of dataframe is 2714.09 MB


100%|██████████| 19/19 [00:01<00:00, 12.74it/s]


Memory usage after optimization is: 2410.541 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_0.parquet
Memory usage of dataframe is 767.70 MB


100%|██████████| 19/19 [00:00<00:00, 76.38it/s]


Memory usage after optimization is: 676.792 MB
Decreased by 11.8%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_1.parquet
Memory usage of dataframe is 1139.64 MB


100%|██████████| 19/19 [00:00<00:00, 37.93it/s]


Memory usage after optimization is: 1012.177 MB
Decreased by 11.2%
/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train//train_credit_bureau_a_2_8.parquet
Memory usage of dataframe is 2018.85 MB


100%|██████████| 19/19 [00:01<00:00, 16.75it/s]


Memory usage after optimization is: 1793.055 MB
Decreased by 11.2%


: 

In [None]:
# df_k_pivot.to_parquet('/kaggle/working/df_merged_train_depth_2_v2.parquet')
del df_k_pivot
gc.collect()

In [None]:
df_merged_train_depth_1_0 = pd.read_parquet('/kaggle/input/efficient-data-read-only-pandas-lgbm/df_merged_train_depth_1_0.parquet')
df_merged_train_depth_2 = pd.read_parquet('/kaggle/input/efficient-data-read-only-pandas-lgbm/df_merged_train_depth_2_v2.parquet')
df_merged_train_depth_1_0 = reduce_mem_usage(df_merged_train_depth_1_0)
df_merged_train_depth_2 = reduce_mem_usage(df_merged_train_depth_2)
df_merged_train = df_merged_train_depth_1_0.merge(df_merged_train_depth_2,how = 'left',on=  'case_id')
del df_merged_train_depth_1_0
del df_merged_train_depth_2
gc.collect()

In [None]:
#Fill Missialue
num_cols = df_merged_train.select_dtypes(include=np.number).columns
df_merged_train[num_cols] = df_merged_train[num_cols].fillna(0)

object_cols = df_merged_train.select_dtypes(include='object').columns
df_merged_train[object_cols] = df_merged_train[object_cols].fillna('Mis')
df_merged_train = df_merged_train.drop_duplicates(subset= 'case_id')    
    
#Reindexing
identifier_cols = ['date_decision','MONTH']
target = 'target'
# Reindex
df_merged_train = df_merged_train.set_index(['case_id','WEEK_NUM']) 

In [None]:
#Define X,y
X = df_merged_train.drop(columns = identifier_cols + [target])
X = X.select_dtypes(exclude=['object'])
y = df_merged_train['target']
#Delete data
del df_merged_train
gc.collect()
#Pick some weeks from starting and some weeks from end as OOT
oot_weeks = [0,  1,  2,  3, 
                        48, 49, 50, 51, 52,
                        87, 88, 89,90, 91]
#oot df
X_oot = X[X.index.isin(oot_weeks,level = 1)]
y_oot = y[y.index.isin(oot_weeks,level = 1)]

#training df
X = X[~X.index.isin(oot_weeks,level = 1)]
y = y[~y.index.isin(oot_weeks,level = 1)]


#Train test split(stratified with WEEK_NUM in index 1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, stratify= list(X.index.get_level_values(1)) , random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val,stratify= list(X_val.index.get_level_values(1)) ,test_size=0.50, random_state=42)
#delete
del X,y
gc.collect()

In [None]:
def investigate_data(file_path):
    for file in file_path:
        file_name = os.path.basename(file)
        df = pl.read_parquet(file)
        print(f"ファイル名: {file_name}")
        print(df)

In [None]:
file_path = glob.glob(os.path.join(CFG.train_data_path,CFG.train_applprev_path))
investigate_data(file_path)

ファイル名: train_applprev_2.parquet
shape: (14_075_487, 6)
┌─────────┬─────────────────────┬──────────────────┬─────────────────────┬────────────┬────────────┐
│ case_id ┆ cacccardblochreas_1 ┆ conts_type_509L  ┆ credacc_cards_statu ┆ num_group1 ┆ num_group2 │
│ ---     ┆ 47M                 ┆ ---              ┆ s_52L               ┆ ---        ┆ ---        │
│ i64     ┆ ---                 ┆ str              ┆ ---                 ┆ i64        ┆ i64        │
│         ┆ str                 ┆                  ┆ str                 ┆            ┆            │
╞═════════╪═════════════════════╪══════════════════╪═════════════════════╪════════════╪════════════╡
│ 2       ┆ null                ┆ EMPLOYMENT_PHONE ┆ null                ┆ 1          ┆ 1          │
│ 2       ┆ null                ┆ EMPLOYMENT_PHONE ┆ null                ┆ 0          ┆ 1          │
│ 2       ┆ null                ┆ PRIMARY_MOBILE   ┆ null                ┆ 0          ┆ 0          │
│ 2       ┆ null                ┆ PR