In [37]:
# %additional_python_modules duckdb==1.4.2

In [38]:
# %idle_timeout 10
# %glue_version 5.0
# %worker_type G.1X
# %number_of_workers 2

# import sys
# from awsglue.transforms import *
# from awsglue.utils import getResolvedOptions
# from pyspark.context import SparkContext
# from awsglue.context import GlueContext
# from awsglue.job import Job
  
# sc = SparkContext.getOrCreate()
# glueContext = GlueContext(sc)
# spark = glueContext.spark_session
# job = Job(glueContext)

In [39]:
# Import check: this cell raises ImportError if duckdb is not available in the kernel
import duckdb
print("Installed package")

Installed package


In [40]:
import pandas as pd
import numpy as np
import gc

In [41]:
def drop_columns(df, cols):
    """
    Drops, in place, every column listed in `cols` that exists in `df`.

    df: pandas DataFrame
        The dataframe to modify. It is mutated in place; nothing is returned.

    cols: list
        Candidate column names. Names that are not columns of `df` are ignored.

    Returns:
    None. A one-line summary of the columns actually removed is printed.
    """
    # Resolve the existing names once so the printed summary matches what was removed
    present = [col for col in cols if col in df.columns]
    df.drop(columns=present, inplace=True)

    print(f"Dropped {len(present)} columns: {present}")

def find_correlation_high_missing(df, threshold=0.8):
    """
    Finds pairs of numeric columns whose absolute correlation is above `threshold`
    and, for each pair, selects the column with more missing values for removal.

    df: pandas DataFrame
        The dataframe to check for correlations. Only numeric columns are considered.
        The dataframe itself is not modified.

    threshold: float
        The absolute correlation a pair must strictly exceed to be considered "high".

    Returns:
    list: Column names selected for removal, in arbitrary order (built from a set).
        For a pair (earlier column, later column):
        - the later column is selected when it has strictly more missing values;
        - nothing is selected when neither column has missing values;
        - otherwise the earlier column is selected (this includes ties).
    """
    numeric = df.select_dtypes(include=[np.number])

    # Calculate the correlation matrix for numeric columns and take the absolute values
    corr_matrix = numeric.corr().abs()

    # Get the upper triangle of the correlation matrix (excluding the diagonal)
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Count missing values once per column instead of rescanning both columns
    # for every correlated pair
    missing = numeric.isnull().sum()

    # Set to store columns to drop (those with the most missing values)
    columns_to_drop = set()

    # col1 walks the matrix columns, col2 the rows; only cells above the diagonal hold
    # a value, and comparing NaN with the threshold is False, so each pair is seen once
    for col1 in upper.columns:
        missing_col1 = missing[col1]
        for col2 in upper.index:
            if upper.loc[col2, col1] > threshold:  # If the correlation is above threshold
                missing_col2 = missing[col2]

                # Drop the column with more missing values
                if missing_col1 > missing_col2:
                    columns_to_drop.add(col1)
                elif missing_col1 == 0 and missing_col2 == 0:
                    # NOTE(review): `break` leaves the inner loop, so the remaining rows
                    # for col1 are not examined at all. Kept as-is because changing it
                    # would change which columns get dropped; confirm whether `continue`
                    # was intended.
                    break
                else:
                    columns_to_drop.add(col2)

    return list(columns_to_drop)

def high_missing(df, threshold):
    """
    Lists the columns whose share of missing values is strictly above `threshold`.

    df: pandas DataFrame
        The dataframe to inspect.

    threshold: float
        Fraction of missing values (0 to 1) a column must exceed to be listed.

    Returns:
    list: Column names, in the dataframe's column order.
    """
    null_share = df.isna().mean()
    too_sparse = null_share > threshold
    return list(null_share[too_sparse].index)


In [42]:
# Load the gzip-compressed train and test application tables from S3 and print their shapes
applications_train = pd.read_csv("s3://crisk-nico-prod/raw/applications/application_train.csv.gz", compression='gzip')
print('applications_train shape: ', applications_train.shape)
applications_test = pd.read_csv("s3://crisk-nico-prod/raw/applications/application_test.csv.gz", compression='gzip')
print('applications_test shape: ', applications_test.shape)

applications_train shape:  (307511, 122)
applications_test shape:  (48744, 121)


In [43]:
# The columns to remove are chosen on the train frame only; the same list is then
# dropped from both frames (both are modified in place)
high_corr_pairs = find_correlation_high_missing(applications_train, threshold=0.8)
drop_columns(applications_train, high_corr_pairs)
drop_columns(applications_test, high_corr_pairs)

Dropped 36 columns: ['LIVINGAREA_MEDI', 'LIVINGAPARTMENTS_AVG', 'APARTMENTS_MEDI', 'NONLIVINGAREA_MODE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'ELEVATORS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'ELEVATORS_MEDI', 'FLOORSMAX_MODE', 'APARTMENTS_AVG', 'CNT_FAM_MEMBERS', 'LANDAREA_AVG', 'COMMONAREA_AVG', 'NONLIVINGAREA_AVG', 'ENTRANCES_AVG', 'BASEMENTAREA_AVG', 'YEARS_BUILD_AVG', 'LANDAREA_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'FLOORSMAX_AVG', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'LIVINGAREA_MODE', 'FLOORSMIN_AVG', 'LIVINGAREA_AVG', 'BASEMENTAREA_MODE', 'FLOORSMIN_MODE', 'NONLIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MEDI', 'AMT_GOODS_PRICE', 'APARTMENTS_MODE', 'ENTRANCES_MODE', 'LIVINGAPARTMENTS_MODE']
Dropped 36 columns: ['LIVINGAREA_MEDI', 'LIVINGAPARTMENTS_AVG', 'APARTMENTS_MEDI', 'NONLIVINGAREA_MODE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'ELEVATORS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'ELEVATORS_MEDI', 'FLOORSMAX_MODE', 'APARTMENTS_AVG

In [44]:
# Columns with more than 60% missing values in train are dropped from both frames in place
high_missings_cols = high_missing(applications_train, threshold=0.6)
drop_columns(applications_train, high_missings_cols)
drop_columns(applications_test, high_missings_cols)

Dropped 6 columns: ['OWN_CAR_AGE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'FONDKAPREMONT_MODE']
Dropped 6 columns: ['OWN_CAR_AGE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'FONDKAPREMONT_MODE']


In [45]:
# Open a DuckDB connection with default arguments and register both frames so the SQL
# below can reference them by these names
con = duckdb.connect()
con.register('applications_train', applications_train)
con.register('applications_test', applications_test)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [46]:
# Rebuild applications_train through DuckDB: fixed column list, bucketed counts and hours,
# filtered values replaced by NULL, and sign-flipped DAYS_* columns.
# NOTE(review): the result overwrites the frame that was registered as the view, and the
# view is unregistered below, so this cell is not expected to be re-runnable on its own.
applications_train = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT " \
    "SK_ID_CURR, " \
    "TARGET, " \
    "NAME_CONTRACT_TYPE," \
    "CODE_GENDER," \
    "FLAG_OWN_CAR," \
    "FLAG_OWN_REALTY," \
    
    # CNT_CHILDREN bucketed into four labels; there is no ELSE branch
    "CASE" \
    "   WHEN CNT_CHILDREN = 0 THEN '0_Children' " \
    "   WHEN CNT_CHILDREN BETWEEN 1 AND 2 THEN '1_2_Children' " \
    "   WHEN CNT_CHILDREN BETWEEN 3 AND 4 THEN '3_4_Children' " \
    "   WHEN CNT_CHILDREN > 4 THEN '5_more_Children' " \
    "END AS CNT_CHILDREN_CAT," \
    
    # AMT_INCOME_TOTAL values of 1,000,000 or more are replaced by NULL
    "CASE " \
    "WHEN AMT_INCOME_TOTAL < 1000000 THEN AMT_INCOME_TOTAL " \
    "ELSE NULL " \
    "END AS AMT_INCOME_TOTAL," \
    
    "AMT_CREDIT," \
    "AMT_ANNUITY," \
    "NAME_INCOME_TYPE," \
    "NAME_EDUCATION_TYPE," \
    "NAME_FAMILY_STATUS," \
    "NAME_HOUSING_TYPE," \
    "REGION_POPULATION_RELATIVE," \
    # Dividing by -365 flips the sign while converting days to 365-day years
    "DAYS_BIRTH / -365 AS YEARS_BIRTH," \

    # -DAYS_EMPLOYED is kept only when it is non-negative; there is no ELSE branch
    "CASE " \
    "WHEN -DAYS_EMPLOYED >= 0 THEN -DAYS_EMPLOYED " \
    "END AS DAYS_EMPLOYED," \

    "-DAYS_REGISTRATION AS DAYS_REGISTRATION," \
    "-DAYS_ID_PUBLISH AS DAYS_ID_PUBLISH," \
    "FLAG_EMP_PHONE," \
    "FLAG_WORK_PHONE," \
    "FLAG_PHONE," \
    "FLAG_EMAIL," \
    "OCCUPATION_TYPE," \
    "REGION_RATING_CLIENT," \
    "REGION_RATING_CLIENT_W_CITY," \
    "WEEKDAY_APPR_PROCESS_START," \

    # Hour of the application bucketed into four day parts
    "CASE" \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 0 AND 5 THEN 'Early-morning' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 6 AND 12 THEN 'Morning' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 13 AND 18 THEN 'Afternoon' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 19 AND 23 THEN 'Night' " \
    "END AS HOUR_APPR_PROCESS_START_CAT," \

    "ORGANIZATION_TYPE," \
    "EXT_SOURCE_1," \
    "EXT_SOURCE_2," \
    "EXT_SOURCE_3," \
    "OBS_60_CNT_SOCIAL_CIRCLE," \
    "DEF_60_CNT_SOCIAL_CIRCLE," \
    # NOTE(review): unlike DAYS_REGISTRATION / DAYS_ID_PUBLISH above, this column is
    # passed through without a sign flip - confirm this is intended
    "DAYS_LAST_PHONE_CHANGE," \

    # AMT_REQ_CREDIT_BUREAU_* turned into flags: 0 when the value is 0, 1 when it is
    # positive; there is no ELSE branch
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_HOUR = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_HOUR > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_HOUR," \

    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_DAY = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_DAY > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_DAY," \

    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_WEEK = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_WEEK > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_WEEK," \
    
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_MON = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_MON > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_MON," \
    # The "CASE" on the next line opens the QRT expression that follows the blank line
    "CASE" \
    
    "   WHEN AMT_REQ_CREDIT_BUREAU_QRT = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_QRT > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_QRT," \
    
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_YEAR = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_YEAR > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_YEAR," \

    # NOTE(review): the select list ends with a trailing comma before FROM; the recorded
    # output of this cell shows the engine accepted it

    "FROM applications_train)" \
"SELECT * FROM base").df()

# The view is no longer needed once the result has been materialised as a DataFrame
con.unregister('applications_train')

print("total number of missing values in applications_train: ", applications_train.isna().sum().sum())

total number of missing values in applications_train:  638187


In [47]:
# Same projection as the train query, applied to applications_test and without TARGET.
# NOTE(review): the SQL text is duplicated from the train cell; any change to one must be
# mirrored by hand in the other. The result overwrites the frame that was registered as
# the view, so this cell is not expected to be re-runnable on its own.
applications_test = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT " \
    "SK_ID_CURR, " \
    "NAME_CONTRACT_TYPE," \
    "CODE_GENDER," \
    "FLAG_OWN_CAR," \
    "FLAG_OWN_REALTY," \
    
    # CNT_CHILDREN bucketed into four labels; there is no ELSE branch
    "CASE" \
    "   WHEN CNT_CHILDREN = 0 THEN '0_Children' " \
    "   WHEN CNT_CHILDREN BETWEEN 1 AND 2 THEN '1_2_Children' " \
    "   WHEN CNT_CHILDREN BETWEEN 3 AND 4 THEN '3_4_Children' " \
    "   WHEN CNT_CHILDREN > 4 THEN '5_more_Children' " \
    "END AS CNT_CHILDREN_CAT," \
    
    # AMT_INCOME_TOTAL values of 1,000,000 or more are replaced by NULL
    "CASE " \
    "WHEN AMT_INCOME_TOTAL < 1000000 THEN AMT_INCOME_TOTAL " \
    "ELSE NULL " \
    "END AS AMT_INCOME_TOTAL," \
    
    "AMT_CREDIT," \
    "AMT_ANNUITY," \
    "NAME_INCOME_TYPE," \
    "NAME_EDUCATION_TYPE," \
    "NAME_FAMILY_STATUS," \
    "NAME_HOUSING_TYPE," \
    "REGION_POPULATION_RELATIVE," \
    # Dividing by -365 flips the sign while converting days to 365-day years
    "DAYS_BIRTH / -365 AS YEARS_BIRTH," \

    # -DAYS_EMPLOYED is kept only when it is non-negative; there is no ELSE branch
    "CASE " \
    "WHEN -DAYS_EMPLOYED >= 0 THEN -DAYS_EMPLOYED " \
    "END AS DAYS_EMPLOYED," \

    "-DAYS_REGISTRATION AS DAYS_REGISTRATION," \
    "-DAYS_ID_PUBLISH AS DAYS_ID_PUBLISH," \
    "FLAG_EMP_PHONE," \
    "FLAG_WORK_PHONE," \
    "FLAG_PHONE," \
    "FLAG_EMAIL," \
    "OCCUPATION_TYPE," \
    "REGION_RATING_CLIENT," \
    "REGION_RATING_CLIENT_W_CITY," \
    "WEEKDAY_APPR_PROCESS_START," \

    # Hour of the application bucketed into four day parts
    "CASE" \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 0 AND 5 THEN 'Early-morning' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 6 AND 12 THEN 'Morning' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 13 AND 18 THEN 'Afternoon' " \
    "   WHEN HOUR_APPR_PROCESS_START BETWEEN 19 AND 23 THEN 'Night' " \
    "END AS HOUR_APPR_PROCESS_START_CAT," \

    "ORGANIZATION_TYPE," \
    "EXT_SOURCE_1," \
    "EXT_SOURCE_2," \
    "EXT_SOURCE_3," \
    "OBS_60_CNT_SOCIAL_CIRCLE," \
    "DEF_60_CNT_SOCIAL_CIRCLE," \
    # NOTE(review): unlike DAYS_REGISTRATION / DAYS_ID_PUBLISH above, this column is
    # passed through without a sign flip - confirm this is intended
    "DAYS_LAST_PHONE_CHANGE," \

    # AMT_REQ_CREDIT_BUREAU_* turned into flags: 0 when the value is 0, 1 when it is
    # positive; there is no ELSE branch
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_HOUR = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_HOUR > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_HOUR," \

    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_DAY = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_DAY > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_DAY," \

    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_WEEK = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_WEEK > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_WEEK," \
    
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_MON = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_MON > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_MON," \
    # The "CASE" on the next line opens the QRT expression that follows the blank line
    "CASE" \
    
    "   WHEN AMT_REQ_CREDIT_BUREAU_QRT = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_QRT > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_QRT," \
    
    "CASE" \
    "   WHEN AMT_REQ_CREDIT_BUREAU_YEAR = 0 THEN 0 " \
    "   WHEN AMT_REQ_CREDIT_BUREAU_YEAR > 0 THEN 1 " \
    "END AS FLAG_REQ_CREDIT_BUREAU_YEAR," \

    # NOTE(review): the select list ends with a trailing comma before FROM; the recorded
    # output of this cell shows the engine accepted it

    "FROM applications_test)" \
"SELECT * FROM base").df()

# The view is no longer needed once the result has been materialised as a DataFrame
con.unregister('applications_test')

print("total number of missing values in applications_test: ", applications_test.isna().sum().sum())

total number of missing values in applications_test:  90501


In [48]:
# Load the gzip-compressed installments table from S3 and print its shape
installments_payments = pd.read_csv("s3://crisk-nico-prod/raw/installments_payments/installments_payments.csv.gz", compression='gzip')
print('installments_payments shape: ', installments_payments.shape)

installments_payments shape:  (13605401, 8)


In [49]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(installments_payments, threshold=0.8)
drop_columns(installments_payments, high_corr_pairs)

Dropped 2 columns: ['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT']


In [50]:
# Drop (in place) the columns with more than 60% missing values
high_missings_cols = high_missing(installments_payments, threshold=0.6)
drop_columns(installments_payments, high_missings_cols)

Dropped 0 columns: []


In [51]:
# Register the reduced frame so the SQL in the next cell can reference it by name
con.register('installments_payments', installments_payments)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [52]:
# Aggregate the installments table to one row per SK_ID_CURR (GROUP BY below)
installments_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT SK_ID_CURR, " \
    # NUM_INSTALMENT_VERSION avg
    "AVG(NUM_INSTALMENT_VERSION) AS INSTALL_NUM_INSTALMENT_VERSION_AVG," \
    # NUM_INSTALMENT_NUMBER avg
    "AVG(NUM_INSTALMENT_NUMBER) AS INSTALL_NUM_INSTALMENT_NUMBER_AVG," \
    # DAYS_INSTALMENT avg
    "AVG(DAYS_INSTALMENT) AS INSTALL_DAYS_INSTALMENT_AVG," \
    # AMT_INSTALMENT sum
    "SUM(AMT_INSTALMENT) AS INSTALL_AMT_INSTALMENT_SUM," \
    
    "FROM installments_payments GROUP BY SK_ID_CURR)" \
"SELECT * FROM base").df()

# The raw table is no longer needed: remove the view and release the frame
con.unregister('installments_payments')

gc.enable()
del installments_payments
gc.collect()

# The label names the frame that is actually being measured in this cell
print("total number of missing values in installments_agg: ", installments_agg.isna().sum().sum())

total number of missing values in previous_agg:  0


In [53]:
# Left joins keep every application row; rows without a match in installments_agg get
# missing values in the added columns.
# NOTE(review): both frames are reassigned from themselves and installments_agg is
# deleted, so this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(installments_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(installments_agg, on='SK_ID_CURR', how='left')

gc.enable()
del installments_agg
# Last expression of the cell: the return value of gc.collect() becomes the cell output
gc.collect()

0

In [54]:
# Load the gzip-compressed bureau table from S3 and print its shape
bureau = pd.read_csv("s3://crisk-nico-prod/raw/bureau/bureau.csv.gz", compression='gzip')
print('bureau shape: ', bureau.shape)

bureau shape:  (1716428, 17)


In [55]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(bureau, threshold=0.8)
drop_columns(bureau, high_corr_pairs)

Dropped 1 columns: ['DAYS_ENDDATE_FACT']


In [56]:
# Drop (in place) the columns with more than 60% missing values
high_missings_cols = high_missing(bureau, threshold=0.6)
drop_columns(bureau, high_missings_cols)

Dropped 2 columns: ['AMT_CREDIT_MAX_OVERDUE', 'AMT_ANNUITY']


In [57]:
# Register the reduced frame; it is queried by the bureau and bureau_balance aggregations
con.register('bureau', bureau)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [58]:
# Aggregate the bureau table to one row per SK_ID_CURR (GROUP BY below).
# The `bureau` view is not unregistered here; the bureau_balance query joins against it.
bureau_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT SK_ID_CURR, " \
    
    # CREDIT_ACTIVE: Count actives
    "SUM(CASE WHEN CREDIT_ACTIVE = 'Active' THEN 1 ELSE 0 END)   AS ACTIVE_COUNT, " \
    "SUM(CASE WHEN CREDIT_ACTIVE = 'Closed' THEN 1 ELSE 0 END)   AS CLOSED_COUNT, " \
    "SUM(CASE WHEN CREDIT_ACTIVE = 'Sold' THEN 1 ELSE 0 END)     AS SOLD_COUNT, " \
    "SUM(CASE WHEN CREDIT_ACTIVE = 'Bad debt' THEN 1 ELSE 0 END) AS BAD_DEBT_COUNT, " \
    
    # CREDIT_TYPE: count types and others
    "SUM(CASE WHEN CREDIT_TYPE = 'Consumer credit' THEN 1 ELSE 0 END)   AS CONSUMER_CREDIT_COUNT, " \
    "SUM(CASE WHEN CREDIT_TYPE = 'Credit card' THEN 1 ELSE 0 END)   AS CREDIT_CARD_COUNT, " \
    "SUM(CASE WHEN CREDIT_TYPE = 'Car loan' THEN 1 ELSE 0 END)   AS CAR_LOAN_COUNT, " \
    "SUM(CASE WHEN CREDIT_TYPE = 'Mortgage' THEN 1 ELSE 0 END)   AS MORTGAGE_COUNT, " \
    "SUM(CASE WHEN CREDIT_TYPE = 'Microloan' THEN 1 ELSE 0 END)   AS MICROLOAN_COUNT, " \
    "SUM(CASE WHEN CREDIT_TYPE NOT IN ('Consumer credit', 'Credit card', 'Car loan', 'Mortgage', 'Microloan') THEN 1 ELSE 0 END) AS OTHER_COUNT," \

    # DAYS_CREDIT: Max, Min, Avg and transform to positive values
    # NOTE(review): only two columns are produced here, and MIN(-DAYS_CREDIT) is exposed
    # as DAYS_CREDIT_MAX. It equals -MAX(DAYS_CREDIT), i.e. the smallest sign-flipped
    # value - confirm the name matches the intended statistic.
    "MIN(-DAYS_CREDIT) AS DAYS_CREDIT_MAX," \
    "ROUND(-AVG(DAYS_CREDIT),2) AS DAYS_CREDIT_AVG," \
    
    # CREDIT_DAY_OVERDUE: AVG
    "ROUND(AVG(CREDIT_DAY_OVERDUE),2) AS CREDIT_DAY_OVERDUE_AVG," \

    # DAYS_CREDIT_ENDDATE max, min, avg and handle outliers more or less than 3650 = 10 years
    # The bounds are tested on the aggregate, not on individual rows; an aggregate
    # outside (-3650, 3650) yields NULL because the CASE has no ELSE branch
    "CASE " \
    "WHEN MAX(DAYS_CREDIT_ENDDATE) < 3650 AND MAX(DAYS_CREDIT_ENDDATE) > -3650 THEN MAX(DAYS_CREDIT_ENDDATE) " \
    "END AS DAYS_CREDIT_ENDDATE_MAX, "

    "CASE " \
    "WHEN MIN(DAYS_CREDIT_ENDDATE) < 3650 AND MIN(DAYS_CREDIT_ENDDATE) > -3650 THEN MIN(DAYS_CREDIT_ENDDATE) " \
    "END AS DAYS_CREDIT_ENDDATE_MIN, " \
        
    "CASE " \
    "WHEN AVG(DAYS_CREDIT_ENDDATE) < 3650 AND AVG(DAYS_CREDIT_ENDDATE) > -3650 THEN AVG(DAYS_CREDIT_ENDDATE) " \
    "END AS DAYS_CREDIT_ENDDATE_AVG, " \

    # CNT_CREDIT_PROLONG sum and categorical to experiment
    "SUM(CNT_CREDIT_PROLONG) AS CNT_CREDIT_PROLONG," \

    "CASE " \
    "    WHEN SUM(CNT_CREDIT_PROLONG) = 0 THEN 'Non-prolonged' " \
    "    WHEN SUM(CNT_CREDIT_PROLONG) <> 0 THEN 'Prolonged' " \
    "END AS CNT_CREDIT_PROLONG_BINARY, " \

    # AMT_CREDIT_SUM sum and handle outliers more than 1.000.000.000   
    "CASE " \
    "WHEN SUM(AMT_CREDIT_SUM) < 1000000000 THEN SUM(AMT_CREDIT_SUM) " \
    "END AS AMT_CREDIT_SUM, "

    # AMT_CREDIT_SUM_DEBT sum and handle outliers more than 100.000.000  
    "CASE " \
    "WHEN SUM(AMT_CREDIT_SUM_DEBT) < 100000000 THEN SUM(AMT_CREDIT_SUM_DEBT) " \
    "END AS AMT_CREDIT_SUM_DEBT, "
    
    # AMT_CREDIT_SUM_LIMIT sum
    # A NULL sum is replaced by 0 for this column only
    "COALESCE(SUM(AMT_CREDIT_SUM_LIMIT),0) AS AMT_CREDIT_SUM_LIMIT," \
    
    # AMT_CREDIT_SUM_OVERDUE sum
    "SUM(AMT_CREDIT_SUM_OVERDUE) AS AMT_CREDIT_SUM_OVERDUE," \
    
    # DAYS_CREDIT_UPDATE max, min, avg and handle outliers more or less than 3650 = 10 years
    "CASE " \
    "WHEN MAX(DAYS_CREDIT_UPDATE) < 3650 AND MAX(DAYS_CREDIT_UPDATE) > -3650 THEN MAX(DAYS_CREDIT_UPDATE) " \
    "END AS DAYS_CREDIT_UPDATE_MAX, "

    "CASE " \
    "WHEN MIN(DAYS_CREDIT_UPDATE) < 3650 AND MIN(DAYS_CREDIT_UPDATE) > -3650 THEN MIN(DAYS_CREDIT_UPDATE) " \
    "END AS DAYS_CREDIT_UPDATE_MIN, " \
        
    "CASE " \
    "WHEN AVG(DAYS_CREDIT_UPDATE) < 3650 AND AVG(DAYS_CREDIT_UPDATE) > -3650 THEN AVG(DAYS_CREDIT_UPDATE) " \
    "END AS DAYS_CREDIT_UPDATE_AVG, " \

    # # Portfolio size
    # "COUNT(*) AS TOTAL_LOANS " \
    
    "FROM bureau GROUP BY SK_ID_CURR)" \
"SELECT * FROM base").df()

print("total number of missing values in bureau: ", bureau_agg.isna().sum().sum())

total number of missing values in bureau:  105042


In [59]:
# Left joins keep every application row; rows without a match in bureau_agg get missing
# values in the added columns.
# NOTE(review): both frames are reassigned from themselves and bureau_agg is deleted, so
# this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(bureau_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(bureau_agg, on='SK_ID_CURR', how='left')

gc.enable()
del bureau_agg
# Last expression of the cell: the return value of gc.collect() becomes the cell output
gc.collect()

6

In [60]:
# Load the gzip-compressed bureau_balance table from S3 and print its shape
bureau_balance = pd.read_csv("s3://crisk-nico-prod/raw/bureau_balance/bureau_balance.csv.gz", compression='gzip')
print('bureau_balance shape: ', bureau_balance.shape)

bureau_balance shape:  (27299925, 3)


In [61]:
# Drop (in place) the columns with more than 60% missing values.
# For this table the missing-value filter runs before the correlation filter; the other
# tables in this notebook apply them in the opposite order.
high_missings_cols = high_missing(bureau_balance, threshold=0.6)
drop_columns(bureau_balance, high_missings_cols)

Dropped 0 columns: []


In [62]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(bureau_balance, threshold=0.8)
drop_columns(bureau_balance, high_corr_pairs)

Dropped 0 columns: []


In [63]:
# Register the frame so the SQL in the next cell can reference it by name
con.register('bureau_balance', bureau_balance)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [64]:
# Aggregate bureau_balance to one row per SK_ID_CURR. The client id is not read from
# bureau_balance itself: it is obtained by joining to bureau on SK_ID_BUREAU.
bureau_balance_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT " \
    "b.SK_ID_CURR," \
    
    # STATUS: Count status
    "SUM(CASE WHEN bb.STATUS = 'C' THEN 1 ELSE 0 END)   AS STATUS_C_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '0' THEN 1 ELSE 0 END)   AS STATUS_0_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = 'X' THEN 1 ELSE 0 END)     AS STATUS_X_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '1' THEN 1 ELSE 0 END) AS STATUS_1_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '2' THEN 1 ELSE 0 END) AS STATUS_2_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '3' THEN 1 ELSE 0 END) AS STATUS_3_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '4' THEN 1 ELSE 0 END) AS STATUS_4_COUNT, " \
    "SUM(CASE WHEN bb.STATUS = '5' THEN 1 ELSE 0 END) AS STATUS_5_COUNT, " \

    # MONTHS_BALANCE is sign-flipped before averaging
    "AVG(-bb.MONTHS_BALANCE) AS MONTHS_BALANCE_AVG " \
    # The WHERE clause discards bureau_balance rows without a matching bureau row, so the
    # LEFT JOIN keeps only matched rows, as an inner join would
    "FROM bureau_balance bb LEFT JOIN bureau b ON bb.SK_ID_BUREAU = b.SK_ID_BUREAU WHERE b.SK_ID_CURR IS NOT NULL GROUP BY b.SK_ID_CURR)" \
"SELECT * FROM base").df()

# Both source tables are finished with after this query: remove the views, free the frames
con.unregister('bureau_balance')

con.unregister('bureau')

gc.enable()
del bureau
gc.collect()

gc.enable()
del bureau_balance
gc.collect()

print("total number of missing values in bureau_balance: ", bureau_balance_agg.isna().sum().sum())

total number of missing values in bureau_balance:  0


In [65]:
# Left joins keep every application row; rows without a match in bureau_balance_agg get
# missing values in the added columns.
# NOTE(review): both frames are reassigned from themselves and bureau_balance_agg is
# deleted, so this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(bureau_balance_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(bureau_balance_agg, on='SK_ID_CURR', how='left')

gc.enable()
del bureau_balance_agg
# Last expression of the cell: the return value of gc.collect() becomes the cell output
gc.collect()

0

In [66]:
# Load the gzip-compressed previous_application table from S3 and print its shape
previous_application = pd.read_csv("s3://crisk-nico-prod/raw/previous_application/previous_application.csv.gz", compression='gzip')
print('previous_application shape: ', previous_application.shape)

previous_application shape:  (1670214, 37)


In [67]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(previous_application, threshold=0.8)
drop_columns(previous_application, high_corr_pairs)

Dropped 5 columns: ['AMT_CREDIT', 'DAYS_LAST_DUE', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'DAYS_FIRST_DRAWING']


In [68]:
# Drop (in place) the columns with more than 60% missing values
high_missings_cols = high_missing(previous_application, threshold=0.6)
drop_columns(previous_application, high_missings_cols)

Dropped 2 columns: ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED']


In [69]:
# Register the reduced frame so the SQL in the next cell can reference it by name
con.register('previous_application', previous_application)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [70]:
# Aggregate previous_application to one row per SK_ID_CURR (GROUP BY below): per-category
# row counts plus sums/averages of the remaining numeric columns.
# NOTE(review): unlike the INSTALL_*, POS_CASH_* and CREDIT_CARD_* aggregates, the output
# columns here carry no table prefix - confirm the names cannot collide after the merge.
previous_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT SK_ID_CURR, " \
    # NAME_CONTRACT_TYPE: Count contract type
    "SUM(CASE WHEN NAME_CONTRACT_TYPE = 'Cash loans' THEN 1 ELSE 0 END)   AS CASH_LOANS_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_TYPE = 'Consumer loans' THEN 1 ELSE 0 END)   AS CONSUMER_LOANS_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_TYPE = 'Revolving loans' THEN 1 ELSE 0 END)     AS REVOLVING_LOANS_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_TYPE = 'XNA' THEN 1 ELSE 0 END) AS XNA_COUNT," \
    # NAME_CONTRACT_STATUS: Count contract status
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Approved' THEN 1 ELSE 0 END)     AS CONTRACT_STATUS_APPROVED_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Canceled' THEN 1 ELSE 0 END) AS CONTRACT_STATUS_CANCELED_COUNT," \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Refused' THEN 1 ELSE 0 END) AS CONTRACT_STATUS_REFUSED_COUNT," \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Unused offer' THEN 1 ELSE 0 END) AS CONTRACT_STATUS_UNUSED_COUNT," \
    # CODE_REJECT_REASON: Count rejected reason
    "SUM(CASE WHEN CODE_REJECT_REASON = 'XAP' THEN 1 ELSE 0 END)     AS REJECTED_REASON_XAP_COUNT, " \
    "SUM(CASE WHEN CODE_REJECT_REASON = 'HC' THEN 1 ELSE 0 END) AS REJECTED_REASON_HC_COUNT," \
    "SUM(CASE WHEN CODE_REJECT_REASON = 'LIMIT' THEN 1 ELSE 0 END) AS REJECTED_REASON_LIMIT_COUNT," \
    "SUM(CASE WHEN CODE_REJECT_REASON = 'SCO' THEN 1 ELSE 0 END) AS REJECTED_REASON_SCO_COUNT," \
    "SUM(CASE WHEN CODE_REJECT_REASON NOT IN ('XAP', 'HC', 'LIMIT', 'SCO') THEN 1 ELSE 0 END) AS REJECTED_REASON_OTHER_COUNT," \
    # NAME_CLIENT_TYPE: Count client types
    "SUM(CASE WHEN NAME_CLIENT_TYPE = 'Repeater' THEN 1 ELSE 0 END)     AS CLIENT_TYPE_REPEATER_COUNT, " \
    "SUM(CASE WHEN NAME_CLIENT_TYPE = 'New' THEN 1 ELSE 0 END) AS CLIENT_TYPE_NEW_COUNT," \
    "SUM(CASE WHEN NAME_CLIENT_TYPE = 'Refreshed' THEN 1 ELSE 0 END) AS CLIENT_TYPE_REFRESHED_COUNT," \
    "SUM(CASE WHEN NAME_CLIENT_TYPE = 'XNA' THEN 1 ELSE 0 END) AS CLIENT_TYPE_XNA_COUNT," \
    # NAME_PORTFOLIO: Count portfolio name
    "SUM(CASE WHEN NAME_PORTFOLIO = 'POS' THEN 1 ELSE 0 END)     AS NAME_PORTFOLIO_POS_COUNT, " \
    "SUM(CASE WHEN NAME_PORTFOLIO = 'Cash' THEN 1 ELSE 0 END) AS NAME_PORTFOLIO_CASH_COUNT," \
    "SUM(CASE WHEN NAME_PORTFOLIO = 'XNA' THEN 1 ELSE 0 END) AS NAME_PORTFOLIO_XNA_COUNT," \
    "SUM(CASE WHEN NAME_PORTFOLIO = 'Cards' THEN 1 ELSE 0 END) AS NAME_PORTFOLIO_CARDS_COUNT," \
    "SUM(CASE WHEN NAME_PORTFOLIO = 'Cars' THEN 1 ELSE 0 END) AS NAME_PORTFOLIO_CARS_COUNT," \
    # NAME_YIELD_GROUP: Count grouped interest rate 
    "SUM(CASE WHEN NAME_YIELD_GROUP = 'XNA' THEN 1 ELSE 0 END)     AS YIELD_GROUP_XNA_COUNT, " \
    "SUM(CASE WHEN NAME_YIELD_GROUP = 'middle' THEN 1 ELSE 0 END) AS YIELD_GROUP_MIDDLE_COUNT," \
    "SUM(CASE WHEN NAME_YIELD_GROUP = 'high' THEN 1 ELSE 0 END) AS YIELD_GROUP_HIGH_COUNT," \
    "SUM(CASE WHEN NAME_YIELD_GROUP = 'low_normal' THEN 1 ELSE 0 END) AS YIELD_GROUP_LOW_NORMAL_COUNT," \
    "SUM(CASE WHEN NAME_YIELD_GROUP = 'low_action' THEN 1 ELSE 0 END) AS YIELD_GROUP_LOW_ACTION_COUNT," \
    # AMT_APPLICATION sum
    "SUM(AMT_APPLICATION) AS AMT_APPLICATION_SUM," \
    # AMT_DOWN_PAYMENT sum
    "SUM(AMT_DOWN_PAYMENT) AS AMT_DOWN_PAYMENT_SUM," \
    # RATE_DOWN_PAYMENT avg
    "AVG(RATE_DOWN_PAYMENT) AS RATE_DOWN_PAYMENT_AVG," \
    # CNT_PAYMENT avg
    "AVG(CNT_PAYMENT) AS CNT_PAYMENT_AVG," \

    "FROM previous_application GROUP BY SK_ID_CURR)" \
"SELECT * FROM base").df()

# The raw table is no longer needed: remove the view and release the frame
con.unregister('previous_application')

gc.enable()
del previous_application
gc.collect()

print("total number of missing values in previous_agg: ", previous_agg.isna().sum().sum())

total number of missing values in previous_agg:  40686


In [71]:
# Left joins keep every application row; rows without a match in previous_agg get missing
# values in the added columns.
# NOTE(review): both frames are reassigned from themselves and previous_agg is deleted,
# so this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(previous_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(previous_agg, on='SK_ID_CURR', how='left')

gc.enable()
del previous_agg
# Last expression of the cell: the return value of gc.collect() becomes the cell output
gc.collect()

0

In [72]:
# Load the gzip-compressed POS_CASH_balance table from S3 and print its shape
POS_CASH_balance = pd.read_csv("s3://crisk-nico-prod/raw/POS_CASH_balance/POS_CASH_balance.csv.gz", compression='gzip')
print('POS_CASH_balance shape: ', POS_CASH_balance.shape)

POS_CASH_balance shape:  (10001358, 8)


In [73]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(POS_CASH_balance, threshold=0.8)
drop_columns(POS_CASH_balance, high_corr_pairs)

Dropped 1 columns: ['CNT_INSTALMENT_FUTURE']


In [74]:
# Drop (in place) the columns with more than 60% missing values
high_missings_cols = high_missing(POS_CASH_balance, threshold=0.6)
drop_columns(POS_CASH_balance, high_missings_cols)

Dropped 0 columns: []


In [75]:
# Register the reduced frame so the SQL in the next cell can reference it by name
con.register('POS_CASH_balance', POS_CASH_balance)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [76]:
# Aggregate POS_CASH_balance to one row per SK_ID_CURR (GROUP BY below)
POS_CASH_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT SK_ID_CURR, " \
    # NAME_CONTRACT_STATUS: Count contract status
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Active' THEN 1 ELSE 0 END)   AS POS_CASH_CONTRACT_STATUS_ACTIVE_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Completed' THEN 1 ELSE 0 END)   AS POS_CASH_CONTRACT_STATUS_COMPLETED_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Signed' THEN 1 ELSE 0 END)     AS POS_CASH_CONTRACT_STATUS_SIGNED_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS NOT IN ('Active','Completed','Signed') THEN 1 ELSE 0 END) AS POS_CASH_CONTRACT_STATUS_OTHER_COUNT," \
    # MONTHS_BALANCE avg
    "AVG(MONTHS_BALANCE) AS POS_CASH_MONTHS_BALANCE_AVG," \
    # CNT_INSTALMENT avg
    "AVG(CNT_INSTALMENT) AS POS_CASH_CNT_INSTALMENT_AVG," \
    # SK_DPD (days past due) to binary past due
    "CASE " \
    "    WHEN SUM(SK_DPD) = 0 THEN 'Non-DPD' " \
    "    WHEN SUM(SK_DPD) <> 0 THEN 'DPD' " \
    "END AS POS_CASH_SK_DPD_BINARY, " \
    # SK_DPD_DEF (days past due during the month) to binary past due during the month
    "CASE " \
    "    WHEN SUM(SK_DPD_DEF) = 0 THEN 'Non-DPD-DEF' " \
    "    WHEN SUM(SK_DPD_DEF) <> 0 THEN 'DPD-DEF' " \
    "END AS POS_CASH_SK_DPD_DEF_BINARY, " \
    "FROM POS_CASH_balance GROUP BY SK_ID_CURR)" \
"SELECT * FROM base").df()

# The raw table is no longer needed: remove the view and release the frame
con.unregister('POS_CASH_balance')

gc.enable()
del POS_CASH_balance
gc.collect()

# The label names the frame that is actually being measured in this cell
print("total number of missing values in POS_CASH_agg: ", POS_CASH_agg.isna().sum().sum())

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

total number of missing values in previous_agg:  28


In [77]:
# Left joins keep every application row; rows without a match in POS_CASH_agg get missing
# values in the added columns.
# NOTE(review): both frames are reassigned from themselves and POS_CASH_agg is deleted,
# so this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(POS_CASH_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(POS_CASH_agg, on='SK_ID_CURR', how='left')

gc.enable()
del POS_CASH_agg
# Last expression of the cell: the return value of gc.collect() becomes the cell output
gc.collect()

0

In [81]:
# Load the gzip-compressed credit_card_balance table from S3 and print its shape
credit_card_balance = pd.read_csv("s3://crisk-nico-prod/raw/credit_card_balance/credit_card_balance.csv.gz", compression='gzip')
print('credit_card_balance shape: ', credit_card_balance.shape)

credit_card_balance shape:  (3840312, 23)


In [82]:
# Drop (in place) the columns selected among numeric pairs with |correlation| > 0.8
high_corr_pairs = find_correlation_high_missing(credit_card_balance, threshold=0.8)
drop_columns(credit_card_balance, high_corr_pairs)

Dropped 4 columns: ['AMT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_INST_MIN_REGULARITY']


In [83]:
# Drop (in place) the columns with more than 60% missing values
high_missings_cols = high_missing(credit_card_balance, threshold=0.6)
drop_columns(credit_card_balance, high_missings_cols)

Dropped 0 columns: []


In [84]:
# Register the reduced frame so the SQL below can reference it by name
con.register('credit_card_balance', credit_card_balance)

<_duckdb.DuckDBPyConnection at 0x1ed94068fb0>

In [105]:
# Inspect the columns left after the two drops; the aggregation SQL may only use these
credit_card_balance.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_CURRENT',
       'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT',
       'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL',
       'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT',
       'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD',
       'SK_DPD_DEF'],
      dtype='object')

In [177]:
# Aggregate credit_card_balance to one row per SK_ID_CURR (GROUP BY below)
credit_card_balance_agg = con.sql("WITH base AS( " \
    # KEY/ID                        
    "SELECT SK_ID_CURR, " \
    # NAME_CONTRACT_STATUS: Count contract status
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Active' THEN 1 ELSE 0 END)   AS CREDIT_CARD_CONTRACT_STATUS_ACTIVE_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Completed' THEN 1 ELSE 0 END)   AS CREDIT_CARD_CONTRACT_STATUS_COMPLETED_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS = 'Signed' THEN 1 ELSE 0 END)     AS CREDIT_CARD_CONTRACT_STATUS_SIGNED_COUNT, " \
    "SUM(CASE WHEN NAME_CONTRACT_STATUS NOT IN ('Active','Completed','Signed') THEN 1 ELSE 0 END) AS CREDIT_CARD_CONTRACT_STATUS_OTHER_COUNT," \
    # MONTHS_BALANCE avg
    "AVG(MONTHS_BALANCE) AS CREDIT_CARD_MONTHS_BALANCE_AVG," \
    # AMT_BALANCE avg
    "AVG(AMT_BALANCE) AS CREDIT_CARD_AMT_BALANCE_AVG," \
    # AMT_CREDIT_LIMIT_ACTUAL avg
    "AVG(AMT_CREDIT_LIMIT_ACTUAL) AS CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_AVG," \
    # AMT_CREDIT_LIMIT_ACTUAL sum
    "SUM(AMT_CREDIT_LIMIT_ACTUAL) AS CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_SUM," \
    # AMT_PAYMENT_TOTAL_CURRENT avg
    "AVG(AMT_PAYMENT_TOTAL_CURRENT) AS CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_AVG," \
    # AMT_PAYMENT_TOTAL_CURRENT sum
    "SUM(AMT_PAYMENT_TOTAL_CURRENT) AS CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_SUM," \
    # AMT_RECEIVABLE_PRINCIPAL avg
    "AVG(AMT_RECEIVABLE_PRINCIPAL) AS CREDIT_CARD_AMT_RECEIVABLE_PRINCIPAL_AVG," \
    # AMT_RECEIVABLE_PRINCIPAL sum
    "SUM(AMT_RECEIVABLE_PRINCIPAL) AS CREDIT_CARD_AMT_RECEIVABLE_PRINCIPAL_SUM," \
    # AMT_RECIVABLE avg
    "AVG(AMT_RECIVABLE) AS CREDIT_CARD_AMT_RECIVABLE_AVG," \
    # AMT_RECIVABLE sum
    "SUM(AMT_RECIVABLE) AS CREDIT_CARD_AMT_RECIVABLE_SUM," \
    # AMT_TOTAL_RECEIVABLE avg
    "AVG(AMT_TOTAL_RECEIVABLE) AS CREDIT_CARD_AMT_TOTAL_RECEIVABLE_AVG," \
    # AMT_TOTAL_RECEIVABLE sum
    "SUM(AMT_TOTAL_RECEIVABLE) AS CREDIT_CARD_AMT_TOTAL_RECEIVABLE_SUM," \
    # CNT_INSTALMENT_MATURE_CUM avg
    "AVG(CNT_INSTALMENT_MATURE_CUM) AS CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_AVG," \
    # CNT_INSTALMENT_MATURE_CUM sum
    "SUM(CNT_INSTALMENT_MATURE_CUM) AS CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_SUM," \
    # SK_DPD (days past due) to binary past due
    "CASE " \
    "    WHEN SUM(SK_DPD) = 0 THEN 'Non-DPD' " \
    "    WHEN SUM(SK_DPD) <> 0 THEN 'DPD' " \
    "END AS CREDIT_CARD_SK_DPD_BINARY, " \
    # SK_DPD_DEF (days past due during the month) to binary past due during the month
    "CASE " \
    "    WHEN SUM(SK_DPD_DEF) = 0 THEN 'Non-DPD-DEF' " \
    "    WHEN SUM(SK_DPD_DEF) <> 0 THEN 'DPD-DEF' " \
    "END AS CREDIT_CARD_SK_DPD_DEF_BINARY, " \
    
    "FROM credit_card_balance GROUP BY SK_ID_CURR)" \
"SELECT * FROM base").df()

# Same cleanup as the other aggregation cells: remove the view (unregister takes the
# view name only) and release the raw frame, which is not used after this point
con.unregister('credit_card_balance')

gc.enable()
del credit_card_balance
gc.collect()

# The label names the frame that is actually being measured in this cell
print("total number of missing values in credit_card_balance_agg: ", credit_card_balance_agg.isna().sum().sum())

total number of missing values in previous_agg:  0


In [178]:
# Left joins keep every application row; rows without a match in credit_card_balance_agg
# get missing values in the added columns.
# NOTE(review): both frames are reassigned from themselves and the aggregate is deleted,
# so this cell cannot be run twice in the same kernel.
applications_train = applications_train.merge(credit_card_balance_agg, on='SK_ID_CURR', how='left')
applications_test = applications_test.merge(credit_card_balance_agg, on='SK_ID_CURR', how='left')

# Release the aggregate once merged, as the other merge cells in this notebook do
gc.enable()
del credit_card_balance_agg
gc.collect()

In [180]:
# Total missing cells and shape of both frames after all merges
print("total number of missing values in train aggregation: ", applications_train.isna().sum().sum(), "and shape: ", applications_train.shape)
print("total number of missing values in test aggregation: ", applications_test.isna().sum().sum(), "and shape: ", applications_test.shape)

total number of missing values in train aggregation:  8931218 and shape:  (307511, 138)
total number of missing values in test aggregation:  1011717 and shape:  (48744, 137)


In [None]:
def correlation_target(df, cols=None, top_n=10, target="TARGET"):
    """
    Displays the numeric columns most correlated (in absolute value) with `target`.

    df: pandas DataFrame
        The dataframe holding the features and the target column.

    cols: list or None
        Columns to restrict the analysis to. When None, every numeric column of `df`
        is used. The target does not need to be listed: it is added when missing.

    top_n: int
        Number of rows of the ranking to display.

    target: str
        Name of the column the correlations are computed against.

    Returns:
    None. The ranking is printed/displayed, ordered by decreasing absolute correlation.

    Raises:
    ValueError: if `target` is not a column of `df`.
    """
    # Fail with the same explicit error whether or not `cols` was supplied
    if target not in df.columns:
        raise ValueError(f"Column '{target}' does not exist in the DataFrame")

    selected = df if cols is None else df[cols]
    num_df = selected.select_dtypes(include=[np.number]).copy()

    # Ensure target is included (it may have been left out of `cols`)
    if target not in num_df.columns:
        num_df[target] = df[target]

    target_corr = (
        num_df
        .corr()[target]
        .drop(target)
        .sort_values(key=abs, ascending=False)
    )

    print(f"Top {top_n} correlations with {target}:\n")
    # `display` is the rich-output helper available in notebook cells
    display(target_corr.head(top_n))

In [188]:
# Rank the numeric columns of the merged train frame by correlation with TARGET.
# NOTE(review): the recorded output below lists the smallest absolute values first, which
# does not match the current function (descending sort) - the output looks stale.
correlation_target(applications_train, top_n=30)

Top 30 correlations with TARGET:



CONTRACT_STATUS_UNUSED_COUNT                 0.000517
FLAG_REQ_CREDIT_BUREAU_WEEK                  0.000725
FLAG_REQ_CREDIT_BUREAU_HOUR                  0.000991
FLAG_EMAIL                                  -0.001758
SK_ID_CURR                                  -0.002108
OTHER_COUNT                                 -0.002160
NAME_PORTFOLIO_CARS_COUNT                   -0.002830
CLIENT_TYPE_XNA_COUNT                        0.003192
CREDIT_CARD_CONTRACT_STATUS_OTHER_COUNT      0.003685
BAD_DEBT_COUNT                               0.004003
CNT_CREDIT_PROLONG                           0.004058
YIELD_GROUP_MIDDLE_COUNT                    -0.004393
POS_CASH_CONTRACT_STATUS_SIGNED_COUNT       -0.004421
AMT_APPLICATION_SUM                          0.004607
CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_SUM   -0.004898
FLAG_REQ_CREDIT_BUREAU_DAY                   0.004920
POS_CASH_CONTRACT_STATUS_OTHER_COUNT         0.005426
FLAG_REQ_CREDIT_BUREAU_MON                  -0.006077
CREDIT_CARD_CONTRACT_STATUS_

In [None]:
# NOTE(review): this import sits at the end of the notebook rather than with the other
# imports at the top.
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it
spark = SparkSession.builder.getOrCreate()

In [None]:
# Destination prefix for the curated train set.
# NOTE(review): CURATED is reassigned with a different path in the test-set cell.
CURATED = "s3://crisk-nico-prod/curated/applications_train"

# bucket_id takes the values 0-7 (SK_ID_CURR modulo 8) and is used as the partition column.
# This adds a column to applications_train in place.
applications_train["bucket_id"] = applications_train["SK_ID_CURR"] % 8
# NOTE(review): this cell has no recorded execution. The frame holds text columns with
# missing values after the left joins - confirm Spark infers a schema for them.
spark_df = spark.createDataFrame(applications_train)

# "overwrite" replaces whatever already exists under CURATED
spark_df.write.mode("overwrite") \
    .partitionBy("bucket_id") \
    .parquet(CURATED)

In [None]:
# Destination prefix for the curated test set.
# NOTE(review): this reuses the name CURATED with a different value than the train cell.
CURATED = "s3://crisk-nico-prod/curated/applications_test"

# bucket_id takes the values 0-7 (SK_ID_CURR modulo 8) and is used as the partition column.
# This adds a column to applications_test in place.
applications_test["bucket_id"] = applications_test["SK_ID_CURR"] % 8
# NOTE(review): this cell has no recorded execution. The frame holds text columns with
# missing values after the left joins - confirm Spark infers a schema for them.
spark_df = spark.createDataFrame(applications_test)

# "overwrite" replaces whatever already exists under CURATED
spark_df.write.mode("overwrite") \
    .partitionBy("bucket_id") \
    .parquet(CURATED)