In [None]:

import pandas as pd  # needed by every read below; harmless if an earlier cell already imported it

# Directory holding the raw Home Credit CSV extracts. Change it here to point
# the whole notebook at a different location.
DATA_DIR = '/dbfs/data/credit-default-risk-data'

application_train = pd.read_csv(f'{DATA_DIR}/application_train.csv')
application_test = pd.read_csv(f'{DATA_DIR}/application_test.csv')
bureau = pd.read_csv(f'{DATA_DIR}/bureau.csv')
bureau_balance = pd.read_csv(f'{DATA_DIR}/bureau_balance.csv')
credit_card_balance = pd.read_csv(f'{DATA_DIR}/credit_card_balance.csv')
installments_payments = pd.read_csv(f'{DATA_DIR}/installments_payments.csv')
pos_cash_balance = pd.read_csv(f'{DATA_DIR}/POS_CASH_balance.csv')
previous_application = pd.read_csv(f'{DATA_DIR}/previous_application.csv')

# The column dictionary is read as latin1; the other files use the read_csv default encoding.
homecredit_columns_description = pd.read_csv(f'{DATA_DIR}/HomeCredit_columns_description.csv', encoding='latin1')

In [None]:
def aggAvgInstalments(df):
    """Aggregate instalment payment behaviour per ``SK_ID_CURR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SK_ID_CURR``, ``AMT_INSTALMENT`` and
        ``AMT_PAYMENT``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``, one row per id, with the columns (in order):

        * ``INSTALMENT_MISSED`` - share of rows where the instalment amount
          exceeds the payment amount.
        * ``PERC_UNPAID`` - mean over all rows of
          ``(AMT_INSTALMENT - AMT_PAYMENT) / AMT_INSTALMENT``.
        * ``AMT_UNPAID`` - mean over all rows of
          ``AMT_INSTALMENT - AMT_PAYMENT``.

    Notes
    -----
    Missing values are replaced by 0 before averaging. A row whose
    ``AMT_INSTALMENT`` is 0 contributes 0 to ``PERC_UNPAID``; without this
    guard the division yields +/-inf, which ``fillna`` does not remove and
    which would make the whole group's mean infinite.
    """
    df_ = df.copy()
    instalment = df_['AMT_INSTALMENT']
    # A row counts as missed when less than the instalment was paid.
    # Comparisons against NaN are False, so rows with a missing amount count as not missed.
    df_['INSTALMENT_MISSED'] = (instalment > df_['AMT_PAYMENT']).astype(int)
    df_['AMT_UNPAID'] = instalment - df_['AMT_PAYMENT']
    # Guard the division: a zero instalment has no meaningful unpaid fraction.
    df_['PERC_UNPAID'] = (df_['AMT_UNPAID'] / instalment).where(instalment != 0, 0.0)
    df_ = df_.fillna(0)
    # After fillna there are no NaNs, so mean() equals the former sum()/count()
    # and a single grouped mean replaces three aggregations plus two joins.
    return df_.groupby("SK_ID_CURR")[['INSTALMENT_MISSED', 'PERC_UNPAID', 'AMT_UNPAID']].mean()

# Convert the pandas aggregate to a Spark DataFrame and keep only the distinct
# INSTALMENT_MISSED values.
# NOTE(review): SK_ID_CURR is not selected, so the resulting table carries no
# customer key - confirm this is intended before using it as a feature table.
installment_payments_features = (
    spark.createDataFrame(aggAvgInstalments(installments_payments))
    .select('INSTALMENT_MISSED')
    .distinct()
)

In [None]:
def bureauBalanceRollingCreditLoan(df):
    """Summarise the ``STATUS`` history of each ``SK_ID_BUREAU`` as one number.

    ``STATUS`` values ``'X'`` and ``'C'`` are mapped to ``'0'`` and the column
    is converted to numeric. For every ``SK_ID_BUREAU`` an exponentially
    weighted mean is computed with ``span`` equal to the number of rows in the
    group (``adjust=False``), and that smoothed series is then averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SK_ID_BUREAU`` and ``STATUS``. The input
        frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_BUREAU`` with the single column
        ``CREDIT_STATUS_EMA_AVG``.
    """
    def _mean_of_ema(status):
        # span grows with the history length, so short and long histories are
        # smoothed proportionally.
        smoothed = status.ewm(span=status.shape[0], adjust=False).mean()
        return smoothed.mean()

    numeric_status = pd.to_numeric(df['STATUS'].replace(['X', 'C'], '0'))
    # NOTE(review): the EMA follows the row order of `df` within each group;
    # presumably the rows should be in chronological order - verify upstream.
    per_credit = (
        df.assign(STATUS=numeric_status)
        .groupby("SK_ID_BUREAU")['STATUS']
        .agg(_mean_of_ema)
    )
    return per_credit.rename("CREDIT_STATUS_EMA_AVG").to_frame()

# Convert the pandas aggregate to a Spark DataFrame and keep only the distinct
# CREDIT_STATUS_EMA_AVG values.
# NOTE(review): SK_ID_BUREAU is not selected, so the resulting table carries no
# credit key - confirm this is intended before using it as a feature table.
bureau_balance_rolling_features = (
    spark.createDataFrame(bureauBalanceRollingCreditLoan(bureau_balance))
    .select('CREDIT_STATUS_EMA_AVG')
    .distinct()
)

In [None]:
def aggAvgBureau(df):
    """Aggregate bureau records per ``SK_ID_CURR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SK_ID_CURR``, ``CREDIT_DAY_OVERDUE`` and
        ``DAYS_CREDIT``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with the columns (in order):

        * ``CREDIT_DAY_OVERDUE_MEAN`` - mean of ``CREDIT_DAY_OVERDUE``.
        * ``DAYS_CREDIT_BETWEEN_MEAN`` - mean gap between consecutive
          ``DAYS_CREDIT`` values once they are sorted ascending; NaN when the
          id has a single record.

    Notes
    -----
    The gaps are computed on rows sorted by ``DAYS_CREDIT`` within each id.
    Differencing in arbitrary row order would make the mean collapse to
    ``(last - first) / (n - 1)`` in file order, which can be negative and does
    not measure the spacing between credits.
    """
    # The overdue mean does not depend on row order, so use the frame as given.
    overdue_mean = (
        df.groupby('SK_ID_CURR')['CREDIT_DAY_OVERDUE']
        .mean()
        .rename("CREDIT_DAY_OVERDUE_MEAN")
    )

    # Sort so that diff() measures the distance between neighbouring credits.
    ordered = df.sort_values(['SK_ID_CURR', 'DAYS_CREDIT'])
    gaps = ordered[['SK_ID_CURR']].assign(
        DAYS_GAP=ordered.groupby('SK_ID_CURR')['DAYS_CREDIT'].diff()
    )
    between_mean = (
        gaps.groupby('SK_ID_CURR')['DAYS_GAP']
        .mean()
        .rename('DAYS_CREDIT_BETWEEN_MEAN')
    )

    # Both series are indexed by SK_ID_CURR, so an index join lines them up.
    return overdue_mean.to_frame().join(between_mean)

# Convert the pandas aggregate to a Spark DataFrame and keep only the distinct
# CREDIT_DAY_OVERDUE_MEAN values.
# NOTE(review): neither SK_ID_CURR nor DAYS_CREDIT_BETWEEN_MEAN is selected, so
# the resulting table carries no customer key - confirm this is intended.
agg_avg_bureau_features = (
    spark.createDataFrame(aggAvgBureau(bureau))
    .select('CREDIT_DAY_OVERDUE_MEAN')
    .distinct()
)

# Feature Store Ingestion

Now that we have computed the features, let's put them into a feature store!

In [None]:
%sql 
-- Make sure the database that the feature tables are created in exists (no-op if it already does).
CREATE DATABASE IF NOT EXISTS feature_store_home_credit_bureau_data;

In [None]:
from databricks import feature_store

# Feature Store client; used below to create the feature tables.
fs = feature_store.FeatureStoreClient()

In [None]:
# This cell uses an API introduced with Databricks Runtime 10.2 ML.
# If your cluster is running Databricks Runtime 10.1 ML or below, skip or comment out this cell and uncomment and run Cmd 20.

spark.conf.set("spark.sql.shuffle.partitions", "5")

# One entry per feature table: (table name, primary key column, source DataFrame, description).
# NOTE(review): every table is keyed on its own feature column rather than on an
# entity id such as SK_ID_CURR / SK_ID_BUREAU - confirm this is intended.
_feature_table_specs = [
    (
        "feature_store_home_credit_bureau_data.installment_payments_features",
        "INSTALMENT_MISSED",
        installment_payments_features,
        "Installment Payments Features",
    ),
    (
        "feature_store_home_credit_bureau_data.bureau_balance_rolling_features",
        "CREDIT_STATUS_EMA_AVG",
        bureau_balance_rolling_features,
        "Bureau Balance Rolling Credit Features",
    ),
    (
        "feature_store_home_credit_bureau_data.agg_avg_bureau_features",
        "CREDIT_DAY_OVERDUE_MEAN",
        agg_avg_bureau_features,
        "Aggregate Avg Bureau Features",
    ),
]

# Register the tables in the order listed above.
for _table_name, _key_column, _features_df, _table_description in _feature_table_specs:
    fs.create_table(
        name=_table_name,
        primary_keys=[_key_column],
        df=_features_df,
        description=_table_description,
    )

## Troubleshooting

Query one of the ingested feature tables to check its contents.

In [None]:
%sql
-- Show every column and row of the installment payments feature table.
SELECT *
FROM  feature_store_home_credit_bureau_data.installment_payments_features


# Feature Look-up

Let's create a training data set using the features that we have put into the feature store!

In [None]:
from databricks.feature_store import FeatureLookup
import mlflow

# Fully qualified names of the three feature tables.
installment_payments_features_table = "feature_store_home_credit_bureau_data.installment_payments_features"
bureau_balance_rolling_features_table = "feature_store_home_credit_bureau_data.bureau_balance_rolling_features"
agg_avg_bureau_features_table = "feature_store_home_credit_bureau_data.agg_avg_bureau_features"

# One single-feature lookup per table.
# NOTE(review): each lookup uses the feature column itself as its lookup key
# rather than an entity id - confirm this is intended.
installment_payments_feature_lookups = [
    FeatureLookup(
        table_name=installment_payments_features_table,
        feature_names="INSTALMENT_MISSED",
        lookup_key=["INSTALMENT_MISSED"],
    ),
]

bureau_balance_rolling_feature_lookups = [
    FeatureLookup(
        table_name=bureau_balance_rolling_features_table,
        feature_names="CREDIT_STATUS_EMA_AVG",
        lookup_key=["CREDIT_STATUS_EMA_AVG"],
    ),
]

agg_avg_bureau_feature_lookups = [
    FeatureLookup(
        table_name=agg_avg_bureau_features_table,
        feature_names="CREDIT_DAY_OVERDUE_MEAN",
        lookup_key=["CREDIT_DAY_OVERDUE_MEAN"],
    ),
]