In [1]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
import polars as pl
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import joblib
from datetime import datetime
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline


import sys
sys.path.append("..")
#sys.path.append("./../src/preproc")
from src.modeling import random_seed
from src.preproc.preproc_static import main_preproc_static 
from src.preproc.utils import reduce_memory_usage_pl, print_memory
random_seed(42)

# 1. Setup and data loading

In [2]:
### Setup
ROOT = '..'
EXTENSION = 'parquet'
DATA_FOLDER = 'data'

In [3]:
train = pl.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_base.{EXTENSION}')
train = train.with_columns(pl.col('date_decision').cast(pl.Date))
train.head(2)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,date,i64,i64,i64
0,2019-01-03,201901,0,0
1,2019-01-03,201901,0,0


In [4]:
# Import static features
train_static_0 = pl.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_static_0_0.{EXTENSION}')
train_static_1 = pl.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_static_0_1.{EXTENSION}')

train_static_concat = pl.concat([train_static_0, train_static_1], how="vertical_relaxed")

# Getting rid of high missing columns
#filter_high_na = (train_static_concat.isna().sum()/train_static_concat.shape[0]) < 0.5
#columns_to_keep = train_static_concat.columns[filter_high_na.values]

#train_merged = train_balanced.merge(train_static_concat[columns_to_keep], on='case_id', validate='1:1')

# Exclude object columns
#train_merged = train_merged.loc[:, ~train_merged.columns.isin(train_merged.select_dtypes(include=['O', '<M8[ns]']).columns)]

#train_merged.head()

# 2. PREPROC

## 2.1 TABLE STATIC

In [5]:
train_with_static = train.join(train_static_concat, left_on="case_id", right_on="case_id", how="left")

In [6]:
train_with_static, _ = main_preproc_static(train_with_static)

In [7]:
train_with_static.head()

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,…,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D,datefirstoffer_1144D_days_from_appl,datelastinstal40dpd_247D_days_from_appl,datelastunpaid_3546854D_days_from_appl,dtlastpmtallstes_4499206D_days_from_appl,firstclxcampaign_1125D_days_from_appl,firstdatedue_489D_days_from_appl,lastactivateddate_801D_days_from_appl,lastapplicationdate_877D_days_from_appl,lastapprdate_640D_days_from_appl,lastdelinqdate_224D_days_from_appl,lastrejectdate_50D_days_from_appl,lastrepayingdate_696D_days_from_appl,maxdpdinstldate_3546855D_days_from_appl,payvacationpostpone_4187118D_days_from_appl,validfrom_1069D_days_from_appl
i64,date,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,date,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,2019-01-03,201901,0,0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,,,,,,,,,,,,,,,,
1,2019-01-03,201901,0,0,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,…,,,,,,,18.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,,,,,,,,,,,,,,,,
2,2019-01-04,201901,0,0,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,36.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",,,,,,,,,2102.0,,,2102.0,,,,
3,2019-01-03,201901,0,0,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,…,,,,,,,12.0,0.0,0.0,,"""a55475b1""",,1.0,1.0,,,0.0,0.0,,"""BO""","""AL""",,,,,,,,,-4.0,,,-4.0,,,,
4,2019-01-04,201901,0,1,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",,,,,,,,,-4.0,,,,,,,


In [8]:
train_with_static = train_with_static.drop(["date_decision", "MONTH", "case_id"])

In [9]:
train_static_concat = reduce_memory_usage_pl(train_static_concat)

Memory usage of dataframe is 2294.37 MB
Memory usage of dataframe became 1543.11 MB


In [10]:
train_with_static_pd = train_with_static.to_pandas()

: 

In [None]:
del train_static_0, train_static_1, train_static_concat, train

# 3. Simple model

In [None]:
print_memory()

In [None]:
train_with_static = train_with_static.drop("WEEK_NUM")

In [None]:
X_train, y_train = train_with_static_pd.drop("target", axis =1),train_with_static_pd["target"]

In [None]:
import gc
gc.collect()

In [None]:

del train_with_static_pd


In [None]:
print_memory()

In [None]:
sys.exit()

In [None]:
            
    imba_pipeline = make_pipeline(
                RandomUnderSampler(random_state=42),
                RandomForestClassifier(random_state=13)
    )

In [None]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}


In [None]:
from sklearn.linear_model import LogisticRegression

# Assuming X is your feature matrix, y is your target variable, and weights is your sample weights
clf = LogisticRegression(class_weight="balanced")
clf.fit(X_train, y_train)


# Output and model savings

In [None]:
# Counter for current lgb version
n = 1
timestamp =datetime.today().strftime('%Y%m%d%H%M')

oof_lgb = np.concatenate(lgb_oof_preds)
true_lgb = np.concatenate(lgb_oof_targets)
auc_cv_lgb = roc_auc_score(true_lgb, oof_lgb)

print('Overall OOF AUC LGBM = {:.5f}'.format(auc_cv_lgb))

results = pd.DataFrame({'oof': oof_lgb, 'truth': true_lgb})
results.to_csv('{}/{}/output/oof_lgb_{}_{}.csv'.format(ROOT,
                                                       DATA_FOLDER,
                                                       n,
                                                       timestamp), index=False)

for model in lgb_clfs:
    joblib.dump(model, '{}/{}/models/lgb_{}_{}.pkl'.format(ROOT,
                                                           DATA_FOLDER,
                                                           n,
                                                           datetime.today().strftime('%Y%m%d%H%M')))

In [None]:
# Counter for current xgb version
n = 1
timestamp =datetime.today().strftime('%Y%m%d%H%M')

oof_xb = np.concatenate(xb_oof_preds)
true_xb = np.concatenate(xb_oof_targets)
auc_cv_xb = roc_auc_score(true_xb, oof_xb)

print('Overall OOF AUC XGB = {:.5f}'.format(auc_cv_xb))

results = pd.DataFrame({'oof': oof_xb, 'truth': true_xb})
results.to_csv('{}/{}/output/oof_xb_{}_{}.csv'.format(ROOT,
                                                      DATA_FOLDER,
                                                      n,
                                                      timestamp), index=False)

for model in xb_clfs:
    joblib.dump(model, '{}/{}/models/xb_{}_{}.pkl'.format(ROOT,
                                                          DATA_FOLDER,
                                                          n,
                                                          datetime.today().strftime('%Y%m%d%H%M')))