In [1]:
!pip install --user -r /home/jupyter/【SIGANTE】金融分析/signate_financial/ito/requirements.txt
import sys
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier



In [2]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 28
    AUTHOR = 'Ito_final_7th'
    COMPETITION = 'FUDA2'
    DATA_PATH = Path('/home/jupyter/【SIGANTE】金融分析/signate_financial/ito/data')
    OOF_DATA_PATH = Path('/home/jupyter/【SIGANTE】金融分析/signate_financial/ito/oof')
    MODEL_DATA_PATH = Path('/home/jupyter/【SIGANTE】金融分析/signate_financial/ito/models')
    SUB_DATA_PATH = Path('/home/jupyter/【SIGANTE】金融分析/signate_financial/ito/submission')
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    seed = 28
    n_folds = 5
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    model_weight_dict = {'lightgbm': 0.50, 'xgboost': 0.1, 'catboost': 0.4}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)

In [4]:
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')

In [5]:
def target_enc(df,test,col):
    features = [col for col in df.columns if col != CFG.target_col]
    kf = KFold(n_splits=CFG.n_folds,shuffle=True,random_state = CFG.seed)
    encoded_features = []

    for train_idx, val_idx in kf.split(df):
        X_train, X_valid = df[features].iloc[train_idx], df[features].iloc[val_idx]
        y_train = df[CFG.target_col].iloc[train_idx]

        target_encoder = ce.TargetEncoder()
        target_encoder.fit(X_train[col], y_train)

        X_valid[f'{col}_target_Encoded'] = target_encoder.transform(X_valid[col])
        encoded_features.append(X_valid)


    encoded_df = pd.concat(encoded_features).sort_index()
    df[f'{col}_target_Encoded'] = encoded_df[f'{col}_target_Encoded']
    
    target_encoder = ce.TargetEncoder()
    target_encoder.fit(df[[col]], df[CFG.target_col])

    test[f'{col}_target_Encoded'] = target_encoder.transform(test[[col]])
    
    return df, test

def encoder(df,test):
    object_columns = [col for col in df.columns if df[col].dtype == 'object']
    
    for col in object_columns:
        df,test = target_enc(df,test,col)
    
    df.drop(object_columns,axis=1,inplace=True)
    test.drop(object_columns,axis=1,inplace=True)
        
    return df,test

def preprocess(df,test):
    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
    df['DiffDays'] = (df['DisbursementDate'] - df['ApprovalDate']).dt.days
    days_mean = np.mean(df['DiffDays'].dropna())
    df['ApprovalToDisbursement_days'] = df['DiffDays'].fillna(days_mean).astype(int)
    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(lambda x: x.dt.strftime('%Y-%m-%d'))

    test[['ApprovalDate', 'DisbursementDate']] = test[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
    test['DiffDays'] = (test['DisbursementDate'] - test['ApprovalDate']).dt.days
    days_mean = np.mean(test['DiffDays'].dropna())
    test['ApprovalToDisbursement_days'] = test['DiffDays'].fillna(days_mean).astype(int)
    test[['ApprovalDate', 'DisbursementDate']] = test[['ApprovalDate', 'DisbursementDate']].apply(lambda x: x.dt.strftime('%Y-%m-%d'))
    
    for col in ['RevLineCr', 'LowDoc', 'BankState', 'DisbursementDate']:
        df[col] = df[col].fillna('[UNK]')
        test[col] = test[col].fillna('[UNK]')
    
    for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
        df[col] = df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        test[col] = test[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
    
    df['NewExist'] = np.where(df['NewExist'] == 1, 1, 0)
    test['NewExist'] = np.where(test['NewExist'] == 1, 1, 0)
    
    df,test = encoder(df,test)
    features = [col for col in df.columns if col != CFG.target_col]
    
    return df,test,features

In [6]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list):
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    valid_pred = model.predict(x_valid)
    
    importance_df = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importance(importance_type='gain')
    })
    importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
    importance_df = importance_df.sort_values(by='importance', ascending=False)
    print(importance_df)
    return model, valid_pred
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list):
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                feval = xgb_metric,
                maximize = CFG.metric_maximize_flag,
        )
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list):
    cat_train = Pool(data=x_train, label=y_train)
    cat_valid = Pool(data=x_valid, label=y_valid)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(cat_train,
              eval_set = [cat_valid],
              early_stopping_rounds = CFG.early_stopping_round,
              verbose = CFG.verbose,
              use_best_model = True)
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list):
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features)

        pickle.dump(model, open(CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))

        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.77, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')

    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(CFG.OOF_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

def Learning(input_df: pd.DataFrame, features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features)

In [7]:
def unified_inference(method: str, x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        model = pickle.load(open(model_path, 'rb'))

        if method == 'lightgbm':
            pred = model.predict(x_test)
        elif method == 'xgboost':
            pred = model.predict(xgb.DMatrix(x_test))
        elif method == 'catboost':
            pred = model.predict_proba(x_test)[:, 1]
        else:
            raise ValueError(f"Unsupported method: {method}")

        test_pred += pred
    
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list):
    x_test = test_df[features]
    test_pred = unified_inference(method, x_test)
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list):
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        output_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, input_df, features)
        output_df['pred_prob'] += CFG.model_weight_dict[method] * output_df[f'{method}_pred_prob']
        output_df['target'] = np.where(output_df['pred_prob'] >= 0.77, 1, 0)
    return output_df

In [8]:
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv')

In [9]:
train_df,test_df,features = preprocess(train_df,test_df)

In [10]:
Learning(train_df, features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 30230, number of negative: 3615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3247
[LightGBM] [Info] Number of data points in the train set: 33845, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.893190 -> initscore=2.123743
[LightGBM] [Info] Start training from score 2.123743
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.834992	training's f1score: 0.653433	valid_1's auc: 0.7826	valid_1's f1score: 0.631085
                            feature  importance
18              City_target_Encoded    0.263439
17      ApprovalDate_target_Encoded    0.177227
15            LowDo

In [11]:
test_df = Predicting(test_df, features)
test_df

Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,Sector,ApprovalFY,DisbursementGross,GrAppv,...,DisbursementDate_target_Encoded,ApprovalDate_target_Encoded,City_target_Encoded,State_target_Encoded,BankState_target_Encoded,pred_prob,lightgbm_pred_prob,target,xgboost_pred_prob,catboost_pred_prob
0,5,2,1,1,0,0,23,2007,25000.0,25000.0,...,0.399807,0.790942,0.480521,0.881619,0.877239,0.316286,0.311477,0,0.323284,0.320547
1,235,13,1,9,14,77725,44,2004,15000.0,15000.0,...,0.933705,0.993242,0.872251,0.907687,0.886764,0.970341,0.969379,1,0.973361,0.970788
2,31,5,0,0,0,0,56,2007,28000.0,28000.0,...,0.626667,0.917029,0.761868,0.821707,0.814519,0.459781,0.444358,0,0.529078,0.461736
3,120,4,1,0,1,0,62,1998,7500.0,7500.0,...,0.948357,0.906250,0.761868,0.821707,0.814519,0.870976,0.867382,1,0.852727,0.880029
4,63,13,1,0,8,1,42,2009,91000.0,93000.0,...,0.927509,0.906651,0.917447,0.873057,0.910638,0.959515,0.957541,1,0.959294,0.962038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42303,243,10,1,3,14,0,42,2012,390000.0,150000.0,...,0.930714,0.921549,0.891740,0.873057,0.874327,0.960306,0.957462,1,0.961104,0.963661
42304,178,0,0,0,0,1,0,2007,100000.0,100000.0,...,0.921665,0.934498,0.960784,0.907687,0.890594,0.916416,0.907266,1,0.893250,0.933645
42305,42,1,0,3,9,0,33,1989,17000.0,17000.0,...,0.967742,0.863991,0.928296,0.868835,0.925272,0.924140,0.917452,1,0.934279,0.929966
42306,76,15,1,0,0,0,0,2006,7500.0,7500.0,...,0.936047,0.907911,0.964942,0.930253,0.924855,0.966288,0.963490,1,0.966309,0.969780


In [12]:
new_index = range(42307, 42307 + len(test_df))
test_df.index = new_index
test_df[['target']].to_csv(CFG.SUB_DATA_PATH / f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

In [13]:
test_df['target'].value_counts()

1    38465
0     3843
Name: target, dtype: int64