In [38]:
!pip install --user -r requirements.txt
import sys
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [47]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 2
    AUTHOR = 'Ito'
    COMPETITION = 'FUDA2'
    DATA_PATH = Path('data')

    METHOD_LIST = ['lightgbm']
    seed = 28
    n_folds = 5
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 1000
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }

In [48]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)

In [49]:
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

In [62]:
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv')

In [61]:
train_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,FranchiseCode,Sector,ApprovalFY,target_RevLineCr,target_LowDoc,target_City,target_State,target_BankState,target_NewExist,target_UrbanRural,MIS_Status
0,0.0,163.0,21.0,0.0,0.0,1.0,0.0,2006.0,0.923335,0.906527,0.915323,0.909984,0.941145,1.0,0.0,1
1,1.0,84.0,6.0,4.0,0.0,0.0,62.0,1992.0,0.890754,0.906527,0.912825,0.908213,0.918539,1.0,0.0,1
2,2.0,242.0,45.0,4.0,90.0,0.0,42.0,2001.0,0.923335,0.906527,0.909583,0.956743,0.967949,1.0,1.0,1
3,3.0,237.0,4.0,0.0,0.0,0.0,33.0,2004.0,0.923335,0.906527,0.966874,0.926936,0.941145,1.0,0.0,1
4,4.0,184.0,0.0,0.0,0.0,0.0,0.0,2000.0,0.923335,0.906527,0.915217,0.878349,0.876301,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,42302.0,283.0,14.0,0.0,0.0,1.0,0.0,1995.0,0.923265,0.907161,0.959432,0.907930,0.885797,1.0,0.0,1
42303,42303.0,53.0,2.0,0.0,0.0,0.0,42.0,2007.0,0.813138,0.907161,0.905512,0.881172,0.941489,1.0,1.0,1
42304,42304.0,59.0,6.0,0.0,0.0,1.0,42.0,2003.0,0.923265,0.907161,0.929987,0.906639,0.884442,2.0,0.0,1
42305,42305.0,295.0,18.0,0.0,8.0,0.0,42.0,1989.0,0.923265,0.907161,0.892631,0.805452,0.781818,1.0,0.0,1


In [53]:
import category_encoders as ce
def target_enc(df,test,col):
    kf = KFold(n_splits=CFG.n_folds,shuffle=True,random_state = CFG.seed)
    encoded_features = []

    for train_idx, val_idx in kf.split(df):
        X_train, X_valid = df[features].iloc[train_idx], df[features].iloc[val_idx]
        y_train = df[CFG.target_col].iloc[train_idx]

        target_encoder = ce.TargetEncoder()
        target_encoder.fit(X_train[col], y_train)

        X_valid[f'{col}_target_Encoded'] = target_encoder.transform(X_valid[col])
        encoded_features.append(X_valid)


    encoded_df = pd.concat(encoded_features).sort_index()
    df[f'{col}_target_Encoded'] = encoded_df[f'{col}_target_Encoded']
    
    target_encoder = ce.TargetEncoder()
    target_encoder.fit(df[[col]], df[CFG.target_col])

    test[f'{col}_target_Encoded'] = target_encoder.transform(test[[col]])
    
    return df, test

def encoder(df,test):
    object_columns = [col for col in df.columns if df[col].dtype == 'object']
    
    for col in object_columns:
        df,test = target_enc(df,test, col)
    
    df.drop(object_columns,axis=1,inplace=True)
    test.drop(object_columns,axis=1,inplace=True)
        
    return df,test

features = [col for col in train_df.columns if col != CFG.target_col]
train_df,test_df = encoder(train_df,test_df)
features = [col for col in train_df.columns if col != CFG.target_col]

In [63]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list):
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    
    importance_df = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importance(importance_type='gain')
    })
    importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
    importance_df = importance_df.sort_values(by='importance', ascending=False)
    print(importance_df)
    return model, valid_pred

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list):
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features)

        importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
        importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
        importance_df.sort_values('importance', ascending=False)

        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')

def Learning(input_df: pd.DataFrame, features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features)

In [68]:
print(train_df.columns)

Index(['ID', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'FranchiseCode',
       'Sector', 'ApprovalFY', 'target_RevLineCr', 'target_LowDoc',
       'target_City', 'target_State', 'target_BankState', 'target_NewExist',
       'target_UrbanRural', 'MIS_Status'],
      dtype='object')


In [66]:
# 'Unnamed: 0' 列名を修正
train_df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
test_df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)

In [69]:
# 'Unnamed: 0' 列名を削除
features.remove('Unnamed: 0')

# トレーニング前に features リストを再度定義
features = [col for col in train_df.columns if col != CFG.target_col]

# トレーニングを実行
Learning(train_df, features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 30230, number of negative: 3615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1795
[LightGBM] [Info] Number of data points in the train set: 33845, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.893190 -> initscore=2.123743
[LightGBM] [Info] Start training from score 2.123743
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[58]	training's auc: 0.827606	training's f1score: 0.648668	valid_1's auc: 0.771339	valid_1's f1score: 0.638862
              feature  importance
10        target_City    0.335045
9       target_LowDoc    0.161206
8    target_RevLineCr    0.131268
14  target_UrbanRural    