# LB 0.6752514

In [1]:
!pip install -r requirements.txt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

Collecting polars (from -r requirements.txt (line 2))
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/f6/c7/412912cc735bec03de751e506c3380ae393032f2e786e2a93d160acbf1dd/polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting mlflow (from -r requirements.txt (line 4))
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/b6/ae/06a299c661262aad52997e07d0e5fae5b5682401afc061fdb7c8f2103780/mlflow-2.10.0-py3-none-any.whl.metadata
  Using cached mlflow-2.10.0-py3-none-any.whl.metadata (13 kB)
Collecting kaleido>=0.2.1 (from pycaret->-r requirements.txt (line 3))
  Using cached kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
Collecting xxhash (from pycaret->-r requirements.txt (line 3))
  Obtaining dependency information for xxhash from https://files.pythonhosted.org/

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample.csv', header=None)
new_index = range(42307, 42307 + len(test))
test.index = new_index

In [3]:
def preprocess(df, replace_dict=None, ce_dict=None):

    # 借り手の会社に関する変数（Sector, FranchiseCode）
    # 31-33, 44-45, 48-49 は同じらしい => 32,33を31に, 45を44に, 49を48に変換
    code_dict = {
        32: 31,
        33: 31,
        45: 44,
        49: 48
    }
    df["Sector"] = df["Sector"].replace(code_dict)

    # 今回の借り入れに関する変数（RevLineCr, LowDoc）
    # 公式ページには値の候補が2つ（YesとNoのYN）と記載があるが、実際の値の種類は2より多い。YN以外はNaNへ置換
    revline_dict = {'0': np.nan, 'T': np.nan}
    df["RevLineCr"] = df["RevLineCr"].replace(revline_dict)

    lowdoc_dict = {'C': np.nan, '0': np.nan, 'S': np.nan, 'A': np.nan}
    df["LowDoc"] = df["LowDoc"].replace(lowdoc_dict)

    # 日付系の変数（DisbursementDate, ApprovalDate）
    # 日付型へ変更 → 年を抽出（借りた月や日にはあまり意味はないと思われるため）
    df['DisbursementDate'] = pd.to_datetime(df['DisbursementDate'], format='%d-%b-%y')
    df["DisbursementYear"] = df["DisbursementDate"].dt.year

    # 本来数値型のものを変換する
    cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
    df[cols] = df[cols].applymap(lambda x: x.strip().replace('$', '').replace(',', '')).astype(float).astype(int)

    # 特徴量エンジニアリング
    df["FY_Diff"] = df["ApprovalFY"] - df["DisbursementYear"]
    df["State_is_BankState"] = (df["State"] == df["BankState"])
    df["State_is_BankState"] = df["State_is_BankState"].replace({True: 1, False: 0})

    df['SBA_Portion'] = df['SBA_Appv'] / df['GrAppv']
    df["DisbursementGrossRatio"] = df["DisbursementGross"] / df["GrAppv"]
    df["MonthlyRepayment"] = df["GrAppv"] / df["Term"]
    df["NullCount"] = df.isnull().sum(axis=1)

    # カテゴリカル変数の設定
    df[cols_category] = df[cols_category].fillna(-1)

    # train
    if replace_dict is None:
        # countencode, labelencode
        # ce_dict: 列名を入れるとそのカテゴリのデータがどのくらいあるかを返してくれます
        # replace_dict: 列名を入れるとlabelencodeのための数字を返してくれます
        ce_dict = {}
        replace_dict = {}
        for col in cols_category:
            replace_dict[col] = {}
            vc = df[col].value_counts()
            ce_dict[col] = vc
            replace_dict_in_dict = {}
            for i, k in enumerate(vc.keys()):
                replace_dict_in_dict[k] = i
            replace_dict[col] = replace_dict_in_dict
            df[f"{col}_CountEncode"] = df[col].replace(vc).astype(int)
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df, replace_dict, ce_dict

    # test
    else:
        for col in cols_category:
            # カウントエンコード
            test_vals_uniq = df[col].unique()
            ce_dict_in_dict = ce_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in ce_dict_in_dict.keys():
                    ce_dict_in_dict[test_val] = -1
            df[f"{col}_CountEncode"] = df[col].replace(ce_dict_in_dict).astype(int)

            # LabelEncode
            test_vals_uniq = df[col].unique()
            replace_dict_in_dict = replace_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in replace_dict_in_dict.keys():
                    replace_dict_in_dict[test_val] = -1
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df
    df.drop(["DisbursementDate", "ApprovalDate","City"], axis=1, inplace=True)

In [4]:
cols_category = []
for col in df.columns:
    if df[col].dtypes == 'object':
        cols_category.append(col)

In [5]:
df, replace_dict, ce_dict = preprocess(df)

test_df = preprocess(test, replace_dict, ce_dict)

In [6]:
X = df.drop('MIS_Status',axis=1)
y = df['MIS_Status']

In [7]:
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(zip(np.unique(y), weights))

In [8]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold
import statistics
from scipy.stats import mode

In [9]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import statistics
from scipy.stats import mode

average_f1 = []
average_roc_auc_score = []
predictions = []
skf = StratifiedKFold(n_splits=5)
for train_idx, val_idx in skf.split(X,y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    params = {
        'objective': 'binary',
        'metric': 'f1',
        'class_weight': class_weights,
        "n_estimators": 3000,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,
        "subsample_freq": 1,
        "subsample": 0.8,
        "random_seed": 0,
        'verbose': -1,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    y_pred = model.predict(X_val)
    mean_f1 = f1_score(y_val, y_pred, average='macro')
    roc_auc = roc_auc_score(y_val, y_pred)
    average_f1.append(mean_f1)
    average_roc_auc_score.append(roc_auc)
    feature_gains = model.booster_.feature_importance(importance_type='gain')
    feature = pd.DataFrame({'feature': X.columns, 'importance': feature_gains})
    feature.sort_values(by='importance',ascending=False)
    print(feature)
    print(f"Mean F1 Score (Macro F1 Score) : {mean_f1}")
    print(f"ROC_AUC_Score : {roc_auc}")
    
    
average_f1 = statistics.mean(average_f1)
average_roc_auc_score = statistics.mean(average_roc_auc_score)
print(' ')
print(f'Average_f1marco : {average_f1}')
print(f'Average_roc_auc : {average_roc_auc_score}')

                          feature     importance
0                            Term   79171.405820
1                           NoEmp   91309.713784
2                        NewExist    6219.099139
3                       CreateJob   29779.766485
4                     RetainedJob   35527.159718
5                   FranchiseCode   20953.661209
6                       RevLineCr   16984.197553
7                          LowDoc   41247.381439
8                DisbursementDate   65284.307626
9                          Sector   33056.961220
10                   ApprovalDate   84301.015760
11                     ApprovalFY   36213.887647
12                           City   69761.493505
13                          State   35324.321803
14                      BankState   34186.859103
15              DisbursementGross   53620.676730
16                         GrAppv   42374.873174
17                       SBA_Appv   59151.750692
18                     UrbanRural  126512.161096
19               Dis

In [10]:
from sklearn import metrics
def decide_cutoff(val_y, preds_y_proba):
    mean_f1_list = []
    fpr, tpr, thresholds = metrics.roc_curve(val_y, preds_y_proba)
    for threshold in thresholds:
        preds_y = [1 if prob > threshold else 0 for prob in preds_y_proba]
        mean_f1_list.append(f1_score(val_y, preds_y, average='macro'))
    return np.max(mean_f1_list), thresholds[np.argmax(mean_f1_list)]

In [11]:
list_metrics_auc = []
list_metrics_f1 = []
list_cutoff = []
list_models = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    trn_x = X.iloc[trn_idx, :]
    trn_y = y[trn_idx]
    val_x = X.iloc[val_idx, :]
    val_y = y[val_idx]
    model_lgb = lgb.LGBMClassifier(**params)
    
    model_lgb.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
    )
    list_models.append(model_lgb)
    preds_y_proba = model_lgb.predict_proba(val_x)[:, 1]
    auc = roc_auc_score(val_y, preds_y_proba)
    f1, threshold = decide_cutoff(val_y, preds_y_proba)
    list_metrics_auc.append(auc)
    list_metrics_f1.append(f1)
    list_cutoff.append(threshold)
    print(f"Fold: {fold}, AUC: {auc}, f1 score: {f1} Threshold: {threshold}")

Fold: 1, AUC: 0.7620549088114816, f1 score: 0.6681857310163786 Threshold: 0.3401424796390054
Fold: 2, AUC: 0.7771182859468214, f1 score: 0.6761518490734275 Threshold: 0.23555850097892836
Fold: 3, AUC: 0.774128705927084, f1 score: 0.6764122165629486 Threshold: 0.33893217207184473


In [12]:
threshold = np.median(list_cutoff)
preds_y_proba = np.zeros(len(test))
for model in list_models:
    preds_y_proba += model.predict_proba(test[model.feature_name_])[:, 1] / len(list_models)
preds_y = [1 if prob > threshold else 0 for prob in preds_y_proba]

In [13]:
sample[1] = preds_y
sample[1] = sample[1].astype(int)
#sample.to_csv("submission.csv", header=False, index=False)