In [1]:
!pip install lightgbm
!pip install pytorch_tabnet
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from pytorch_tabnet.tab_model import TabNetClassifier

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/ba/11/cb8b67f3cbdca05b59a032bb57963d4fe8c8d18c3870f30bed005b7f174d/lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata
  Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0
Collecting pytorch_tabnet
  Obtaining dependency information for pytorch_tabnet from https://files.pythonhosted.org/packages/0f/92/ed98b89b7cf5661656daa4cc88e578f712eb5eae41b8f46a56c1ece3a895/pytorch_tabnet-4.1.0-py3-none-any.whl.metadata
  Using cached pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting torch>=1.3 (from pytorch_tabnet)
  Obtaining dependency information for torch>=1.3 from https://files.pythonhosted.org/packages/03/f1/13137340776dd5d5bcfd2574c9c6dfcc7618285035cd77240496e5c1a79b/torch-2.1.2-cp

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
new_index = range(42307, 42307 + len(test))
test.index = new_index

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42307 entries, 0 to 42306
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Term               42307 non-null  int64  
 1   NoEmp              42307 non-null  int64  
 2   NewExist           42307 non-null  float64
 3   CreateJob          42307 non-null  int64  
 4   RetainedJob        42307 non-null  int64  
 5   FranchiseCode      42307 non-null  int64  
 6   RevLineCr          41228 non-null  object 
 7   LowDoc             41776 non-null  object 
 8   DisbursementDate   42157 non-null  object 
 9   MIS_Status         42307 non-null  int64  
 10  Sector             42307 non-null  int64  
 11  ApprovalDate       42307 non-null  object 
 12  ApprovalFY         42307 non-null  int64  
 13  City               42307 non-null  object 
 14  State              42307 non-null  object 
 15  BankState          42296 non-null  object 
 16  DisbursementGross  423

In [4]:
def preprocess(df, replace_dict=None, ce_dict=None):

    # 借り手の会社に関する変数（Sector, FranchiseCode）
    # 31-33, 44-45, 48-49 は同じらしい => 32,33を31に, 45を44に, 49を48に変換
    code_dict = {
        32: 31,
        33: 31,
        45: 44,
        49: 48
    }
    df["Sector"] = df["Sector"].replace(code_dict)

    # 今回の借り入れに関する変数（RevLineCr, LowDoc）
    # 公式ページには値の候補が2つ（YesとNoのYN）と記載があるが、実際の値の種類は2より多い。YN以外はNaNへ置換
    revline_dict = {'0': np.nan, 'T': np.nan}
    df["RevLineCr"] = df["RevLineCr"].replace(revline_dict)

    lowdoc_dict = {'C': np.nan, '0': np.nan, 'S': np.nan, 'A': np.nan}
    df["LowDoc"] = df["LowDoc"].replace(lowdoc_dict)

    # 日付系の変数（DisbursementDate, ApprovalDate）
    # 日付型へ変更 → 年を抽出（借りた月や日にはあまり意味はないと思われるため）
    df['DisbursementDate'] = pd.to_datetime(df['DisbursementDate'], format='%d-%b-%y')
    df["DisbursementYear"] = df["DisbursementDate"].dt.year

    # 本来数値型のものを変換する
    cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
    df[cols] = df[cols].applymap(lambda x: x.strip().replace('$', '').replace(',', '')).astype(float).astype(int)

    # 特徴量エンジニアリング
    df["FY_Diff"] = df["ApprovalFY"] - df["DisbursementYear"]
    df["State_is_BankState"] = (df["State"] == df["BankState"])
    df["State_is_BankState"] = df["State_is_BankState"].replace({True: 1, False: 0})

    df['SBA_Portion'] = df['SBA_Appv'] / df['GrAppv']
    df["DisbursementGrossRatio"] = df["DisbursementGross"] / df["GrAppv"]
    df["MonthlyRepayment"] = df["GrAppv"] / df["Term"]
    df["NullCount"] = df.isnull().sum(axis=1)

    # カテゴリカル変数の設定
    df[cols_category] = df[cols_category].fillna(-1)

    # train
    if replace_dict is None:
        # countencode, labelencode
        # ce_dict: 列名を入れるとそのカテゴリのデータがどのくらいあるかを返してくれます
        # replace_dict: 列名を入れるとlabelencodeのための数字を返してくれます
        ce_dict = {}
        replace_dict = {}
        for col in cols_category:
            replace_dict[col] = {}
            vc = df[col].value_counts()
            ce_dict[col] = vc
            replace_dict_in_dict = {}
            for i, k in enumerate(vc.keys()):
                replace_dict_in_dict[k] = i
            replace_dict[col] = replace_dict_in_dict
            df[f"{col}_CountEncode"] = df[col].replace(vc).astype(int)
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df, replace_dict, ce_dict

    # test
    else:
        for col in cols_category:
            # カウントエンコード
            test_vals_uniq = df[col].unique()
            ce_dict_in_dict = ce_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in ce_dict_in_dict.keys():
                    ce_dict_in_dict[test_val] = -1
            df[f"{col}_CountEncode"] = df[col].replace(ce_dict_in_dict).astype(int)

            # LabelEncode
            test_vals_uniq = df[col].unique()
            replace_dict_in_dict = replace_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in replace_dict_in_dict.keys():
                    replace_dict_in_dict[test_val] = -1
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df
    df.drop(["DisbursementDate", "ApprovalDate","City"], axis=1, inplace=True)

In [5]:
cols_category = []
for col in df.columns:
    if df[col].dtypes == 'object':
        cols_category.append(col)

In [6]:
df, replace_dict, ce_dict = preprocess(df)

test_df = preprocess(test, replace_dict, ce_dict)

In [19]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
import numpy as np

class F1Metric(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        y_pred = np.argmax(y_score, axis=1)
        return f1_score(y_true, y_pred, average='macro')

In [8]:
df = df.replace([np.inf, -np.inf], np.nan) 
df_dropna = df.dropna(axis=0)
X = df_dropna.drop('MIS_Status',axis=1)
y = df_dropna['MIS_Status']

In [9]:
inf_rows = df_dropna.isin([np.inf, -np.inf]).any(axis=1)
inf_cols = df_dropna.isin([np.inf, -np.inf]).any(axis=0)

print("無限大の値を含む行:")
print(X[inf_rows])

print("無限大の値を含む列:")
print(inf_cols)

無限大の値を含む行:
Empty DataFrame
Columns: [Term, NoEmp, NewExist, CreateJob, RetainedJob, FranchiseCode, RevLineCr, LowDoc, DisbursementDate, Sector, ApprovalDate, ApprovalFY, City, State, BankState, DisbursementGross, GrAppv, SBA_Appv, UrbanRural, DisbursementYear, FY_Diff, State_is_BankState, SBA_Portion, DisbursementGrossRatio, MonthlyRepayment, NullCount, RevLineCr_CountEncode, LowDoc_CountEncode, DisbursementDate_CountEncode, ApprovalDate_CountEncode, City_CountEncode, State_CountEncode, BankState_CountEncode, DisbursementGross_CountEncode, GrAppv_CountEncode, SBA_Appv_CountEncode]
Index: []

[0 rows x 36 columns]
無限大の値を含む列:
Term                             False
NoEmp                            False
NewExist                         False
CreateJob                        False
RetainedJob                      False
FranchiseCode                    False
RevLineCr                        False
LowDoc                           False
DisbursementDate                 False
MIS_Status       

In [10]:
from sklearn.preprocessing import MinMaxScaler
X.replace([np.inf, -np.inf], np.mean, inplace=True)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

In [11]:
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(zip(np.unique(y), weights))

In [31]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

average_f1 = []
predictions = []
skf = StratifiedKFold(n_splits=5)
f1_metric = F1Metric()

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx].values, X.iloc[val_idx].values
    y_train, y_val = y.iloc[train_idx].values, y.iloc[val_idx].values
    
    # TabNetモデルの設定
    model = TabNetClassifier(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size":50, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax' # "sparsemax"もオプション
    )

    # モデルの訓練
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        max_epochs=50,
        patience=100,
        batch_size=256, 
        virtual_batch_size=128,
        num_workers=0,
        weights=class_weights,
        drop_last=False
    )

    y_pred = model.predict(X_val)
    mean_f1 = f1_score(y_val, y_pred, average='macro')
    average_f1.append(mean_f1)

    feature_importances_ = model.feature_importances_
    feature = pd.DataFrame({'feature': X.columns, 'importance': feature_importances_})
    feature.sort_values(by='importance', ascending=False, inplace=True)
    print(feature)
    print("Mean F1 Score (Macro F1 Score):", mean_f1)

average_f1 = np.mean(average_f1)
print(' ')
print(f'Average F1 Score (Macro F1 Score): {average_f1}')

epoch 0  | loss: 0.64009 | val_0_auc: 0.70667 |  0:00:03s
epoch 1  | loss: 0.6048  | val_0_auc: 0.73119 |  0:00:07s
epoch 2  | loss: 0.58924 | val_0_auc: 0.74313 |  0:00:11s
epoch 3  | loss: 0.58406 | val_0_auc: 0.75015 |  0:00:15s
epoch 4  | loss: 0.57859 | val_0_auc: 0.75152 |  0:00:19s
epoch 5  | loss: 0.57929 | val_0_auc: 0.75835 |  0:00:23s
epoch 6  | loss: 0.57322 | val_0_auc: 0.75443 |  0:00:26s
epoch 7  | loss: 0.56774 | val_0_auc: 0.76175 |  0:00:30s
epoch 8  | loss: 0.57345 | val_0_auc: 0.75657 |  0:00:34s
epoch 9  | loss: 0.55865 | val_0_auc: 0.76133 |  0:00:38s
epoch 10 | loss: 0.56258 | val_0_auc: 0.7611  |  0:00:42s
epoch 11 | loss: 0.55672 | val_0_auc: 0.75037 |  0:00:46s
epoch 12 | loss: 0.56239 | val_0_auc: 0.7564  |  0:00:50s
epoch 13 | loss: 0.55343 | val_0_auc: 0.76181 |  0:00:53s
epoch 14 | loss: 0.54918 | val_0_auc: 0.76098 |  0:00:57s
epoch 15 | loss: 0.54994 | val_0_auc: 0.76145 |  0:01:01s
epoch 16 | loss: 0.54265 | val_0_auc: 0.76087 |  0:01:05s
epoch 17 | los