In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# データの読み込み
data = pd.read_csv('../data/processed_train_data.csv')
# 特徴量とターゲットの分離
X = data.drop(columns=['MIS_Status'])
y = data['MIS_Status']

# トレーニングセットとテストセットに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TabNetモデルの初期化
model = TabNetClassifier()

# モデルのトレーニング
model.fit(X_train.values, y_train.values, max_epochs=100, batch_size=256, virtual_batch_size=128)

# モデルの評価
preds = model.predict(X_test.values)
accuracy = (preds == y_test.values).mean()
print(f"Accuracy: {accuracy}")



epoch 0  | loss: 0.34539 |  0:00:03s
epoch 1  | loss: 0.31558 |  0:00:06s
epoch 2  | loss: 0.31415 |  0:00:09s
epoch 3  | loss: 0.31359 |  0:00:12s
epoch 4  | loss: 0.31018 |  0:00:15s
epoch 5  | loss: 0.30481 |  0:00:18s
epoch 6  | loss: 0.30393 |  0:00:20s
epoch 7  | loss: 0.30401 |  0:00:23s
epoch 8  | loss: 0.30421 |  0:00:25s
epoch 9  | loss: 0.30496 |  0:00:28s
epoch 10 | loss: 0.30085 |  0:00:30s
epoch 11 | loss: 0.29948 |  0:00:33s
epoch 12 | loss: 0.29762 |  0:00:35s
epoch 13 | loss: 0.29881 |  0:00:38s
epoch 14 | loss: 0.30012 |  0:00:41s
epoch 15 | loss: 0.29781 |  0:00:43s
epoch 16 | loss: 0.29702 |  0:00:46s
epoch 17 | loss: 0.29541 |  0:00:48s
epoch 18 | loss: 0.29466 |  0:00:51s
epoch 19 | loss: 0.29421 |  0:00:53s
epoch 20 | loss: 0.2936  |  0:00:56s
epoch 21 | loss: 0.29289 |  0:00:58s
epoch 22 | loss: 0.2933  |  0:01:01s
epoch 23 | loss: 0.29237 |  0:01:04s
epoch 24 | loss: 0.29251 |  0:01:06s
epoch 25 | loss: 0.29205 |  0:01:09s
epoch 26 | loss: 0.29255 |  0:01:11s
e

In [17]:
test = pd.read_csv('../data/processed_test_data_v3.csv')
test = test.drop(['City','State','BankState'],axis=1)
test.dtypes

Term                      float64
NoEmp                     float64
NewExist                  float64
CreateJob                 float64
RetainedJob               float64
FranchiseCode             float64
RevLineCr                 float64
LowDoc                    float64
Sector                    float64
ApprovalFY                float64
DisbursementGross         float64
GrAppv                    float64
SBA_Appv                  float64
UrbanRural                float64
DisbursementDate_Year     float64
DisbursementDate_Month    float64
DisbursementDate_Day      float64
ApprovalDate_Year         float64
ApprovalDate_Month        float64
ApprovalDate_Day          float64
dtype: object

In [18]:
# モデルを使用して予測
predictions = model.predict(test.values)
predictions

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [19]:
submit = pd.read_csv('../data/sample_submission.csv', header=None)

# 予測結果の組み込み
submit.iloc[:, 1] = predictions

# 提出用ファイルの保存
filename = 'tabnet_predictions'
submit.to_csv('outputs/' + filename + '.csv', index=False, header=None)

In [20]:
# 特徴量の重要度の取得
feature_importances = model.feature_importances_

# 特徴量の重要度を表示
for feature, importance in zip(X_train.columns, feature_importances):
    print(f"{feature}: {importance}")

Term: 0.09001853805388511
NoEmp: 0.2502581749596592
NewExist: 0.04762007764732473
CreateJob: 0.0
RetainedJob: 0.0
FranchiseCode: 0.0
RevLineCr: 0.15263194485933346
LowDoc: 0.0
Sector: 0.0024625466889392697
ApprovalFY: 0.0
DisbursementGross: 0.03337732478565205
GrAppv: 0.0
SBA_Appv: 0.0
UrbanRural: 0.015252136022041804
DisbursementDate_Year: 0.4083792569831643
DisbursementDate_Month: 0.0
DisbursementDate_Day: 0.0
ApprovalDate_Year: 0.0
ApprovalDate_Month: 0.0
ApprovalDate_Day: 0.0
