# Import

In [1]:
%pip install pytorch-tabnet

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

Note: you may need to restart the kernel to use updated packages.


# Data Load

In [2]:
# Read the three competition CSVs from the working directory: the training set,
# the test set, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Data Preprocessing

In [3]:
# Drop the 'ID' identifier column from both frames so it does not end up in the
# feature list (every remaining non-target column is later used as a feature).
# errors='ignore' makes the cell safe to re-run once 'ID' has already been removed;
# without it a second execution raises KeyError.
train = train.drop(columns=['ID'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')

In [11]:
# Treat the founding year as a categorical feature (int -> object).
train['설립연도'] = train['설립연도'].astype('object')
test['설립연도'] = test['설립연도'].astype('object')

category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

# One LabelEncoder per categorical feature, stored so the mapping fitted on the
# training data can be reused on the test data.
encoders = {}

# Encode the categorical features. Missing values become their own 'Missing'
# category before encoding. The encoder is fitted on train only, so a category
# that appears only in test makes transform() raise.
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# Convert boolean features to 0/1 ('Yes' -> 1, 'No' -> 0).
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    for frame in (train, test):
        encoded = frame[feature].map(bool_map)
        # Values that are already numeric (e.g. this cell was executed before) are
        # not keys of bool_map and would all turn into NaN; keep their numeric
        # value instead so the cell is idempotent.
        encoded = encoded.fillna(pd.to_numeric(frame[feature], errors='coerce'))
        # Missing / unrecognised entries become 0. This matches what the training
        # loop feeds the model (it replaces NaN with 0) and guarantees that the
        # test features contain no NaN at prediction time.
        frame[feature] = encoded.fillna(0).astype(int)

# Replace missing numeric values with the training-set mean. The same mean is
# applied to test so no test statistics are used.
for feature in numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# TabNet inputs: column positions of the categorical features (cat_idxs) and the
# number of distinct encoded values per categorical feature (cat_dims).
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].nunique() for col in category_features]

  train[feature] = train[feature].fillna('Missing')
  test[feature] = test[feature].fillna('Missing')


# K-Fold Model Training

In [17]:
# Select the target column and the feature matrix.
target = train['성공확률']  
X = train[features]
y = target

# K-fold configuration: shuffled splits with a fixed random_state.
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # fitted model of every fold; reused by the prediction cell
cv_scores = []  # one validation score per fold (model.best_cost)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    
    # Convert to NumPy arrays; targets are reshaped to a single column (n, 1).
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    
    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    # Replace any NaN left in the features with 0.
    # NOTE(review): test features need the same treatment before predict().
    X_train = np.nan_to_num(X_train, nan=0)
    X_valid = np.nan_to_num(X_valid, nan=0)

    # Diagnostic only: print the largest encoded value of each categorical column
    # next to its declared cardinality. Nothing is enforced here.
    for i, idx in enumerate(cat_idxs):
        col_values = X_train[:, idx]
        max_val = col_values.max()
        expected_dim = cat_dims[i]
        print(f"Feature {features[idx]}: max value={max_val}, expected dim={expected_dim}")
    
    # Unsupervised pretraining on the training fold's features only.
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )


    # No eval_set / patience is passed here, unlike the disabled variant below.
    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )
    # Disabled alternative: the same pretraining with a validation set and patience.
    # pretrainer.fit(
    #     X_train=X_train,
    #     eval_set=[X_valid],
    #     max_epochs=100,
    #     batch_size=512,
    #     virtual_batch_size=64,
    #     patience=10
    # )

    # Supervised fine-tuning, passing the pretrainer via from_unsupervised.
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0,
        optimizer_fn=torch.optim.AdamW 
    )

    # MAE on the validation fold is the evaluation metric; patience=10 is passed
    # for early stopping.
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # Keep the fitted model in memory and record its best_cost, which is reported
    # below as the fold's MAE.
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")
print("CV Scores (MAE):", cv_scores)
print("Mean CV Score (MAE):", np.mean(cv_scores))


🔁 Fold 1/5
Feature 설립연도: max value=22.0, expected dim=23
Feature 국가: max value=9.0, expected dim=10
Feature 분야: max value=10.0, expected dim=11
Feature 투자단계: max value=4.0, expected dim=5
Feature 기업가치(백억원): max value=5.0, expected dim=6
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 30 with best_epoch = 20 and best_val_0_mae = 0.20555

🔁 Fold 2/5
Feature 설립연도: max value=22.0, expected dim=23
Feature 국가: max value=9.0, expected dim=10
Feature 분야: max value=10.0, expected dim=11
Feature 투자단계: max value=4.0, expected dim=5
Feature 기업가치(백억원): max value=5.0, expected dim=6
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 61 with best_epoch = 51 and best_val_0_mae = 0.20428

🔁 Fold 3/5
Feature 설립연도: max value=22.0, expected dim=23
Feature 국가: max value=9.0, expected dim=10
Feature 분야: max value=10.0, expected dim=11
Feature 투자단계: max value=4.0, expected dim=5
Feature 기업가치(백억원): max value=5.0, expected dim=6
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 46 with best_epoch = 36 and best_val_0_mae = 0.20185

🔁 Fold 4/5
Feature 설립연도: max value=22.0, expected dim=23
Feature 국가: max value=9.0, expected dim=10
Feature 분야: max value=10.0, expected dim=11
Feature 투자단계: max value=4.0, expected dim=5
Feature 기업가치(백억원): max value=5.0, expected dim=6
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 34 with best_epoch = 24 and best_val_0_mae = 0.20287





🔁 Fold 5/5
Feature 설립연도: max value=22.0, expected dim=23
Feature 국가: max value=9.0, expected dim=10
Feature 분야: max value=10.0, expected dim=11
Feature 투자단계: max value=4.0, expected dim=5
Feature 기업가치(백억원): max value=5.0, expected dim=6
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 31 with best_epoch = 21 and best_val_0_mae = 0.20569

✅ 모든 fold 모델 학습 완료!
CV Scores (MAE): [0.20554833059033306, 0.2042812625476292, 0.20185168299674988, 0.20286698811394827, 0.20569271735463826]
Mean CV Score (MAE): 0.20404819632065974




# K-Fold Model Prediction

In [18]:
# Predict with every fold model kept in `models`.
#
# The training loop replaced NaN with 0 in X_train / X_valid before fitting, so
# the test features must get the same treatment. Passing raw NaN values to
# predict() is the likely cause of
# "RuntimeError: index -1 is out of bounds for dimension 1 with size 12".
# The array is built once because it is identical for every fold.
X_test = np.nan_to_num(test[features].values, nan=0)

predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(X_test)
    predictions_list.append(preds)

# Average the per-fold predictions element-wise.
final_predictions = np.mean(predictions_list, axis=0)

Predict with fold 1


RuntimeError: index -1 is out of bounds for dimension 1 with size 12

# Submission

In [9]:
# Put the fold-averaged predictions into the target column of the submission
# template and write it to disk (no index column, 'utf-8-sig' encoding).
submission_path = './baseline_submission.csv'
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv(submission_path, encoding='utf-8-sig', index=False)