# Import

In [45]:
%pip install pytorch-tabnet

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

import matplotlib.pyplot as plt


Note: you may need to restart the kernel to use updated packages.


# Data Load

In [46]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data Preprocessing

In [47]:
#특성과 타겟 변수 분리
train = train.drop(columns=['ID'], axis = 1)
test = test.drop(columns=['ID'], axis = 1)

In [66]:
# 1. 설립연도 → 나이 변환
CURRENT_YEAR = 2025
train['나이'] = CURRENT_YEAR - train['설립연도']
test['나이'] = CURRENT_YEAR - test['설립연도']

# 2. 국가 Encoding
le_country = LabelEncoder()
train['국가'] = le_country.fit_transform(train['국가'].fillna('Missing'))
test['국가'] = le_country.transform(test['국가'].fillna('Missing'))

# 3. 분야 점수화
분야_점수 = {
    '핀테크': 8,
    0: 8,               # NaN 처리된 0
    '기술': 8,
    '물류': 7,
    '에듀테크': 7,
    'AI': 7,
    '푸드테크': 7,
    '게임': 6,
    '에너지': 6,
    '이커머스': 6,
    '헬스케어': 6
}

# 분야 점수 맵핑
train['분야점수'] = train['분야'].map(분야_점수)
test['분야점수'] = test['분야'].map(분야_점수)

# 혹시 NaN 있으면 0으로
train['분야점수'] = train['분야점수'].fillna(0)
test['분야점수'] = test['분야점수'].fillna(0)

# 4. 투자단계 수치화
stage_mapping = {'Seed': 0, 'Series A': 1, 'Series B': 2, 'Series C': 3, 'IPO': 4}
train['투자단계'] = train['투자단계'].map(stage_mapping).fillna(0)
test['투자단계'] = test['투자단계'].map(stage_mapping).fillna(0)

# 5. 인수/상장 여부 0/1 매핑
bool_map = {'Yes': 1, 'No': 0}
for feature in ['인수여부', '상장여부']:
    train[feature] = train[feature].map(bool_map).fillna(0)
    test[feature] = test[feature].map(bool_map).fillna(0)

# 6. 투자대비매출 Feature 추가
train['투자대비매출'] = train['연매출(억원)'].fillna(0) / (train['총 투자금(억원)'].fillna(0) + 1)
test['투자대비매출'] = test['연매출(억원)'].fillna(0) / (test['총 투자금(억원)'].fillna(0) + 1)

# 7. SNS 팔로워 수 구간화
def sns_bin(x):
    if x < 1:
        return 0
    elif x < 3:
        return 1
    elif x < 5:
        return 2
    else:
        return 3

train['SNS팔로워구간'] = train['SNS 팔로워 수(백만명)'].apply(lambda x: sns_bin(x) if not pd.isna(x) else 0)
test['SNS팔로워구간'] = test['SNS 팔로워 수(백만명)'].apply(lambda x: sns_bin(x) if not pd.isna(x) else 0)

# 8. 기업가치 수치화
def parse_value(x):
    if pd.isnull(x):
        return 0
    elif '6000' in str(x):
        return 6
    elif '4500' in str(x):
        return 5
    elif '3500' in str(x):
        return 4
    elif '2500' in str(x):
        return 3
    elif '1500' in str(x):
        return 2
    else:
        return 1

train['기업가치_클래스'] = train['기업가치(백억원)'].apply(parse_value)
test['기업가치_클래스'] = test['기업가치(백억원)'].apply(parse_value)

train['직원수대비매출'] = train['연매출(억원)'] / (train['직원 수'] + 1)
test['직원수대비매출'] = test['연매출(억원)'] / (test['직원 수'] + 1)

train['투자금대비매출'] = train['연매출(억원)'] / (train['총 투자금(억원)'] + 1)
test['투자금대비매출'] = test['연매출(억원)'] / (test['총 투자금(억원)'] + 1)

train['고객수대비매출'] = train['연매출(억원)'] / (train['고객수(백만명)'] + 1)
test['고객수대비매출'] = test['연매출(억원)'] / (test['고객수(백만명)'] + 1)


# 9. 결측치는 모두 0으로 대체
train = train.fillna(0)
test = test.fillna(0)

# 10. 최종 feature 리스트
features = [
    '나이', '국가', '분야점수', '투자단계', '직원 수',
    '인수여부', '상장여부', '고객수(백만명)', '투자대비매출',
    'SNS팔로워구간', '기업가치_클래스', '직원수대비매출', '투자금대비매출', '고객수대비매출'
]

# 11. TabNet용 cat_idxs, cat_dims 설정
category_features = ['국가', '분야점수', '투자단계', 'SNS팔로워구간', '기업가치_클래스']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [int(train[col].max()) + 2 for col in category_features]


models = []  # 모델 저장할 리스트

# 학습용 코드 (간단 버전)
model = TabNetRegressor(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    seed=42222,
    verbose=0
)

model.fit(
    X_train=train[features].values,
    y_train=train['성공확률'].values.reshape(-1, 1),
    max_epochs=50,
    patience=10,
    batch_size=512,
    virtual_batch_size=128,
    eval_metric=['mae']
)
models.append(model) 
# 예를 들어 모델 하나 (fold 0번 모델)로 해볼게
model = models[0]

# 1. Feature Importance 가져오기
feature_importance = model.feature_importances_

# 2. Feature 이름과 importance 매칭
importance_df = pd.DataFrame({
    'feature': features,
    'importance': feature_importance
})

# 3. 중요도 기준 내림차순 정렬
importance_df = importance_df.sort_values(by='importance', ascending=False)

# 4. 결과 출력
print(importance_df)

분야별_성공률 = train.groupby('분야')['성공확률'].mean().sort_values(ascending=False)
print(분야별_성공률)




     feature  importance
4       직원 수    0.318028
10  기업가치_클래스    0.141000
8     투자대비매출    0.102152
1         국가    0.101361
9   SNS팔로워구간    0.075463
7   고객수(백만명)    0.071945
13   고객수대비매출    0.065618
0         나이    0.059264
12   투자금대비매출    0.021470
2       분야점수    0.014619
11   직원수대비매출    0.013443
6       상장여부    0.008346
3       투자단계    0.004768
5       인수여부    0.002525
분야
핀테크     0.567151
0       0.552042
기술      0.540103
물류      0.539939
에듀테크    0.539011
AI      0.537079
푸드테크    0.533731
게임      0.532869
에너지     0.529545
이커머스    0.520482
헬스케어    0.493671
Name: 성공확률, dtype: float64


# K-Fold Model Training

In [65]:
# 1. 타겟 지정
target = train['성공확률']  
X = train[features]
y = target

# 2. KFold 설정
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # 모델 저장 리스트
cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    
    X_train = X.iloc[train_idx].values   # 무조건 .values 붙여
    y_train = y.iloc[train_idx].values.reshape(-1, 1)

    X_valid = X.iloc[valid_idx].values   # 무조건 .values 붙여
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)
    
    # for i, idx in enumerate(cat_idxs):
    #     col_values = X_train[:, idx]
    #     max_val = col_values.max()
    #     expected_dim = cat_dims[i]
    #     print(f"Feature {features[idx]}: max value={max_val}, expected dim={expected_dim}")

    # 3. 비지도 사전학습 (Pretraining)
    print("▶ Pretraining...")
    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42 + fold,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        eval_set=[X_valid],
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64,
        patience=10
    )
    
    # 4. 지도 Fine-tuning
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42 + fold,
        verbose=0,
        optimizer_fn=torch.optim.AdamW
    )

    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # 5. 모델 저장
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")
print("CV Scores (MAE):", cv_scores)
print("Mean CV Score (MAE):", np.mean(cv_scores))


🔁 Fold 1/5
▶ Pretraining...

Early stopping occurred at epoch 24 with best_epoch = 14 and best_val_0_unsup_loss_numpy = 10107287.0
▶ Fine-tuning...





Early stopping occurred at epoch 21 with best_epoch = 11 and best_val_0_mae = 0.22078

🔁 Fold 2/5
▶ Pretraining...





Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_unsup_loss_numpy = 1155383689216.0
▶ Fine-tuning...





Early stopping occurred at epoch 56 with best_epoch = 46 and best_val_0_mae = 0.20788

🔁 Fold 3/5
▶ Pretraining...





Early stopping occurred at epoch 53 with best_epoch = 43 and best_val_0_unsup_loss_numpy = 3255687.5
▶ Fine-tuning...





Early stopping occurred at epoch 31 with best_epoch = 21 and best_val_0_mae = 0.20303

🔁 Fold 4/5
▶ Pretraining...





Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_unsup_loss_numpy = 1876983808000.0
▶ Fine-tuning...





Early stopping occurred at epoch 57 with best_epoch = 47 and best_val_0_mae = 0.20281

🔁 Fold 5/5
▶ Pretraining...





Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_unsup_loss_numpy = -40.24195861816406
▶ Fine-tuning...





Early stopping occurred at epoch 37 with best_epoch = 27 and best_val_0_mae = 0.20727

✅ 모든 fold 모델 학습 완료!
CV Scores (MAE): [0.22077590191064905, 0.20787698324748446, 0.2030319442340306, 0.20280840533801486, 0.20726974452563696]
Mean CV Score (MAE): 0.20835259585116317




# K-Fold Model Prediction

In [69]:
# 저장된 모델들로 예측
predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test[features].values)
    predictions_list.append(preds)

# 평균 예측
final_predictions = np.mean(predictions_list, axis=0)

Predict with fold 1


# Submission

In [70]:
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')