In [None]:
import os
import random
import re
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [None]:
# Use the GPU when CUDA is available (e.g. a Kaggle GPU session), otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed every random number generator used by the notebook.
SEED = 42
for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    seed_fn(SEED)

# Load the competition files from the Kaggle input directory.
data_path = "/kaggle/input/dataset-enterprise"
train, test, sample_submission = [
    pd.read_csv(data_path + "/" + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
]

In [None]:
# Remove the 'ID' column from both frames before feature engineering.
train = train.drop(labels=['ID'], axis='columns')
test = test.drop(labels=['ID'], axis='columns')

In [None]:
#기업가치 구간화
def parse_value(x):
    """Map a company-valuation label to an ordinal class in 0..6.

    The label is scanned left to right for numeric tokens and the FIRST token
    that equals one of the known thresholds decides the class:

        '1500' -> 2, '2500' -> 3, '3500' -> 4, '4500' -> 5, '6000' -> 6

    Missing values map to 0 and labels without a known threshold map to 1.

    The previous implementation used substring checks ordered from the highest
    threshold down, so a range label such as '4500-6000' was classified by its
    upper bound and collided with the open-ended '6000...' label (both gave 6).
    Matching the first threshold keeps every range distinct.

    NOTE(review): assumes labels look like '4500-6000' or '6000이상'
    (lower bound first) — confirm against the actual column values.

    Args:
        x: Raw label (string or number); may be None/NaN.

    Returns:
        int: Ordinal valuation class between 0 and 6.
    """
    if pd.isnull(x):
        return 0

    class_by_threshold = {'1500': 2, '2500': 3, '3500': 4, '4500': 5, '6000': 6}
    # Whole-token matching (not substring) so that e.g. '16000' is not read as '6000'.
    for token in re.findall(r'\d+', str(x)):
        if token in class_by_threshold:
            return class_by_threshold[token]
    return 1

#라벨 NaN값 처리
def fit_label_encoder(train_series, test_series):
    """Label-encode a categorical column consistently across train and test.

    Missing values in both series are replaced by the literal 'Missing' and all
    values are cast to str. The encoder is fitted on the concatenation of both
    series, so every value seen in either one receives a code.

    Args:
        train_series: Training column (pandas Series).
        test_series: Test column (pandas Series).

    Returns:
        tuple: (fitted LabelEncoder, encoded train values, encoded test values).
    """
    filled_train, filled_test = (
        part.fillna('Missing').astype(str) for part in (train_series, test_series)
    )

    le = LabelEncoder().fit(pd.concat([filled_train, filled_test]))

    return le, le.transform(filled_train), le.transform(filled_test)

In [None]:
# Convert the founding year into a company age relative to a fixed reference year.
CURRENT_YEAR = 2025
for frame in (train, test):
    frame['나이'] = CURRENT_YEAR - frame['설립연도']

# Disabled alternative: mean imputation of selected numeric columns.
# numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
# for feature in numeric_features:
#     mean_value = train[feature].mean()
#     train[feature] = train[feature].fillna(mean_value)
#     test[feature] = test[feature].fillna(mean_value)

# 1. KNN imputation of every numeric column except the target.
from sklearn.impute import KNNImputer
numeric_cols = [
    col for col in train.select_dtypes(include=[np.number]).columns
    if col != '성공확률'
]

# The imputer is fitted on train only and then applied unchanged to test.
imputer = KNNImputer(n_neighbors=5)
train_num = pd.DataFrame(imputer.fit_transform(train[numeric_cols]), columns=numeric_cols)
test_num = pd.DataFrame(imputer.transform(test[numeric_cols]), columns=numeric_cols)

# Keep the remaining (non-numeric) columns separately.
train_cat = train.drop(columns=numeric_cols + ['성공확률'])
test_cat = test.drop(columns=numeric_cols)

# Reassemble the frames column-wise; every part gets a fresh positional index
# so the pieces line up row by row.
train_parts = [train_cat, train_num, train['성공확률']]
test_parts = [test_cat, test_num]
train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)


# Label-encode the nominal columns with encoders fitted on train + test.
le_country, train_country_codes, test_country_codes = fit_label_encoder(train['국가'], test['국가'])
train['국가'], test['국가'] = train_country_codes, test_country_codes
le_field, train_field_codes, test_field_codes = fit_label_encoder(train['분야'], test['분야'])
train['분야'], test['분야'] = train_field_codes, test_field_codes

# Lookup tables for the ordinal investment stage and the Yes/No flags.
stage_mapping = {'Seed': 0, 'Series A': 1, 'Series B': 2, 'Series C': 3, 'IPO': 4}
bool_map = {'Yes': 1, 'No': 0}

for frame in (train, test):
    # Investment stage -> ordinal code; unmapped or missing stages become 0.
    frame['투자단계'] = frame['투자단계'].map(stage_mapping).fillna(0)

    # Valuation label -> ordinal class.
    frame['기업가치_클래스'] = frame['기업가치(백억원)'].apply(parse_value)

    # log1p compresses the range of revenue and total investment.
    frame['log_연매출'] = np.log1p(frame['연매출(억원)'])
    frame['log_투자금'] = np.log1p(frame['총 투자금(억원)'])

    # Yes/No flags -> 1/0; unmapped or missing values become 0.
    for feature in ['인수여부', '상장여부']:
        frame[feature] = frame[feature].map(bool_map).fillna(0)

# SNS 구간화
def sns_bin(x):
    """Bucket an SNS follower count into one of four ordinal bins.

    Bins: below 1 -> 0, [1, 3) -> 1, [3, 5) -> 2, 5 and above -> 3.
    Missing values fall into bin 0.
    """
    if pd.isna(x):
        return 0
    # The first upper edge that x stays below determines the bin.
    for level, upper_edge in enumerate((1, 3, 5)):
        if x < upper_edge:
            return level
    return 3
# Add the binned follower-count column to both frames.
for frame in (train, test):
    frame['SNS팔로워구간'] = frame['SNS 팔로워 수(백만명)'].apply(sns_bin)


In [None]:
# Derived ratio features. Each one is numerator / (denominator + 1); the +1
# keeps the denominator away from zero when the denominator column is 0.
ratio_specs = [
    # (new column, numerator column, denominator column)
    ('투자대비매출', 'log_연매출', 'log_투자금'),
    ('직원당투자금', 'log_투자금', '직원 수'),
    ('고객당매출', 'log_연매출', '고객수(백만명)'),
    ('기업가치대비매출', 'log_연매출', '기업가치_클래스'),
    ('설립후투자', 'log_투자금', '나이'),
    ('가치대비단계', '기업가치_클래스', '투자단계'),
    ('설립기간매출', 'log_연매출', '나이'),
    ('설립기간고객', '고객수(백만명)', '나이'),
    ('SNS대비투자', 'SNS 팔로워 수(백만명)', 'log_투자금'),
]

for frame in (train, test):
    for new_col, numerator, denominator in ratio_specs:
        frame[new_col] = frame[numerator] / (frame[denominator] + 1)

In [None]:
# Columns fed to the models, in the order used to build the feature matrices.
features = [
    '나이', '국가', '분야', '투자단계', '직원 수', '고객수(백만명)', '총 투자금(억원)', '연매출(억원)',
    'SNS팔로워구간', '기업가치_클래스', '인수여부', '상장여부',
    '투자대비매출', '직원당투자금', '고객당매출',
    '기업가치대비매출', '설립후투자',
    '가치대비단계', '설립기간매출', '설립기간고객', 'SNS대비투자'
]

# Categorical columns, their positions inside `features`, and their cardinalities.
cat_features = ['국가', '분야', '투자단계']
cat_idxs = [features.index(col) for col in cat_features]
cat_dims = [
    len(le_country.classes_),
    len(le_field.classes_),
    # Size of the full stage code range (0..len-1). Counting the unique values
    # observed in train would undercount whenever a stage is absent from train,
    # leaving higher codes outside the declared dimension.
    len(stage_mapping),
]

In [None]:
# # 모델 훈련
# X = train[features].values
# y = train['성공확률'].values.reshape(-1, 1)
# X_test = test[features].values

# # TabNet 모델 훈련 (GPU 사용)
# from sklearn.metrics import mean_absolute_error
# kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
# cv_scores = []
# tabnet_preds = []

# for train_idx, valid_idx in kf.split(X):
#     model = TabNetRegressor(
#     n_d=32, n_a=32, n_steps=5, gamma=1.5, seed=SEED+10,
#     cat_idxs=cat_idxs,
#     cat_dims=cat_dims,
#     device_name=device
#     )
#     model.fit(
#         X[train_idx], y[train_idx],
#         eval_set=[(X[valid_idx], y[valid_idx])],
#         max_epochs=200,
#         patience=10,
#         batch_size=512,
#         virtual_batch_size=128,
#         eval_metric=['mae'],
#         #verbose=0
#     )
#     val_preds = model.predict(X[valid_idx]).ravel()
#     fold_mae = mean_absolute_error(y[valid_idx].ravel(), val_preds)
#     cv_scores.append(fold_mae)
#     tabnet_preds.append(model.predict(X_test))
        
#     # LightGBM
#     lgb_model = lgb.LGBMRegressor(random_state=SEED)
#     lgb_model.fit(X, y.ravel())
#     lgb_preds = lgb_model.predict(X_test)

#     # GradientBoosting
#     gbr = GradientBoostingRegressor(random_state=SEED)
#     gbr.fit(X, y.ravel())
#     gbr_preds = gbr.predict(X_test)

# tabnet_final = np.mean(tabnet_preds, axis=0)
# lgb_final = np.mean(lgb_preds, axis=0)
# gbr_final = np.mean(gbr_preds, axis=0)

# # 앙상블
# final_preds = (tabnet_final.ravel() + lgb_preds + gbr_preds) / 3
# estimated_lb_score = np.mean(np.abs(final_preds - y.mean()))
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Model inputs: feature matrix, regression target and test feature matrix.
X = train[features]
y = train['성공확률']
X_test = test[features]

# Shuffled 5-fold split, seeded from the notebook-wide SEED so every source of
# randomness is controlled from one place.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Hyper-parameters shared by every fold model.
# NOTE(review): max_depth=15 without early stopping may overfit — consider tuning.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
)

cv_scores = []   # validation MAE of each fold
models = []      # fitted model of each fold (reused by the prediction cell)
test_preds = []  # test-set predictions of each fold

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")

    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    model = XGBRegressor(**xgb_params)
    model.fit(X_tr, y_tr)

    # Score the fold on its held-out part.
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    cv_scores.append(mae)
    models.append(model)

    # Keep this fold's test-set predictions for averaging.
    test_preds.append(model.predict(X_test))
    print(f"Fold {fold+1} MAE: {mae:.4f}")

print(f"\nAverage MAE across folds: {np.mean(cv_scores):.5f}")

# Average of the per-fold test predictions.
final_test_pred = np.mean(test_preds, axis=0)



In [None]:
# Average the test-set predictions of all fold models.
# Fail loudly instead of silently emitting an all-zero prediction vector when
# the training cell has not populated `models`.
if not models:
    raise RuntimeError("No trained fold models found; run the training cell first.")

test_preds = np.zeros(len(X_test))

# Divide by the number of models actually averaged (not the configured number
# of splits) so the result is a true mean even if the two ever differ.
for fold_model in models:
    test_preds += fold_model.predict(X_test) / len(models)

final_preds = test_preds

print("\n✅ Test 예측 완료 (KFold 모델 평균)")
print(final_preds[:10])

In [None]:
# Sanity check: exactly one prediction per test row.
n_test_rows, n_predictions = len(test), len(final_preds)
assert n_test_rows == n_predictions, "예측값과 테스트셋 길이가 일치하지 않습니다."

# Fill the target column of the submission template and write it to disk.
sample_submission['성공확률'] = final_preds
submission_path = './submission_xgb_kfold.csv'
sample_submission.to_csv(submission_path, index=False)

print("✅ 제출 파일 저장 완료: submission_xgb_kfold.csv")
sample_submission.head()