In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

# Weighted MAE (uniform weights are used when none are supplied)
def weighted_mae(y_true, y_pred, weights=None):
    """Return the weighted mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions. When the two have the same number of
        elements but different shapes (e.g. ``(n, 1)`` vs ``(n,)``),
        ``y_pred`` is reshaped to ``y_true``'s shape so the subtraction is
        element-wise instead of broadcasting to an ``(n, n)`` matrix.
    weights : array-like, optional
        Per-element weights. ``None`` means every element has weight 1.
        Weights with the same number of elements as ``y_true`` but a
        different shape are reshaped the same way; any other shape is left
        to NumPy broadcasting.

    Returns
    -------
    numpy floating scalar
        ``sum(weights * |y_true - y_pred|) / sum(weights)``.
    """
    # Converting up front lets plain lists and pandas objects be used, and
    # avoids index alignment when pandas Series are passed in.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_pred.shape != y_true.shape and y_pred.size == y_true.size:
        y_pred = y_pred.reshape(y_true.shape)

    if weights is None:
        weights = np.ones_like(y_true)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != y_true.shape and weights.size == y_true.size:
            weights = weights.reshape(y_true.shape)

    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

In [None]:
pip install pytorch_tabnet

In [None]:
import kagglehub

# Dataset location. The files are read from a directory under /kaggle/input;
# the kagglehub download below is kept for reference only and is disabled.
# path = kagglehub.dataset_download("kaiyoo88/fake-real-estate")

path = "/kaggle/input/open1231231233"
print("Path to dataset files:", path)

In [None]:
import os
import pandas as pd

# Directory that holds the competition CSV files
dataset_path = "/kaggle/input/open1231231233"

# Read the three CSV files from the dataset directory, in a fixed order
train, test, sample_submission = (
    pd.read_csv(os.path.join(dataset_path, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Print each frame's shape as a quick sanity check
for label, frame in (("Train", train), ("Test", test), ("Submission", sample_submission)):
    print(f"{label} shape:", frame.shape)

In [None]:
# Remove the 'ID' column from both frames; errors='ignore' makes the drop a
# no-op when the column is not present.
train = train.drop(columns=['ID'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')


In [None]:
# Print the column labels of `train` to check the names used in later cells.
print(train.columns)

In [None]:
def add_derived_features(df, reference_year=2025):
    """Return a copy of `df` with a numeric founding year and three derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '설립연도', '연매출(억원)', '총 투자금(억원)' and '직원 수'.
    reference_year : int, default 2025
        Year from which the founding year is subtracted to obtain '업력'.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which '설립연도' is numeric (unparseable values become
        NaN) and '업력', '매출투자비율', '직원당매출' are appended in that order.
    """
    out = df.copy()

    # Convert the founding year to a number; values that cannot be parsed become NaN
    out['설립연도'] = pd.to_numeric(out['설립연도'], errors='coerce')

    # 1. Company age: reference_year - founding year
    out['업력'] = reference_year - out['설립연도']

    # 2. Revenue-to-investment ratio: annual revenue / total investment
    out['매출투자비율'] = out['연매출(억원)'] / out['총 투자금(억원)']

    # 3. Revenue per employee: annual revenue / number of employees
    out['직원당매출'] = out['연매출(억원)'] / out['직원 수']

    # A zero denominator produces +/-inf. Turn those into NaN so they are
    # treated as missing values; KNNImputer accepts NaN but rejects infinity.
    ratio_columns = ['매출투자비율', '직원당매출']
    out[ratio_columns] = out[ratio_columns].replace([np.inf, -np.inf], np.nan)
    return out


# Apply the same feature engineering to both splits
train = add_derived_features(train)
test = add_derived_features(test)


In [None]:
# ---------------------------
# Define Feature Groups
# ---------------------------
# Categorical features: the original company-valuation column is used as-is,
# without a 'convert_기업가치' step.
category_features = [
    '설립연도',
    '국가',
    '분야',
    '투자단계',
    '기업가치(백억원)',
]

# Numeric features: columns taken directly from the data ...
raw_numeric_features = [
    '직원 수',
    '고객수(백만명)',
    '총 투자금(억원)',
    '연매출(억원)',
    'SNS 팔로워 수(백만명)',
]
# ... followed by the derived columns
derived_numeric_features = ['매출투자비율', '직원당매출', '업력']
numeric_features = raw_numeric_features + derived_numeric_features

# Boolean features: acquisition flag and listing flag
bool_features = [
    '인수여부',
    '상장여부',
]


In [None]:
# Data Preprocessing: Categorical Variables
# ---------------------------
def _as_category_strings(series):
    """Return `series` as strings, with missing entries replaced by 'Missing'."""
    # Fill BEFORE casting: astype(str) turns NaN into the literal text 'nan',
    # after which fillna('Missing') has nothing left to fill.
    return series.astype(object).where(series.notna(), 'Missing').astype(str)


# Every categorical feature becomes a string column with missing values as 'Missing'
for feature in category_features:
    train[feature] = _as_category_strings(train[feature])
    test[feature] = _as_category_strings(test[feature])

# Label-encode the categorical features; encoders are fitted on train only
encoders = {}
for feature in category_features:
    encoder = LabelEncoder().fit(train[feature])
    known_labels = set(encoder.classes_)

    # LabelEncoder.transform raises on labels it was not fitted on. Map such
    # test values to a label the encoder knows: the 'Missing' bucket when train
    # has one, otherwise the most frequent train label. Every test code then
    # stays inside the range of codes observed in train.
    unseen_mask = ~test[feature].isin(known_labels)
    if unseen_mask.any():
        if 'Missing' in known_labels:
            fallback_label = 'Missing'
        else:
            fallback_label = train[feature].mode().iloc[0]
        print(f"[{feature}] {int(unseen_mask.sum())} unseen test value(s) mapped to '{fallback_label}'")
        test.loc[unseen_mask, feature] = fallback_label

    train[feature] = encoder.transform(train[feature])
    test[feature] = encoder.transform(test[feature])
    encoders[feature] = encoder

# ---------------------------
# Data Preprocessing: Boolean Variables
# ---------------------------
# 'Yes' -> 1 and 'No' -> 0; any value that is not a key of the mapping becomes NaN.
bool_map = {'Yes': 1, 'No': 0}
for frame in (train, test):
    for feature in bool_features:
        frame[feature] = frame[feature].map(bool_map)

# ---------------------------
# Data Preprocessing: fill missing numeric and boolean values with KNNImputer (n_neighbors=5)
# ---------------------------
impute_features = [*numeric_features, *bool_features]

# Fit on the training rows only, then apply the fitted imputer to both splits.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train[impute_features])
train[impute_features] = imputer.transform(train[impute_features])
test[impute_features] = imputer.transform(test[impute_features])

In [None]:
# Define Final Feature Set & Target
# ---------------------------
# The target is the '성공확률' column; every other column of `train` is a feature.
target = train['성공확률']
features = train.columns[train.columns != '성공확률'].tolist()

X, y = train[features], target

In [None]:
# TabNet: categorical feature configuration
# ---------------------------
# The categorical columns are described to TabNet separately, through two
# lists that follow the order of `category_features`.
# Position of each categorical column inside the `features` list.
cat_idxs = list(map(features.index, category_features))
# Size of each categorical column: largest label-encoded value in train + 1.
cat_dims = train[category_features].max().add(1).astype(int).tolist()

In [None]:
# ---------------------------
# KFold cross-validation: TabNet pretraining followed by supervised fine-tuning
# ---------------------------
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Arguments shared by the pretrainer and the regressor, kept in one place so
# the two stages cannot drift apart.
tabnet_shared_init = dict(cat_idxs=cat_idxs, cat_dims=cat_dims, seed=42)
tabnet_shared_fit = dict(max_epochs=100, batch_size=512, virtual_batch_size=64)

models, cv_scores = [], []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")

    # NumPy views of this fold; the targets are reshaped into (n, 1) columns.
    X_train, X_valid = X.iloc[train_idx].values, X.iloc[valid_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    # Stage 1: unsupervised pretraining on the training rows of this fold
    print("▶ Pretraining...")
    pretrainer = TabNetPretrainer(**tabnet_shared_init)
    pretrainer.fit(X_train=X_train, **tabnet_shared_fit)

    # Stage 2: supervised regressor initialised from the pretrained model
    print("▶ Fine-tuning...")
    model = TabNetRegressor(optimizer_fn=torch.optim.AdamW, **tabnet_shared_init)
    model.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        patience=10,
        **tabnet_shared_fit,
    )

    # Score the held-out rows and keep the fitted model for test-time averaging
    preds = model.predict(X_valid)
    score = weighted_mae(y_valid, preds)
    print(f"Fold {fold+1} Weighted MAE: {score:.4f}")
    models.append(model)
    cv_scores.append(score)

print("\n✅ All folds completed!")
print("Average Weighted MAE:", np.mean(cv_scores))

In [None]:
# Prediction on Test Set using TabNet models
# ---------------------------
# Without any fold model the mean below would be NaN and the whole submission
# column would silently be filled with NaN, so fail loudly instead.
if not models:
    raise RuntimeError("No trained models found; run the cross-validation cell first.")

# Build the test matrix once; it is the same for every fold model.
X_test = test[features].values

predictions_list = []
for fold, model in enumerate(models):
    print(f"Predicting with fold {fold+1}...")
    preds = model.predict(X_test)
    predictions_list.append(preds)

# Average the fold predictions, then flatten so a 1-D array is assigned to the
# submission column whether predict() returns shape (n,) or (n, 1).
final_predictions = np.mean(predictions_list, axis=0).ravel()

# ---------------------------
# Create Submission File
# ---------------------------
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('baseline_submission.csv', index=False, encoding='utf-8-sig')
print("Submission file saved as 'baseline_submission.csv'.")