In [18]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor
from category_encoders import TargetEncoder
from sklearn.metrics import make_scorer
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from scipy.stats import randint
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv('./train.csv')
test  = pd.read_csv('./test.csv')

In [3]:
# 파생변수 만들기 

## 숫자형 컬럼을 명시적으로 float 타입으로 변환
numeric_cols = ['연매출(억원)', '총 투자금(억원)', '기업가치(백억원)']
for col in numeric_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce').astype(float)
    test[col]  = pd.to_numeric(test[col], errors='coerce').astype(float)

# 회사_나이
train['회사_나이']  = 2025 - train['설립연도']
test ['회사_나이'] = 2025 - test ['설립연도']

# 투자효율 (연매출 ÷ (총 투자금 + 1))
train['투자효율']   = train['연매출(억원)'] / (train['총 투자금(억원)'] + 1)
test ['투자효율']   = test ['연매출(억원)']  / (test ['총 투자금(억원)'] + 1)

# 고객당 매출 (연매출 ÷ (고객수 + 1))
train['고객당_매출'] = train['연매출(억원)'] / (train['고객수(백만명)'] + 1)
test ['고객당_매출'] = test ['연매출(억원)']  / (test ['고객수(백만명)'] + 1)

# 원당 매출 (연매출 ÷ (직원 수 + 1))
train['직원당_매출'] = train['연매출(억원)'] / (train['직원 수'] + 1)
test ['직원당_매출'] = test ['연매출(억원)']  / (test ['직원 수'] + 1)

# 로그 변환 (왜곡 안정화)
for col in numeric_cols:
    train[f'log_{col}'] = np.log1p(train[col].fillna(0.0))
    test [f'log_{col}'] = np.log1p(test [col].fillna(0.0))

# 생성된 피처 확인
engineered = [
    '회사_나이', '투자효율', '고객당_매출', '직원당_매출',
    'log_연매출(억원)', 'log_총 투자금(억원)', 'log_기업가치(백억원)'
]
print(train[engineered].head())

   회사_나이       투자효율      고객당_매출    직원당_매출  log_연매출(억원)  log_총 투자금(억원)  \
0     16   1.415330   83.578947  1.154349     8.469053       8.121480   
1      2   0.068550    3.444444  0.066939     5.634790       8.311398   
2      7   1.881159  220.745455  3.875199     9.404426       8.772455   
3      9  15.836336         NaN  3.249230     9.263692       6.501290   
4      5  11.819277  103.263158  4.979695     9.191259       6.721426   

   log_기업가치(백억원)  
0            0.0  
1            0.0  
2            0.0  
3            0.0  
4            0.0  


In [4]:
# 가중치 변경

# 빈도 기반 샘플 가중치 생성
counts = train['성공확률'].value_counts()
train['weight'] = train['성공확률'].map(lambda y: 1/counts[y])
train['weight'] /= train['weight'].mean()

# 수치형 컬럼명 정의 & 결측 플래그 파생
numeric_cols = train.select_dtypes(include=['int64','float64']) \
                    .columns.drop(['성공확률','weight'])
for col in numeric_cols:
    train[f'is_na_{col}'] = train[col].isna().astype(int)
    test [f'is_na_{col}'] = test [col].isna().astype(int)

# 범주형 NaN → 'Missing'
cat_cols = train.select_dtypes(include=['object']) \
                .columns.drop('ID', errors='ignore')
train[cat_cols] = train[cat_cols].fillna('Missing')
test [cat_cols] = test [cat_cols].fillna('Missing')

# 평가 함수 정의
def weighted_mae(y_true, y_pred, w):
    return np.sum(w * np.abs(y_true - y_pred)) / np.sum(w)

# Imputer 비교 설정 (Median vs KNN)
imputers = {
    'median': SimpleImputer(strategy='median'),
    'knn'   : KNNImputer(n_neighbors=5)
}

features = (
    list(numeric_cols)
  + [f'is_na_{c}' for c in numeric_cols]
  + list(cat_cols)
)

results = []

for name, imp in imputers.items():
    # 전처리 파이프라인
    preprocessor = ColumnTransformer([
        ('num',  imp,                           numeric_cols.tolist()),
        ('flag', SimpleImputer(fill_value=0),  [f'is_na_{c}' for c in numeric_cols]),
        ('cat',  OneHotEncoder(handle_unknown='ignore'), cat_cols.tolist())
    ])
    pipe = Pipeline([
        ('prep', preprocessor),
        ('rf',   RandomForestRegressor(
                     n_estimators=100,
                     max_depth=6,
                     random_state=42
                 ))
    ])
    
    # 5-Fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    X = train[features]
    y = train['성공확률']
    w = train['weight']
    
    for tr_ix, va_ix in kf.split(X):
        pipe.fit(
            X.iloc[tr_ix], y.iloc[tr_ix],
            rf__sample_weight=w.iloc[tr_ix]
        )
        preds = pipe.predict(X.iloc[va_ix])
        cv_scores.append(weighted_mae(y.iloc[va_ix], preds, w.iloc[va_ix]))
    
    results.append((name, np.mean(cv_scores)))

#결과 출력
results_df = pd.DataFrame(results, columns=['Imputer','Mean CV wMAE'])
print(results_df)



  Imputer  Mean CV wMAE
0  median      0.211957
1     knn      0.212027


In [5]:
# 범주형 변수 target encoding

# 타깃(Y)과 샘플 가중치
y = train['성공확률'].copy()
counts = y.value_counts()
train['weight'] = y.map(lambda v: 1/counts[v])
train['weight'] /= train['weight'].mean()

# 인코딩할 컬럼 지정
cat_cols = ['분야', '국가', '투자단계']  # High-cardinality 범주 세 개

# CV-aware TargetEncoder 초기화
kf = KFold(n_splits=5, shuffle=True, random_state=42)
te = TargetEncoder(cols=cat_cols, smoothing=0.3)  

# 빈 데이터프레임에 컬럼 생성
for col in cat_cols:
    train[f'{col}_TE'] = np.nan

# Fold별로 fit→transform
for tr_idx, va_idx in kf.split(train):
    te.fit(train.iloc[tr_idx][cat_cols], y.iloc[tr_idx])
    train.iloc[va_idx, train.columns.get_indexer([f'{col}_TE' for col in cat_cols])] = \
        te.transform(train.iloc[va_idx][cat_cols])

# Test 셋에도 전체 데이터로 인코딩 적용
te.fit(train[cat_cols], y)
test_te = te.transform(test[cat_cols])
for col in cat_cols:
    test[f'{col}_TE'] = test_te[col]

# 결과 확인
engineered_te = [f'{col}_TE' for col in cat_cols]
print(train[engineered_te].head())
print(test[engineered_te].head())

      분야_TE     국가_TE   투자단계_TE
0  0.527132  0.535475  0.539092
1  0.561887  0.539879  0.542075
2  0.534967  0.536494  0.543516
3  0.554118  0.561027  0.546581
4  0.536082  0.521348  0.554082
      분야_TE     국가_TE   투자단계_TE
0  0.567151  0.533333  0.538425
1  0.533731  0.531303  0.538425
2  0.539011  0.548276  0.536141
3  0.529545  0.531303  0.546171
4  0.567151  0.533333  0.546171


In [6]:
# model 풀 확장 : catboost, xgboost
# 모델 정의
models = {
    'CatBoost': CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        eval_metric='MAE',
        random_seed=42,
        verbose=False
    ),
    'XGBoost': XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        objective='reg:squarederror',
        random_state=42
    )
}

# 5-Fold CV 수행
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, mdl in models.items():
    cv_scores = []
    for tr_idx, va_idx in kf.split(train):
        X_tr = train.iloc[tr_idx][features]
        y_tr = train.iloc[tr_idx]['성공확률']
        w_tr = train.iloc[tr_idx]['weight']
        
        X_va = train.iloc[va_idx][features]
        y_va = train.iloc[va_idx]['성공확률']
        w_va = train.iloc[va_idx]['weight']
        
        # 파이프라인은 이미 preprocessor 변수로 정의되어있음
        pipe = Pipeline([
            ('prep', preprocessor),
            (name, mdl)
        ])
        
        # sample_weight 파라미터는 모델마다 이름이 다르므로
        sw_param = {f'{name}__sample_weight': w_tr.values}
        pipe.fit(X_tr, y_tr, **sw_param)
        
        preds = pipe.predict(X_va)
        # weighted_mae 함수도 이미 정의되어있으니까
        cv_scores.append(weighted_mae(y_va.values, preds, w_va.values))
    
    results[name] = np.mean(cv_scores)

# 결과 출력
for name, score in results.items():
    print(f'{name} Mean CV Weighted MAE: {score:.4f}')

CatBoost Mean CV Weighted MAE: 0.2129
XGBoost Mean CV Weighted MAE: 0.2144


In [7]:
# 하이퍼파라미터 튜닝

# 파이프라인
pipe = Pipeline([
    ('prep', preprocessor),
    ('rf',   RandomForestRegressor(random_state=42))
])

# 파라미터 분포
param_dist = {
    'rf__n_estimators': randint(100, 1001),
    'rf__max_depth': randint(3, 21),
    'rf__min_samples_split': randint(2, 11),
    'rf__min_samples_leaf': randint(1, 11)
}

# RandomizedSearchCV 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=30,
    scoring=lambda est, X, y: -weighted_mae(y, est.predict(X), train.loc[X.index, 'weight']),
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# 실행
X = train[features]
y = train['성공확률']
random_search.fit(X, y)

# 결과 출력
print("Best params:", random_search.best_params_)
print("Best CV wMAE:", -random_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best params: {'rf__max_depth': 17, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 2, 'rf__n_estimators': 104}
Best CV wMAE: 0.20930381817374402


In [26]:
# stacking, blending 프로토타입
X = train[features]
y = train['성공확률']
w = train['weight']

# 1) Base 모델 풀 정의
base_models = {
    'rf' : RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42),
    'hgb': HistGradientBoostingRegressor(max_iter=300, random_state=42)
}

# 2) OOF(preds) 및 테스트(preds) 저장 공간
oof_preds  = pd.DataFrame(index=train.index)
test_preds = pd.DataFrame(index=test.index)

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for name, model in base_models.items():
    oof      = np.zeros(len(train))
    test_fold = np.zeros((len(test), n_folds))
    
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X)):
        pipe = Pipeline([
            ('prep', preprocessor),
            (name, model)
        ])
        # 학습
        pipe.fit(
            X.iloc[tr_idx], y.iloc[tr_idx],
            **{f'{name}__sample_weight': w.iloc[tr_idx].values}
        )
        # OOF 예측
        oof[va_idx] = pipe.predict(X.iloc[va_idx])
        # 테스트 폴드 예측
        test_fold[:, fold] = pipe.predict(test[features])
    
    oof_preds[name]  = oof
    test_preds[name] = test_fold.mean(axis=1)

# 3) 메타 모델 학습 (Linear Regression)
meta = LinearRegression()
meta.fit(oof_preds, y, sample_weight=w.values)

# 4) Stacking CV 성능
stack_oof   = meta.predict(oof_preds)
stack_wmae  = weighted_mae(y.values, stack_oof, w.values)
print("Stacking CV Weighted MAE:", stack_wmae)

# 5) Weighted Blending CV 성능 (Meta 계수 기반)
coef = np.clip(meta.coef_, 0, None)
blend_weights = coef / coef.sum()

blend_oof  = oof_preds.dot(blend_weights)
blend_wmae = weighted_mae(y.values, blend_oof, w.values)
print("Weighted Blending CV Weighted MAE:", blend_wmae)

# 6) 최종 예측 & 제출
blend_test_pred = test_preds.dot(blend_weights)
blend_test_pred = np.clip(blend_test_pred, 0.1, 0.9)

submission = pd.DataFrame({
    'ID': test['ID'],
    '성공확률': blend_test_pred
})
submission.to_csv('submission_2.csv', index=False)
print("submission_2.csv 생성 완료")

Stacking CV Weighted MAE: 0.2099987875344944
Weighted Blending CV Weighted MAE: 0.2149369588099207
submission_2.csv 생성 완료
