data load, preprocess, import

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# 데이터 로딩
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# ID 제거
train.drop(columns=['ID'], inplace=True)
test.drop(columns=['ID'], inplace=True)


결측 처리 및 인코딩

In [3]:
# 기업가치(백억원) 숫자화
# 기업가치 컬럼도 범위 문자열 처리 추가

def convert_range_to_float(value):
    if isinstance(value, str) and '-' in value:
        try:
            low, high = map(float, value.split('-'))
            return (low + high) / 2
        except:
            return np.nan
    try:
        return float(value)
    except:
        return np.nan
    
def encode_categoricals(df, cols):
    df = df.copy()
    for col in cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    return df

def fill_missing_values_v3(df, is_train=True):
    df = df.copy()

    # 범위 문자열 → 평균 숫자 처리
    for col in ['연매출(억원)', '총 투자금(억원)', '기업가치(백억원)']:
        df[col] = df[col].apply(convert_range_to_float)

    # 분야 결측 및 인코딩
    if '분야' in df.columns:
        df['분야'] = df['분야'].fillna('Unknown')
        df['분야'] = LabelEncoder().fit_transform(df['분야'])

    # 국가, 투자단계 인코딩
    df = encode_categoricals(df, ['국가', '투자단계'])

    # ✅ 결측 플래그 추가 함수
    def add_missing_flag(column):
        flag_col = f'{column}_결측'
        df[flag_col] = df[column].isnull().astype(int)

    # ✅ 피처셋 생성 함수
    def get_features(base):
        return base + (['성공확률'] if is_train else [])

    # 1. 직원 수
    if '직원 수' in df.columns:
        add_missing_flag('직원 수')
        features = get_features(['설립연도', '국가', '투자단계', '연매출(억원)', '총 투자금(억원)', 'SNS 팔로워 수(백만명)'])
        complete = df[df['직원 수'].notnull()]
        missing = df[df['직원 수'].isnull()]
        if not complete.empty and not missing.empty:
            model = GradientBoostingRegressor()
            model.fit(complete[features], complete['직원 수'])
            df.loc[df['직원 수'].isnull(), '직원 수'] = model.predict(missing[features])

    # 2. 고객 수
    if '고객수(백만명)' in df.columns:
        add_missing_flag('고객수(백만명)')
        features = get_features(['설립연도', '직원 수', '분야', '연매출(억원)', '총 투자금(억원)', 'SNS 팔로워 수(백만명)'])
        complete = df[df['고객수(백만명)'].notnull()]
        missing = df[df['고객수(백만명)'].isnull()]
        if not complete.empty and not missing.empty:
            model = GradientBoostingRegressor()
            model.fit(complete[features], complete['고객수(백만명)'])
            df.loc[df['고객수(백만명)'].isnull(), '고객수(백만명)'] = model.predict(missing[features])

    # 3. 기업가치
    if '기업가치(백억원)' in df.columns:
        add_missing_flag('기업가치(백억원)')
        features = get_features(['설립연도', '직원 수', '고객수(백만명)', '분야', '연매출(억원)', '총 투자금(억원)', 'SNS 팔로워 수(백만명)'])
        complete = df[df['기업가치(백억원)'].notnull()]
        missing = df[df['기업가치(백억원)'].isnull()]
        if not complete.empty and not missing.empty:
            model = GradientBoostingRegressor()
            model.fit(complete[features], complete['기업가치(백억원)'])
            df.loc[df['기업가치(백억원)'].isnull(), '기업가치(백억원)'] = model.predict(missing[features])

    return df

# 최종 결측치 보간 시도
train_filled = fill_missing_values_v3(train, is_train=True)
train_filled.isnull().sum()  # 모든 결측치가 잘 채워졌는지 확인

test_filled = fill_missing_values_v3(test, is_train=False)
test_filled.isnull().sum()  # 모든 결측치가 잘 채워졌는지 확인




설립연도              0
국가                0
분야                0
투자단계              0
직원 수              0
인수여부              0
상장여부              0
고객수(백만명)          0
총 투자금(억원)         0
연매출(억원)           0
SNS 팔로워 수(백만명)    0
기업가치(백억원)         0
직원 수_결측           0
고객수(백만명)_결측       0
기업가치(백억원)_결측      0
dtype: int64

이상치 처리 및 파생변수 생성

In [4]:
def process_outliers_train_test(train_df, test_df, num_cols, method='flag+clip'):
    train_processed = train_df.copy()
    test_processed = test_df.copy()
    for col in num_cols:
        Q1 = train_df[col].quantile(0.25)
        Q3 = train_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        if 'flag' in method:
            train_processed[f'{col}_이상치여부'] = ((train_df[col] < lower) | (train_df[col] > upper)).astype(int)
            test_processed[f'{col}_이상치여부'] = ((test_df[col] < lower) | (test_df[col] > upper)).astype(int)
        if 'clip' in method:
            train_processed[col] = train_df[col].clip(lower, upper)
            test_processed[col] = test_df[col].clip(lower, upper)
    return train_processed, test_processed

num_cols = train_filled.select_dtypes(include='number').columns.difference(['성공확률']).tolist()
train_processed, test_processed = process_outliers_train_test(train_filled, test_filled, num_cols)

def create_features(df):
    df = df.copy()
    df['직원 수_로그'] = np.log1p(df['직원 수'])
    df['연매출_로그'] = np.log1p(df['연매출(억원)'])
    df['총 투자금_로그'] = np.log1p(df['총 투자금(억원)'])
    df['고객수_직원비'] = df['고객수(백만명)'] / (df['직원 수'] + 1)
    df['연매출_직원비'] = df['연매출(억원)'] / (df['직원 수'] + 1)
    df['투자대비매출'] = df['연매출(억원)'] / (df['총 투자금(억원)'] + 1)
    df['SNS당고객'] = df['고객수(백만명)'] / (df['SNS 팔로워 수(백만명)'] + 1)
    df['기업가치대비투자'] = df['기업가치(백억원)'] / (df['총 투자금(억원)'] + 1)
    df['설립년차'] = 2025 - df['설립연도']
    return df

X = create_features(train_processed)
X_test = create_features(test_processed)
y = train_processed['성공확률']


피처 제거 + 파생변수 추가

In [5]:
columns_to_remove = [
    '직원 수_결측_이상치여부', '고객수(백만명)_결측', '고객수(백만명)_결측_이상치여부',
    '기업가치(백억원)_결측', '기업가치(백억원)_결측_이상치여부',
    '투자단계_이상치여부', '분야_이상치여부'
]
if '성공확률' in X.columns:
    X = X.drop(columns=['성공확률'])
X_reduced = X.drop(columns=[col for col in columns_to_remove if col in X.columns])
X_test_reduced = X_test.drop(columns=[col for col in columns_to_remove if col in X_test.columns])
X_test_reduced = X_test_reduced[X_reduced.columns]

def add_extra_features(df):
    df = df.copy()
    df['총투자_직원비'] = df['총 투자금(억원)'] / (df['직원 수_로그'] + 1)
    df['SNS당매출'] = df['연매출_로그'] / (df['SNS 팔로워 수(백만명)'] + 1)
    #df['설립년차_제곱'] = df['설립년차'] ** 2
    df['고객당가치'] = df['기업가치(백억원)'] / (df['고객수(백만명)'] + 1)
    df['연매출_기업가치비'] = df['연매출_로그'] / (df['기업가치(백억원)'] + 1)
    # 설립년차 * 투자단계
    # if '설립년차' in df.columns and '투자단계' in df.columns:
    #     df['설립X투자단계'] = df['설립년차'] * df['투자단계']
    
    # 고객수_직원비 * 연매출_직원비
    if '고객수_직원비' in df.columns and '연매출_직원비' in df.columns:
        df['고객X매출직원비'] = df['고객수_직원비'] * df['연매출_직원비']
    
    # # log(SNS 수) + log(연매출)
    # if 'SNS 팔로워 수(백만명)' in df.columns and '연매출(억원)' in df.columns:
    #     df['로그SNS'] = np.log1p(df['SNS 팔로워 수(백만명)'])
    #     df['로그매출'] = np.log1p(df['연매출(억원)'])
    #     df['로그SNS+매출'] = df['로그SNS'] + df['로그매출']
    return df

X_enhanced = add_extra_features(X_reduced)
X_test_enhanced = add_extra_features(X_test_reduced)


In [None]:
# import optuna
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05, log=True),
#         'max_depth': trial.suggest_int('max_depth', 5, 20),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
#         'random_state': 42,
#         'n_jobs': -1
#     }

#     model = XGBRegressor(**params)
#     scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
#     return -scores.mean()

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)

# print("Best MAE:", study.best_value)
# print("Best parameters:", study.best_params)

[I 2025-05-01 13:59:12,413] A new study created in memory with name: no-name-18f29a7b-8d85-4824-87cb-f3acb07b2eb3
[W 2025-05-01 13:59:12,492] Trial 0 failed with parameters: {'n_estimators': 559, 'learning_rate': 0.004491255333070179, 'max_depth': 10, 'subsample': 0.8903363528873041, 'colsample_bytree': 0.9740530745125215, 'reg_alpha': 3.887784360494307, 'reg_lambda': 8.003240366717359} because of the following error: ValueError('\nAll the 3 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n3 fits failed with the following error:\nTraceback (most recent call last):\n  File "c:\\Users\\human\\.conda\\envs\\dacon\\Lib\\site-packages\\xgboost\\data.py", line 407, in pandas_feature_info\n    new_feature_types.append(_pandas_dtype_mapper[dtype.name])\n                             ~~~

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
                             ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
           ^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
                                              ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\core.py", line 620, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
                                   ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
  File "c:\Users\human\.conda\envs\dacon\Lib\site-packages\xgboost\data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:인수여부: object, 상장여부: object


모델 학습 및 예측

In [10]:
# 이진형 인코딩
for col in ['인수여부', '상장여부']:
    if col in X_enhanced.columns:
        X_enhanced[col] = X_enhanced[col].map({'No': 0, 'Yes': 1})
    if col in X_test_enhanced.columns:
        X_test_enhanced[col] = X_test_enhanced[col].map({'No': 0, 'Yes': 1})

n_repeats = 3
n_splits = 5
bins = np.linspace(0, 1, 6)
y_binned = np.digitize(y, bins)

#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

cv_scores = []
test_preds = []

# for fold, (train_idx, val_idx) in enumerate(skf.split(X_enhanced, y_binned)):
# for fold, (train_idx, val_idx) in enumerate(rkf.split(X_enhanced, y)):
for repeat in range(n_repeats):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42 + repeat)
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_enhanced, y_binned)):
        print(f"\nFold {fold+1}")
        X_tr, y_tr = X_enhanced.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X_enhanced.iloc[val_idx], y.iloc[val_idx]

        model = XGBRegressor(
        n_estimators=1325,
        learning_rate=0.00375,
        max_depth=15,
        subsample=0.58315,
        colsample_bytree=0.75715,
        random_state=42,
        n_jobs=-1
    )

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        print(f"✅ Fold {fold+1} MAE: {mae:.5f}")
        cv_scores.append(mae)

        test_preds.append(model.predict(X_test_enhanced))

print("\n📉 평균 MAE:", np.mean(cv_scores))

final_preds = np.mean(test_preds, axis=0)



Fold 1
✅ Fold 1 MAE: 0.19701

Fold 2
✅ Fold 2 MAE: 0.19584

Fold 3


KeyboardInterrupt: 

제출파일

In [None]:
sample_submission['성공확률'] = final_preds
sample_submission.to_csv('클린코드1차.csv', index=False, encoding='utf-8-sig')
print("✅ 최종 제출 파일 저장 완료")
print(sample_submission.head)

✅ 최종 제출 파일 저장 완료
<bound method NDFrame.head of              ID      성공확률
0     TEST_0000  0.495430
1     TEST_0001  0.476088
2     TEST_0002  0.423867
3     TEST_0003  0.518815
4     TEST_0004  0.641490
...         ...       ...
1750  TEST_1750  0.534823
1751  TEST_1751  0.566253
1752  TEST_1752  0.487948
1753  TEST_1753  0.476240
1754  TEST_1754  0.524418

[1755 rows x 2 columns]>


해석1차 - 폐기

In [None]:
def add_interaction_features(df):
    df = df.copy()
    
    # 설립년차 * 투자단계
    if '설립년차' in df.columns and '투자단계' in df.columns:
        df['설립X투자단계'] = df['설립년차'] * df['투자단계']
    
    # 고객수_직원비 * 연매출_직원비
    if '고객수_직원비' in df.columns and '연매출_직원비' in df.columns:
        df['고객X매출직원비'] = df['고객수_직원비'] * df['연매출_직원비']
    
    # log(SNS 수) + log(연매출)
    if 'SNS 팔로워 수(백만명)' in df.columns and '연매출(억원)' in df.columns:
        df['로그SNS'] = np.log1p(df['SNS 팔로워 수(백만명)'])
        df['로그매출'] = np.log1p(df['연매출(억원)'])
        df['로그SNS+매출'] = df['로그SNS'] + df['로그매출']
    
    return df

X_enhanced = add_interaction_features(X_enhanced)
X_test_enhanced = add_interaction_features(X_test_enhanced)



회사랑 국가별?

In [None]:
# train 기준으로 국가별 평균 성공률 구하기
country_success_mean = train_filled.groupby('국가')['성공확률'].mean()

# train/test 모두에 국가 평균 성공률 피처로 추가
X_enhanced['국가_성공률평균'] = X_enhanced['국가'].map(country_success_mean)
X_test_enhanced['국가_성공률평균'] = X_test_enhanced['국가'].map(country_success_mean)

In [None]:
bins = np.linspace(0, 1, 6)
y_binned = np.digitize(y, bins)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
test_preds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_enhanced, y_binned)):
    print(f"\nFold {fold+1}")
    X_tr, y_tr = X_enhanced.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_enhanced.iloc[val_idx], y.iloc[val_idx]

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=15,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"✅ Fold {fold+1} MAE: {mae:.5f}")
    cv_scores.append(mae)

    test_preds.append(model.predict(X_test_enhanced))

print("\n📉 평균 MAE:", np.mean(cv_scores))

final_preds = np.mean(test_preds, axis=0)

NameError: name 'np' is not defined