In [18]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드 함수
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 함수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    # ID 제거
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])
    else:
        train_id, test_id = None, None

    # 숫자 / 범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    # target_col 제외
    if target_col in num_cols:
        num_cols.remove(target_col)
    if target_col in cat_cols:
        cat_cols.remove(target_col)

    # 결측치 처리: 숫자는 평균, 범주형은 최빈값
    for col in num_cols:
        train[col] = train[col].fillna(train[col].mean())
        test[col] = test[col].fillna(train[col].mean())
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 함수 (KFold + LGBM)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "seed": seed,
            "verbose": -1
        }

        model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        num_boost_round=10000,
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=200)
        ]
    )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장 함수
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0  # 컬럼 없으면 생성

    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission.csv", index=False)
    print("✅ submission.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.12474	valid_1's l1: 0.202508
[400]	training's l1: 0.0746857	valid_1's l1: 0.186713
[600]	training's l1: 0.0484242	valid_1's l1: 0.178418
[800]	training's l1: 0.0328702	valid_1's l1: 0.173155
[1000]	training's l1: 0.0231374	valid_1's l1: 0.17003
[1200]	training's l1: 0.0170938	valid_1's l1: 0.167826
[1400]	training's l1: 0.0129644	valid_1's l1: 0.166492
[1600]	training's l1: 0.00998153	valid_1's l1: 0.165687
[1800]	training's l1: 0.00780908	valid_1's l1: 0.165174
[2000]	training's l1: 0.00626257	valid_1's l1: 0.164797
[2200]	training's l1: 0.00511298	valid_1's l1: 0.16453
[2400]	training's l1: 0.00423438	valid_1's l1: 0.164344
[2600]	training's l1: 0.00352876	valid_1's l1: 0.164225
[2800]	training's l1: 0.00298945	valid_1's l1: 0.164111
[3000]	training's l1: 0.00253804	valid_1's l1: 0.164034
[3200]	training's l1: 0.00217207	v

In [22]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드 함수
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 함수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    # ID 제거
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 숫자 / 범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    # target_col 제외
    if target_col in num_cols:
        num_cols.remove(target_col)
    if target_col in cat_cols:
        cat_cols.remove(target_col)

    # 결측치 처리
    for col in num_cols:
        train[col] = train[col].fillna(train[col].mean())
        test[col] = test[col].fillna(train[col].mean())
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 함수 (KFold + LGBM)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "seed": seed,
            "verbose": -1
        }

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장 함수
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0

    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission2.csv", index=False, encoding="utf-8")
    print("✅ submission2.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.12474	valid_1's l1: 0.202508
[400]	training's l1: 0.0746857	valid_1's l1: 0.186713
[600]	training's l1: 0.0484242	valid_1's l1: 0.178418
[800]	training's l1: 0.0328702	valid_1's l1: 0.173155
[1000]	training's l1: 0.0231374	valid_1's l1: 0.17003
[1200]	training's l1: 0.0170938	valid_1's l1: 0.167826
[1400]	training's l1: 0.0129644	valid_1's l1: 0.166492
[1600]	training's l1: 0.00998153	valid_1's l1: 0.165687
[1800]	training's l1: 0.00780908	valid_1's l1: 0.165174
[2000]	training's l1: 0.00626257	valid_1's l1: 0.164797
[2200]	training's l1: 0.00511298	valid_1's l1: 0.16453
[2400]	training's l1: 0.00423438	valid_1's l1: 0.164344
[2600]	training's l1: 0.00352876	valid_1's l1: 0.164225
[2800]	training's l1: 0.00298945	valid_1's l1: 0.164111
[3000]	training's l1: 0.00253804	valid_1's l1: 0.164034
[3200]	training's l1: 0.00217207	v

In [24]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 결측치 처리
    for col in num_cols:
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(train[col].mean(), inplace=True)
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col].fillna(mode, inplace=True)
        test[col].fillna(mode, inplace=True)

    # 범주형 원-핫
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. LightGBM 학습 및 예측
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    params = {
        "objective": "regression",
        "metric": "mae",
        "learning_rate": 0.03,     # 낮춰서 안정적 학습
        "num_leaves": 64,
        "max_depth": 7,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": seed,
        "verbose": -1
    }

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=200),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission.csv", index=False, encoding="utf-8")
    print("✅ submission.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.174952	valid_1's l1: 0.217251


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

[400]	training's l1: 0.134041	valid_1's l1: 0.20606
[600]	training's l1: 0.103838	valid_1's l1: 0.197413
[800]	training's l1: 0.0801091	valid_1's l1: 0.189885
[1000]	training's l1: 0.0638253	valid_1's l1: 0.184591
[1200]	training's l1: 0.0517832	valid_1's l1: 0.180545
[1400]	training's l1: 0.0422522	valid_1's l1: 0.177167
[1600]	training's l1: 0.0346805	valid_1's l1: 0.174563
[1800]	training's l1: 0.0287087	valid_1's l1: 0.172774
[2000]	training's l1: 0.0238217	valid_1's l1: 0.171057
[2200]	training's l1: 0.0200541	valid_1's l1: 0.16979
[2400]	training's l1: 0.0170706	valid_1's l1: 0.168741
[2600]	training's l1: 0.0146074	valid_1's l1: 0.167854
[2800]	training's l1: 0.0126327	valid_1's l1: 0.167179
[3000]	training's l1: 0.0109953	valid_1's l1: 0.166563
[3200]	training's l1: 0.00960318	valid_1's l1: 0.166158
[3400]	training's l1: 0.0084702	valid_1's l1: 0.165785
[3600]	training's l1: 0.00749994	valid_1's l1: 0.165531
[3800]	training's l1: 0.00666955	valid_1's l1: 0.165334
[4000]	trainin

In [27]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 결측치 처리
    for col in num_cols:
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(train[col].mean(), inplace=True)
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col].fillna(mode, inplace=True)
        test[col].fillna(mode, inplace=True)

    # 범주형 원-핫
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. LightGBM 학습 및 예측
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    params = {
        "objective": "regression",
        "metric": "mae",
        "learning_rate": 0.03,     # 낮춰서 안정적 학습
        "num_leaves": 64,
        "max_depth": 7,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": seed,
        "verbose": -1
    }

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=200),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission3.csv", index=False, encoding="utf-8")
    print("✅ submission3.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

[200]	training's l1: 0.174952	valid_1's l1: 0.217251
[400]	training's l1: 0.134041	valid_1's l1: 0.20606
[600]	training's l1: 0.103838	valid_1's l1: 0.197413
[800]	training's l1: 0.0801091	valid_1's l1: 0.189885
[1000]	training's l1: 0.0638253	valid_1's l1: 0.184591
[1200]	training's l1: 0.0517832	valid_1's l1: 0.180545
[1400]	training's l1: 0.0422522	valid_1's l1: 0.177167
[1600]	training's l1: 0.0346805	valid_1's l1: 0.174563
[1800]	training's l1: 0.0287087	valid_1's l1: 0.172774
[2000]	training's l1: 0.0238217	valid_1's l1: 0.171057
[2200]	training's l1: 0.0200541	valid_1's l1: 0.16979
[2400]	training's l1: 0.0170706	valid_1's l1: 0.168741
[2600]	training's l1: 0.0146074	valid_1's l1: 0.167854
[2800]	training's l1: 0.0126327	valid_1's l1: 0.167179
[3000]	training's l1: 0.0109953	valid_1's l1: 0.166563
[3200]	training's l1: 0.00960318	valid_1's l1: 0.166158
[3400]	training's l1: 0.0084702	valid_1's l1: 0.165785
[3600]	training's l1: 0.00749994	valid_1's l1: 0.165531
[3800]	training's

In [28]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + Feature Engineering
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 결측치 처리
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    for col in num_cols:
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(train[col].mean(), inplace=True)
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col].fillna(mode, inplace=True)
        test[col].fillna(mode, inplace=True)

    # Feature Engineering
    # BMI
    if "weight" in train.columns and "height" in train.columns:
        train["BMI"] = train["weight"] / ((train["height"]/100)**2)
        test["BMI"] = test["weight"] / ((test["height"]/100)**2)
    # 혈압 비율
    if "systolic_blood_pressure" in train.columns and "diastolic_blood_pressure" in train.columns:
        train["bp_ratio"] = train["systolic_blood_pressure"] / train["diastolic_blood_pressure"]
        test["bp_ratio"] = test["systolic_blood_pressure"] / test["diastolic_blood_pressure"]

    # 범주형 원-핫
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)

    # 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. LGBM 학습
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    params = {
        "objective": "regression",
        "metric": "mae",
        "learning_rate": 0.02,
        "num_leaves": 128,
        "max_depth": 8,
        "min_data_in_leaf": 15,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.85,
        "bagging_freq": 1,
        "lambda_l1": 0.5,
        "lambda_l2": 0.5,
        "seed": seed,
        "verbose": -1
    }

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=20000,
            callbacks=[
                early_stopping(stopping_rounds=300),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission4.csv", index=False, encoding="utf-8")
    print("✅ submission4.csv 저장 완료")

# ------------------------------
# 6. 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 300 rounds


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

[200]	training's l1: 0.178438	valid_1's l1: 0.218953
[400]	training's l1: 0.13092	valid_1's l1: 0.205418
[600]	training's l1: 0.0997589	valid_1's l1: 0.195767
[800]	training's l1: 0.076828	valid_1's l1: 0.188041
[1000]	training's l1: 0.0617598	valid_1's l1: 0.183541
[1200]	training's l1: 0.0515541	valid_1's l1: 0.180131
[1400]	training's l1: 0.0445498	valid_1's l1: 0.177921
[1600]	training's l1: 0.0398258	valid_1's l1: 0.176447
[1800]	training's l1: 0.03637	valid_1's l1: 0.175484
[2000]	training's l1: 0.0338195	valid_1's l1: 0.174686
[2200]	training's l1: 0.0317818	valid_1's l1: 0.174079
[2400]	training's l1: 0.0302	valid_1's l1: 0.173575
[2600]	training's l1: 0.0289386	valid_1's l1: 0.173139
[2800]	training's l1: 0.027865	valid_1's l1: 0.172787
[3000]	training's l1: 0.0269673	valid_1's l1: 0.17252
[3200]	training's l1: 0.0261694	valid_1's l1: 0.17223
[3400]	training's l1: 0.0254707	valid_1's l1: 0.172001
[3600]	training's l1: 0.0248345	valid_1's l1: 0.171786
[3800]	training's l1: 0.02

In [30]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    # ID 분리
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 숫자 / 범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 수치형 컬럼 문자열 → 숫자형 변환
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 결측치 처리
    for col in num_cols:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)
    for col in cat_cols:
        mode_val = train[col].mode()[0]
        train[col] = train[col].fillna(mode_val)
        test[col]  = test[col].fillna(mode_val)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test  = pd.get_dummies(test,  columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)

        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "seed": seed,
            "verbose": -1
        }

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_final5.csv", index=False, encoding="utf-8")
    print("✅ submission_final5.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.12474	valid_1's l1: 0.202508
[400]	training's l1: 0.0746857	valid_1's l1: 0.186713
[600]	training's l1: 0.0484242	valid_1's l1: 0.178418
[800]	training's l1: 0.0328702	valid_1's l1: 0.173155
[1000]	training's l1: 0.0231374	valid_1's l1: 0.17003
[1200]	training's l1: 0.0170938	valid_1's l1: 0.167826
[1400]	training's l1: 0.0129644	valid_1's l1: 0.166492
[1600]	training's l1: 0.00998153	valid_1's l1: 0.165687
[1800]	training's l1: 0.00780908	valid_1's l1: 0.165174
[2000]	training's l1: 0.00626257	valid_1's l1: 0.164797
[2200]	training's l1: 0.00511298	valid_1's l1: 0.16453
[2400]	training's l1: 0.00423438	valid_1's l1: 0.164344
[2600]	training's l1: 0.00352876	valid_1's l1: 0.164225
[2800]	training's l1: 0.00298945	valid_1's l1: 0.164111
[3000]	training's l1: 0.00253804	valid_1's l1: 0.164034
[3200]	training's l1: 0.00217207	v

In [31]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    # ID 분리
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 수치형/범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 수치형 문자열 → 숫자
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 파생 변수 생성
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]

    # 결측치 처리
    for col in num_cols + ["BMI","bp_ratio"]:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    for col in cat_cols:
        mode_val = train[col].mode()[0]
        train[col] = train[col].fillna(mode_val)
        test[col]  = test[col].fillna(mode_val)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test  = pd.get_dummies(test, columns=cat_cols)

    # train/test 컬럼 맞추기
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]

    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)

        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=200),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_upgrade.csv", index=False, encoding="utf-8")
    print("✅ submission_upgrade.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.161673	valid_1's l1: 0.213423
[400]	training's l1: 0.110674	valid_1's l1: 0.198642
[600]	training's l1: 0.0788124	valid_1's l1: 0.188029
[800]	training's l1: 0.0580446	valid_1's l1: 0.182076
[1000]	training's l1: 0.0428326	valid_1's l1: 0.177431
[1200]	training's l1: 0.0335409	valid_1's l1: 0.174141
[1400]	training's l1: 0.0260026	valid_1's l1: 0.171755
[1600]	training's l1: 0.0202569	valid_1's l1: 0.169801
[1800]	training's l1: 0.0160592	valid_1's l1: 0.168264
[2000]	training's l1: 0.0128696	valid_1's l1: 0.167239
[2200]	training's l1: 0.0105459	valid_1's l1: 0.166577
[2400]	training's l1: 0.00859967	valid_1's l1: 0.166076
[2600]	training's l1: 0.00726469	valid_1's l1: 0.165686
[2800]	training's l1: 0.00619297	valid_1's l1: 0.165406
[3000]	training's l1: 0.00525413	valid_1's l1: 0.165134
[3200]	training's l1: 0.00452161	val

In [33]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 + Target Encoding
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 수치형 컬럼 (연산에 필요한 모든 컬럼 포함)
    num_cols = ["age","height","weight","cholesterol","systolic_blood_pressure",
                "diastolic_blood_pressure","glucose","bone_density","activity",
                "sleep_pattern","mean_working"]

    # 숫자형으로 변환
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 결측치 처리
    for col in num_cols:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 파생 변수 생성
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["activity_sleep"] = df["activity"] * df["sleep_pattern"]
        df["age_weight_ratio"] = df["age"] / df["weight"]
        df["chol_glu_ratio"] = df["cholesterol"] / (df["glucose"] + 1e-6)  # 0 나누기 방지

    # 범주형 컬럼 추출
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    # Target Encoding
    for col in cat_cols:
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)

    cat_cols_TE = [c+"_TE" for c in cat_cols]

    # Label Encoding
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))
    cat_cols_LE = [c+"_LE" for c in cat_cols]

    # 로그 변환: 왜곡 큰 컬럼
    for col in ["cholesterol","glucose","bone_density","mean_working"]:
        for df in [train, test]:
            df[col] = np.log1p(df[col])

    # 최종 feature 선택
    features = num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio","chol_glu_ratio"] + cat_cols_TE + cat_cols_LE
    X = train[features]
    y = train[target_col]
    test_final = test[features]

    return X, y, test_final, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 (LightGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=10, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )
        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTreesRegressor
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    # 앙상블
    test_preds = (test_preds_lgb + test_preds_et) / 2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_ensemble_final.csv", index=False, encoding="utf-8")
    print("✅ submission_ensemble_final.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.155412	valid_1's l1: 0.211364
[400]	training's l1: 0.103708	valid_1's l1: 0.195744
[600]	training's l1: 0.0746017	valid_1's l1: 0.186451
[800]	training's l1: 0.0526738	valid_1's l1: 0.180136
[1000]	training's l1: 0.0390904	valid_1's l1: 0.176093
[1200]	training's l1: 0.0303305	valid_1's l1: 0.172754
[1400]	training's l1: 0.0234532	valid_1's l1: 0.170489
[1600]	training's l1: 0.0182659	valid_1's l1: 0.168841
[1800]	training's l1: 0.0143555	valid_1's l1: 0.167394
[2000]	training's l1: 0.0113838	valid_1's l1: 0.166371
[2200]	training's l1: 0.00925963	valid_1's l1: 0.165827
[2400]	training's l1: 0.00761224	valid_1's l1: 0.165347
[2600]	training's l1: 0.00637102	valid_1's l1: 0.164986
[2800]	training's l1: 0.00540573	valid_1's l1: 0.164725
[3000]	training's l1: 0.00458162	valid_1's l1: 0.16447
[3200]	training's l1: 0.00394885	val

In [35]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 + Target Encoding
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 수치형 컬럼 안전 변환 (문자열 → 숫자)
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 파생 변수 생성
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce") * pd.to_numeric(df["sleep_pattern"], errors="coerce")
        df["age_weight_ratio"] = df["age"] / df["weight"]

    # 결측치 처리
    for col in num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"]:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 범주형 Target Encoding
    for col in cat_cols:
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)
    cat_cols_TE = [c+"_TE" for c in cat_cols]

    # Label Encoding
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))
    cat_cols_LE = [c+"_LE" for c in cat_cols]

    # 로그 변환: 왜곡이 큰 컬럼
    for col in ["cholesterol","glucose","bone_density","mean_working"]:
        for df in [train, test]:
            df[col] = np.log1p(df[col])

    # 최종 feature 선택
    features = num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"] + cat_cols_TE + cat_cols_LE
    X = train[features]
    y = train[target_col]
    test_final = test[features]

    return X, y, test_final, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 (10-Fold, LightGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=10, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )

        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTreesRegressor
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    # 앙상블: LightGBM + ExtraTrees
    test_preds = (test_preds_lgb + test_preds_et) / 2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.5f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_ensemble8.csv", index=False, encoding="utf-8")
    print("✅ submission_ensemble8.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=10, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.152771	valid_1's l1: 0.203118
[400]	training's l1: 0.101963	valid_1's l1: 0.187286
[600]	training's l1: 0.0694408	valid_1's l1: 0.177144
[800]	training's l1: 0.0499546	valid_1's l1: 0.170418
[1000]	training's l1: 0.0372989	valid_1's l1: 0.166627
[1200]	training's l1: 0.0280691	valid_1's l1: 0.164153
[1400]	training's l1: 0.0217255	valid_1's l1: 0.161834
[1600]	training's l1: 0.0169799	valid_1's l1: 0.160142
[1800]	training's l1: 0.013388	valid_1's l1: 0.158779
[2000]	training's l1: 0.0109113	valid_1's l1: 0.157655
[2200]	training's l1: 0.00905093	valid_1's l1: 0.15699
[2400]	training's l1: 0.0074581	valid_1's l1: 0.156617
[2600]	training's l1: 0.0063024	valid_1's l1: 0.156172
[2800]	training's l1: 0.00532688	valid_1's l1: 0.155911
[3000]	training's l1: 0.00461257	valid_1's l1: 0.155705
[3200]	training's l1: 0.00406532	valid_

In [40]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 + Target/Label Encoding
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 컬럼 구분
    num_cols = ["age","height","weight","cholesterol","glucose","bone_density",
                "systolic_blood_pressure","diastolic_blood_pressure","mean_working"]

    cat_cols = ["gender","smoke_status","medical_history","family_medical_history",
                "edu_level","activity","sleep_pattern"]

    # 수치형 변환 및 결측치 처리
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 파생 변수
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce") * pd.to_numeric(df.get("sleep_pattern_num", df["sleep_pattern"].astype('category').cat.codes), errors="coerce")
        df["age_weight_ratio"] = df["age"] / df["weight"]

    # 범주형 Target Encoding + Label Encoding
    for col in cat_cols:
        # Target Encoding
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)

        # Label Encoding
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))

    # 로그 변환
    for col in ["cholesterol","glucose","bone_density","mean_working"]:
        for df in [train, test]:
            df[col] = np.log1p(df[col])

    # 최종 feature 선택
    features = num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"] + \
               [c+"_TE" for c in cat_cols] + [c+"_LE" for c in cat_cols]
    X = train[features]
    y = train[target_col]
    test_final = test[features]

    return X, y, test_final, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 (LightGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=10, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )

        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTreesRegressor
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    # 앙상블
    test_preds = (test_preds_lgb + test_preds_et) / 2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_ensemble9.csv", index=False, encoding="utf-8")
    print("✅ submission_ensemble9.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=10, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.158798	valid_1's l1: 0.20073
[400]	training's l1: 0.105163	valid_1's l1: 0.18469
[600]	training's l1: 0.0724455	valid_1's l1: 0.176327
[800]	training's l1: 0.0530599	valid_1's l1: 0.169152
[1000]	training's l1: 0.039061	valid_1's l1: 0.164864
[1200]	training's l1: 0.0290139	valid_1's l1: 0.161432
[1400]	training's l1: 0.0222019	valid_1's l1: 0.159184
[1600]	training's l1: 0.0172637	valid_1's l1: 0.157477
[1800]	training's l1: 0.0136142	valid_1's l1: 0.156241
[2000]	training's l1: 0.0111878	valid_1's l1: 0.155488
[2200]	training's l1: 0.00931187	valid_1's l1: 0.154964
[2400]	training's l1: 0.00765657	valid_1's l1: 0.154475
[2600]	training's l1: 0.00645422	valid_1's l1: 0.154102
[2800]	training's l1: 0.00555521	valid_1's l1: 0.153794
[3000]	training's l1: 0.00480564	valid_1's l1: 0.153614
[3200]	training's l1: 0.00425523	valid

In [43]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 + Target Encoding + 안정화
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 수치형 컬럼으로 강제 변환
    for col in num_cols + ["activity", "sleep_pattern"]:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 결측치 처리
    for col in num_cols + ["activity", "sleep_pattern"]:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 파생 변수 생성
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["activity_sleep"] = df["activity"] * df["sleep_pattern"]
        df["age_weight_ratio"] = df["age"] / df["weight"]

    # 결측치 처리 후 파생 변수 NaN 처리
    derived_cols = ["BMI","bp_ratio","activity_sleep","age_weight_ratio"]
    for col in derived_cols:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 범주형 Target Encoding
    for col in cat_cols:
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)

    cat_cols_TE = [c+"_TE" for c in cat_cols]

    # Label Encoding
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))

    cat_cols_LE = [c+"_LE" for c in cat_cols]

    # 로그 변환: 왜곡이 큰 컬럼
    for col in ["cholesterol","glucose","bone_density","mean_working"]:
        for df in [train, test]:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df[col] = df[col].fillna(df[col].mean())
            df[col] = np.log1p(df[col])

    # 최종 feature 선택
    features = num_cols + derived_cols + cat_cols_TE + cat_cols_LE
    X = train[features]
    y = train[target_col]
    test_final = test[features]

    return X, y, test_final, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 (LightGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et  = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )

        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTreesRegressor
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    # 앙상블
    test_preds = (test_preds_lgb + test_preds_et) / 2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable10.csv", index=False, encoding="utf-8")
    print("✅ submission_stable10.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.157243	valid_1's l1: 0.212996
[400]	training's l1: 0.109731	valid_1's l1: 0.197115
[600]	training's l1: 0.0786692	valid_1's l1: 0.187444
[800]	training's l1: 0.0562663	valid_1's l1: 0.180902
[1000]	training's l1: 0.0422886	valid_1's l1: 0.175927
[1200]	training's l1: 0.032645	valid_1's l1: 0.172817
[1400]	training's l1: 0.0252244	valid_1's l1: 0.169915
[1600]	training's l1: 0.0200686	valid_1's l1: 0.168218
[1800]	training's l1: 0.0159076	valid_1's l1: 0.166862
[2000]	training's l1: 0.0126134	valid_1's l1: 0.165923
[2200]	training's l1: 0.0102109	valid_1's l1: 0.165199
[2400]	training's l1: 0.0083976	valid_1's l1: 0.164583
[2600]	training's l1: 0.00698515	valid_1's l1: 0.164157
[2800]	training's l1: 0.00590525	valid_1's l1: 0.16394
[3000]	training's l1: 0.00506106	valid_1's l1: 0.16373
[3200]	training's l1: 0.0043926	valid_1'

In [44]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 1. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 2. 전처리 + 파생변수 + Target Encoding (Out-of-Fold)
# ------------------------------
def preprocess(train, test, target_col="stress_score", n_splits=5, seed=42):
    train_id, test_id = train.get("ID"), test.get("ID")
    if train_id is not None:
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 수치형 문자열 → 숫자
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    # 파생 변수: 안전하게 숫자형만 사용
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]

    # 결측치 처리
    all_num = num_cols + ["BMI", "bp_ratio"]
    for col in all_num:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # Label Encoding + Out-of-Fold Target Encoding
    X = train.copy()
    y = train[target_col].copy()
    test_proc = test.copy()

    oof_te = pd.DataFrame(index=train.index)
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        X[col+"_LE"] = le.transform(train[col].astype(str))
        test_proc[col+"_LE"] = le.transform(test[col].astype(str))

        # Out-of-Fold Target Encoding
        te_col = col+"_TE"
        oof_te[te_col] = 0
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for tr_idx, val_idx in kf.split(X):
            mapping = X.iloc[tr_idx].groupby(col)[target_col].mean()
            oof_te.iloc[val_idx][te_col] = X.iloc[val_idx][col].map(mapping)
        # test set TE
        mapping_full = X.groupby(col)[target_col].mean()
        test_proc[te_col] = test[col].map(mapping_full).fillna(y.mean())

    # 최종 feature 선택
    features = all_num + [c+"_LE" for c in cat_cols] + [c+"_TE" for c in cat_cols]
    X_final = pd.concat([X[all_num], oof_te, X[[c+"_LE" for c in cat_cols]]], axis=1)
    test_final = pd.concat([test_proc[all_num], test_proc[[c+"_TE" for c in cat_cols]], test_proc[[c+"_LE" for c in cat_cols]]], axis=1)

    return X_final, y, test_final, train_id, test_id

# ------------------------------
# 3. 모델 학습 및 예측
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_val],
                              num_boost_round=10000, callbacks=[early_stopping(200), log_evaluation(200)])
        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration)/n_splits

        # ExtraTrees
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test)/n_splits

    test_preds = (test_preds_lgb + test_preds_et)/2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 4. 제출 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable11.csv", index=False, encoding="utf-8")
    print("✅ submission_stable11.csv 저장 완료")

# ------------------------------
# 5. 실행
# ------------------------------
if __name__=="__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score", n_splits=5)
    preds = train_and_predict(X, y, test_proc, n_splits=5)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_te.iloc[val_idx][te_col] = X.iloc[val_idx][col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_te.iloc[val_idx][te_col] = X.iloc[val_idx][col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_te.iloc[val_idx][te_col] = X.iloc[val_idx][col].map(mapping)
A value is

[200]	training's l1: 0.158733	valid_1's l1: 0.212867
[400]	training's l1: 0.108072	valid_1's l1: 0.197941
[600]	training's l1: 0.0761342	valid_1's l1: 0.187993
[800]	training's l1: 0.055692	valid_1's l1: 0.1822
[1000]	training's l1: 0.0421217	valid_1's l1: 0.177856
[1200]	training's l1: 0.0325783	valid_1's l1: 0.174538
[1400]	training's l1: 0.0253634	valid_1's l1: 0.172099
[1600]	training's l1: 0.0200709	valid_1's l1: 0.170679
[1800]	training's l1: 0.0159182	valid_1's l1: 0.169213
[2000]	training's l1: 0.0129131	valid_1's l1: 0.168186
[2200]	training's l1: 0.0105298	valid_1's l1: 0.167322
[2400]	training's l1: 0.00866518	valid_1's l1: 0.166744
[2600]	training's l1: 0.00722881	valid_1's l1: 0.166338
[2800]	training's l1: 0.00617474	valid_1's l1: 0.166033
[3000]	training's l1: 0.00529218	valid_1's l1: 0.165816
[3200]	training's l1: 0.00459172	valid_1's l1: 0.165646
[3400]	training's l1: 0.00402351	valid_1's l1: 0.165516
[3600]	training's l1: 0.0035165	valid_1's l1: 0.165361
[3800]	traini

In [49]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 + Target Encoding
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 없습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 문자열 컬럼 숫자 매핑 (activity, sleep_pattern)
    activity_map = {'light':1, 'moderate':2, 'intense':3}
    sleep_map    = {'sleep difficulty':1, 'normal':2, 'oversleeping':3}

    for df in [train, test]:
        if "activity" in df.columns:
            df["activity_num"] = df["activity"].map(activity_map)
            df["activity_num"].fillna(df["activity_num"].mean(), inplace=True)
        else:
            df["activity_num"] = 0

        if "sleep_pattern" in df.columns:
            df["sleep_num"] = df["sleep_pattern"].map(sleep_map)
            df["sleep_num"].fillna(df["sleep_num"].mean(), inplace=True)
        else:
            df["sleep_num"] = 0

        # 파생 변수
        df["activity_sleep"] = df["activity_num"] * df["sleep_num"]
        df["BMI"]            = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"]       = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["age_weight_ratio"]= df["age"] / df["weight"]

    # 수치형 결측치 처리
    for col in num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"]:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 범주형 Target Encoding
    for col in cat_cols:
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)
    cat_cols_TE = [c+"_TE" for c in cat_cols]

    # Label Encoding
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))
    cat_cols_LE = [c+"_LE" for c in cat_cols]

    # 로그 변환: 왜곡이 큰 컬럼
    for col in ["cholesterol","glucose","bone_density","mean_working"]:
        if col in train.columns:
            train[col] = np.log1p(train[col])
            test[col]  = np.log1p(test[col])

    # 최종 feature 선택
    features = num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"] + cat_cols_TE + cat_cols_LE
    X = train[features]
    y = train[target_col]
    test_final = test[features]

    return X, y, test_final, train_id, test_id

# ------------------------------
# 4. 모델 학습 및 예측 (LightGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et  = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )

        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTreesRegressor
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    # 앙상블: LightGBM + ExtraTrees
    test_preds = (test_preds_lgb + test_preds_et) / 2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable_final12.csv", index=False, encoding="utf-8")
    print("✅ submission_stable_final12.csv 저장 완료")

# ------------------------------
# 6. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)  # KFold 5
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["activity_num"].fillna(df["activity_num"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["sleep_num"].fillna(df["sleep_num"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj

[200]	training's l1: 0.155613	valid_1's l1: 0.212971
[400]	training's l1: 0.107217	valid_1's l1: 0.19947
[600]	training's l1: 0.0749955	valid_1's l1: 0.188296
[800]	training's l1: 0.0534595	valid_1's l1: 0.180494
[1000]	training's l1: 0.0392856	valid_1's l1: 0.17553
[1200]	training's l1: 0.0302429	valid_1's l1: 0.172131
[1400]	training's l1: 0.0231905	valid_1's l1: 0.169667
[1600]	training's l1: 0.0179688	valid_1's l1: 0.167463
[1800]	training's l1: 0.0139769	valid_1's l1: 0.166072
[2000]	training's l1: 0.0110618	valid_1's l1: 0.165147
[2200]	training's l1: 0.00882452	valid_1's l1: 0.164576
[2400]	training's l1: 0.00728902	valid_1's l1: 0.164141
[2600]	training's l1: 0.00601608	valid_1's l1: 0.163785
[2800]	training's l1: 0.00506943	valid_1's l1: 0.163459
[3000]	training's l1: 0.00432016	valid_1's l1: 0.163351
[3200]	training's l1: 0.00374967	valid_1's l1: 0.163247
[3400]	training's l1: 0.0033045	valid_1's l1: 0.163143
[3600]	training's l1: 0.0028994	valid_1's l1: 0.163028
[3800]	train

In [50]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import early_stopping, log_evaluation

TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 1. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 2. 전처리 + 파생변수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 문자열 컬럼 숫자화
    activity_map = {'light':1, 'moderate':2, 'intense':3}
    sleep_map    = {'sleep difficulty':1, 'normal':2, 'oversleeping':3}

    for df in [train, test]:
        df["activity_num"] = df.get("activity", "").map(activity_map).fillna(2)
        df["sleep_num"] = df.get("sleep_pattern", "").map(sleep_map).fillna(2)
        df["activity_sleep"] = df["activity_num"] * df["sleep_num"]
        df["BMI"] = df["weight"] / ((df["height"]/100)**2)
        df["bp_ratio"] = df["systolic_blood_pressure"] / df["diastolic_blood_pressure"]
        df["age_weight_ratio"] = df["age"] / df["weight"]

    # 결측치 처리
    for col in num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"]:
        mean_val = train[col].mean()
        train[col] = train[col].fillna(mean_val)
        test[col]  = test[col].fillna(mean_val)

    # 범주형 Target Encoding
    for col in cat_cols:
        mapping = train.groupby(col)[target_col].mean()
        train[col+"_TE"] = train[col].map(mapping)
        test[col+"_TE"]  = test[col].map(mapping)

    # Label Encoding
    for col in cat_cols:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col+"_LE"] = le.transform(train[col].astype(str))
        test[col+"_LE"]  = le.transform(test[col].astype(str))

    features = num_cols + ["BMI","bp_ratio","activity_sleep","age_weight_ratio"] + [c+"_TE" for c in cat_cols] + [c+"_LE" for c in cat_cols]
    X = train[features]
    y = train[target_col]
    test_final = test[features]
    return X, y, test_final, train_id, test_id

# ------------------------------
# 3. 모델 학습 + 앙상블
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds_lgb = np.zeros(len(test))
    test_preds_et  = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val   = lgb.Dataset(X_val, y_val, reference=lgb_train)
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 63,
            "max_depth": 8,
            "min_data_in_leaf": 20,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        model_lgb = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
        )

        oof_preds[val_idx] = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
        test_preds_lgb += model_lgb.predict(test, num_iteration=model_lgb.best_iteration) / n_splits

        # ExtraTrees
        model_et = ExtraTreesRegressor(n_estimators=500, max_depth=12, random_state=seed)
        model_et.fit(X_train, y_train)
        test_preds_et += model_et.predict(test) / n_splits

    test_preds = (test_preds_lgb + test_preds_et)/2
    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 4. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable13.csv", index=False)
    print("✅ submission_stable13.csv 저장 완료")

# ------------------------------
# 5. 전체 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 0.155613	valid_1's l1: 0.212978
[400]	training's l1: 0.107217	valid_1's l1: 0.199495
[600]	training's l1: 0.0749955	valid_1's l1: 0.188332
[800]	training's l1: 0.0534595	valid_1's l1: 0.180539
[1000]	training's l1: 0.0392856	valid_1's l1: 0.175584
[1200]	training's l1: 0.0302429	valid_1's l1: 0.172183
[1400]	training's l1: 0.0231905	valid_1's l1: 0.169718
[1600]	training's l1: 0.0179688	valid_1's l1: 0.167517
[1800]	training's l1: 0.0139769	valid_1's l1: 0.166126
[2000]	training's l1: 0.0110618	valid_1's l1: 0.165207
[2200]	training's l1: 0.00882452	valid_1's l1: 0.164635
[2400]	training's l1: 0.00728902	valid_1's l1: 0.164202
[2600]	training's l1: 0.00601608	valid_1's l1: 0.163846
[2800]	training's l1: 0.00506943	valid_1's l1: 0.16352
[3000]	training's l1: 0.00432016	valid_1's l1: 0.163412
[3200]	training's l1: 0.00374967	val

In [53]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드 함수
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수 함수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    # ID 제거
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id = train["ID"]
        test_id = test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 숫자 / 범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 숫자 컬럼 변환
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce").fillna(train[col].mean())
        test[col]  = pd.to_numeric(test[col], errors="coerce").fillna(train[col].mean())

    # 범주형 결측치 처리
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col]  = test[col].fillna(mode)

    # 파생변수 생성
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2 + 1e-6)
        df["bp_ratio"] = df["systolic_blood_pressure"] / (df["diastolic_blood_pressure"] + 1e-6)
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce").fillna(0) * pd.to_numeric(df["sleep_pattern"], errors="coerce").fillna(0)
        df["age_weight_ratio"] = df["age"] / (df["weight"] + 1e-6)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]
    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 + 예측 (KFold + LGBM)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 50,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=200)
            ]
        )

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_preds += model.predict(test, num_iteration=model.best_iteration) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable114.csv", index=False)
    print("✅ submission_stable14.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.0780643	valid_1's l1: 0.19049
[400]	training's l1: 0.0346026	valid_1's l1: 0.176302
[600]	training's l1: 0.0181017	valid_1's l1: 0.170847
[800]	training's l1: 0.0103361	valid_1's l1: 0.168592
[1000]	training's l1: 0.00635712	valid_1's l1: 0.167532
[1200]	training's l1: 0.0041836	valid_1's l1: 0.167075
[1400]	training's l1: 0.00292437	valid_1's l1: 0.16676
[1600]	training's l1: 0.00213607	valid_1's l1: 0.166582
[1800]	training's l1: 0.00161625	valid_1's l1: 0.166482
[2000]	training's l1: 0.00125822	valid_1's l1: 0.166432
[2200]	training's l1: 0.00100079	valid_1's l1: 0.166377
[2400]	training's l1: 0.000808962	valid_1's l1: 0.166356
[2600]	training's l1: 0.000657053	valid_1's l1: 0.166334
Early stopping, best iteration is:
[2620]	training's l1: 0.000643373	valid_1's l1: 0.166328

===== Fold 2 =====
Training until validation sc

In [54]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드 함수
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    if target_col not in train.columns:
        raise ValueError(f"{target_col} 컬럼이 train 데이터에 존재하지 않습니다.")

    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id = train["ID"]
        test_id = test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 숫자 / 범주형 컬럼 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 숫자 컬럼 처리
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce").fillna(train[col].mean())
        test[col] = pd.to_numeric(test[col], errors="coerce").fillna(train[col].mean())

    # 범주형 결측치 처리
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

    # 파생변수
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2 + 1e-6)
        df["bp_ratio"] = df["systolic_blood_pressure"] / (df["diastolic_blood_pressure"] + 1e-6)
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce").fillna(0) * pd.to_numeric(df["sleep_pattern"], errors="coerce").fillna(0)
        df["age_weight_ratio"] = df["age"] / (df["weight"] + 1e-6)

    # 범주형 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = np.log1p(train[target_col])  # target scaling

    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 + 예측 (LGBM + ExtraTrees)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
        lgb_params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 64,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        lgb_model = lgb.train(
            lgb_params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=500)]
        )
        lgb_val_pred = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
        lgb_test_pred = lgb_model.predict(test, num_iteration=lgb_model.best_iteration)

        # ExtraTrees
        et_model = ExtraTreesRegressor(n_estimators=500, random_state=seed, n_jobs=-1)
        et_model.fit(X_train, y_train)
        et_val_pred = et_model.predict(X_val)
        et_test_pred = et_model.predict(test)

        # Fold 별 앙상블 (0.6 LGBM + 0.4 ET)
        val_pred = 0.6*lgb_val_pred + 0.4*et_val_pred
        test_pred = 0.6*lgb_test_pred + 0.4*et_test_pred

        oof_preds[val_idx] = val_pred
        test_preds += test_pred / n_splits

    mae = mean_absolute_error(np.expm1(y), np.expm1(oof_preds))  # 스케일 원복
    print(f"\n✅ CV MAE: {mae:.10f}")
    return np.expm1(test_preds)  # 제출용 스케일 원복

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable15.csv", index=False)
    print("✅ submission_stable15.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0250615	valid_1's l1: 0.121009
[1000]	training's l1: 0.00739319	valid_1's l1: 0.114673
[1500]	training's l1: 0.00296279	valid_1's l1: 0.113084
[2000]	training's l1: 0.00146787	valid_1's l1: 0.112659
[2500]	training's l1: 0.000851143	valid_1's l1: 0.112519
[3000]	training's l1: 0.000546174	valid_1's l1: 0.112475
Early stopping, best iteration is:
[3271]	training's l1: 0.000439186	valid_1's l1: 0.112465

===== Fold 2 =====
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0236695	valid_1's l1: 0.126254
[1000]	training's l1: 0.00688538	valid_1's l1: 0.120121
[1500]	training's l1: 0.00283079	valid_1's l1: 0.118793
[2000]	training's l1: 0.00142387	valid_1's l1: 0.118457
[2500]	training's l1: 0.000825909	valid_1's l1: 0.118346
[3000]	training's l1: 0.000518465	valid_1's l1: 0.118305
Early stoppin

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로 설정
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id = train["ID"]
        test_id = test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    # 숫자/범주형 구분
    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 숫자 결측치 처리
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce").fillna(train[col].mean())
        test[col] = pd.to_numeric(test[col], errors="coerce").fillna(train[col].mean())

    # 범주형 결측치 처리
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

    # 파생변수
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2 + 1e-6)
        df["bp_ratio"] = df["systolic_blood_pressure"] / (df["diastolic_blood_pressure"] + 1e-6)
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce").fillna(0) * pd.to_numeric(df["sleep_pattern"], errors="coerce").fillna(0)
        df["age_weight_ratio"] = df["age"] / (df["weight"] + 1e-6)

    # 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]
    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 + 예측 (LGBM + ExtraTrees 앙상블)
# ------------------------------
def train_and_predict(X, y, test, n_splits=5, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
        lgb_params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 50,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": seed,
            "verbose": -1
        }
        lgb_model = lgb.train(
            lgb_params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=200)]
        )

        # ExtraTrees
        et_model = ExtraTreesRegressor(
            n_estimators=500,
            max_depth=None,
            min_samples_split=4,
            min_samples_leaf=2,
            n_jobs=-1,
            random_state=seed
        )
        et_model.fit(X_train, y_train)

        # 앙상블 예측
        oof_preds[val_idx] += 0.55*lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration) + 0.45*et_model.predict(X_val)
        test_preds += (0.55*lgb_model.predict(test, num_iteration=lgb_model.best_iteration) + 0.45*et_model.predict(test)) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 파일 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_stable_no_catboost16.csv", index=False)
    print("✅ submission_stable_no_catboost16.csv 저장 완료")

# ------------------------------
# 6. 전체 파이프라인 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=5, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.0780643	valid_1's l1: 0.19049
[400]	training's l1: 0.0346026	valid_1's l1: 0.176302
[600]	training's l1: 0.0181017	valid_1's l1: 0.170847
[800]	training's l1: 0.0103361	valid_1's l1: 0.168592
[1000]	training's l1: 0.00635712	valid_1's l1: 0.167532
[1200]	training's l1: 0.0041836	valid_1's l1: 0.167075
[1400]	training's l1: 0.00292437	valid_1's l1: 0.16676
[1600]	training's l1: 0.00213607	valid_1's l1: 0.166582
[1800]	training's l1: 0.00161625	valid_1's l1: 0.166482
[2000]	training's l1: 0.00125822	valid_1's l1: 0.166432
[2200]	training's l1: 0.00100079	valid_1's l1: 0.166377
[2400]	training's l1: 0.000808962	valid_1's l1: 0.166356
[2600]	training's l1: 0.000657053	valid_1's l1: 0.166334
Early stopping, best iteration is:
[2620]	training's l1: 0.000643373	valid_1's l1: 0.166328

===== Fold 2 =====
Training until validation sc

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import ExtraTreesRegressor
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# ------------------------------
# 1. 경로
# ------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUB_CSV   = "sample_submission.csv"

# ------------------------------
# 2. 데이터 로드
# ------------------------------
def load_data(train_path, test_path, sub_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    submission = pd.read_csv(sub_path)
    print(f"Train: {train.shape}, Test: {test.shape}, Submission: {submission.shape}")
    return train, test, submission

# ------------------------------
# 3. 전처리 + 파생변수
# ------------------------------
def preprocess(train, test, target_col="stress_score"):
    train_id, test_id = None, None
    if "ID" in train.columns:
        train_id, test_id = train["ID"], test["ID"]
        train = train.drop(columns=["ID"])
        test = test.drop(columns=["ID"])

    num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
    if target_col in num_cols: num_cols.remove(target_col)
    if target_col in cat_cols: cat_cols.remove(target_col)

    # 숫자 결측치 처리
    for col in num_cols:
        train[col] = pd.to_numeric(train[col], errors="coerce").fillna(train[col].mean())
        test[col]  = pd.to_numeric(test[col], errors="coerce").fillna(train[col].mean())

    # 범주형 결측치 처리
    for col in cat_cols:
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col]  = test[col].fillna(mode)

    # 파생변수
    for df in [train, test]:
        df["BMI"] = df["weight"] / ((df["height"]/100)**2 + 1e-6)
        df["bp_ratio"] = df["systolic_blood_pressure"] / (df["diastolic_blood_pressure"] + 1e-6)
        df["pulse_pressure"] = df["systolic_blood_pressure"] - df["diastolic_blood_pressure"]
        df["activity_sleep"] = pd.to_numeric(df["activity"], errors="coerce").fillna(0) * pd.to_numeric(df["sleep_pattern"], errors="coerce").fillna(0)
        df["age_weight_ratio"] = df["age"] / (df["weight"] + 1e-6)
        df["height_weight_interaction"] = df["height"] * df["weight"]

    # 숫자형 정규화
    pt = PowerTransformer()
    train[num_cols] = pt.fit_transform(train[num_cols])
    test[num_cols] = pt.transform(test[num_cols])

    # 원-핫 인코딩
    train = pd.get_dummies(train, columns=cat_cols)
    test = pd.get_dummies(test, columns=cat_cols)
    test = test.reindex(columns=train.drop(columns=[target_col]).columns, fill_value=0)

    X = train.drop(columns=[target_col])
    y = train[target_col]
    return X, y, test, train_id, test_id

# ------------------------------
# 4. 모델 학습 + 예측 (KFold 10 + LGBM + ExtraTrees)
# ------------------------------
def train_and_predict(X, y, test, n_splits=10, seed=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n===== Fold {fold+1} =====")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
        lgb_params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 64,
            "max_depth": 8,
            "feature_fraction": 0.85,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "lambda_l1": 0.5,
            "lambda_l2": 0.5,
            "seed": seed,
            "verbose": -1
        }
        lgb_model = lgb.train(
            lgb_params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            num_boost_round=10000,
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=200)]
        )

        # ExtraTrees
        et_model = ExtraTreesRegressor(
            n_estimators=800,
            max_depth=None,
            min_samples_split=4,
            min_samples_leaf=2,
            n_jobs=-1,
            random_state=seed
        )
        et_model.fit(X_train, y_train)

        # 앙상블 예측
        oof_preds[val_idx] += 0.55*lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration) + 0.45*et_model.predict(X_val)
        test_preds += (0.55*lgb_model.predict(test, num_iteration=lgb_model.best_iteration) + 0.45*et_model.predict(test)) / n_splits

    mae = mean_absolute_error(y, oof_preds)
    print(f"\n✅ CV MAE: {mae:.10f}")
    return test_preds

# ------------------------------
# 5. 제출 저장
# ------------------------------
def save_submission(submission, preds, target_col="stress_score", test_id=None):
    if target_col not in submission.columns:
        submission[target_col] = 0.0
    submission[target_col] = preds
    if test_id is not None:
        submission["ID"] = test_id
    submission.to_csv("submission_ensemble_kfold17.csv", index=False)
    print("✅ submission_ensemble_kfold17.csv 저장 완료")

# ------------------------------
# 6. 실행
# ------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    train, test, submission = load_data(TRAIN_CSV, TEST_CSV, SUB_CSV)
    X, y, test_proc, train_id, test_id = preprocess(train, test, target_col="stress_score")
    preds = train_and_predict(X, y, test_proc, n_splits=10, seed=42)
    save_submission(submission, preds, target_col="stress_score", test_id=test_id)


Train: (3000, 18), Test: (3000, 17), Submission: (3000, 2)

===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.160303	valid_1's l1: 0.201423
[400]	training's l1: 0.108391	valid_1's l1: 0.187561
[600]	training's l1: 0.0771801	valid_1's l1: 0.178092
[800]	training's l1: 0.0597429	valid_1's l1: 0.173022
[1000]	training's l1: 0.0477679	valid_1's l1: 0.169625
[1200]	training's l1: 0.0402488	valid_1's l1: 0.167466
[1400]	training's l1: 0.0356047	valid_1's l1: 0.166186
[1600]	training's l1: 0.0322667	valid_1's l1: 0.165188
[1800]	training's l1: 0.0297167	valid_1's l1: 0.164368
[2000]	training's l1: 0.0278391	valid_1's l1: 0.163772
[2200]	training's l1: 0.0264257	valid_1's l1: 0.163468
[2400]	training's l1: 0.0252323	valid_1's l1: 0.163153
[2600]	training's l1: 0.0242368	valid_1's l1: 0.162739
[2800]	training's l1: 0.0233145	valid_1's l1: 0.162503
[3000]	training's l1: 0.0225815	valid_1's l1: 0.162306
[3200]	training's l1: 0.0219829	valid_1'

In [23]:
print(train.columns.tolist())


['ID', 'gender', 'age', 'height', 'weight', 'cholesterol', 'systolic_blood_pressure', 'diastolic_blood_pressure', 'glucose', 'bone_density', 'activity', 'smoke_status', 'medical_history', 'family_medical_history', 'sleep_pattern', 'edu_level', 'mean_working', 'stress_score']


In [6]:
import pandas as pd

train = pd.read_csv("/content/train.csv")
print(train.columns)


Index(['ID', 'gender', 'age', 'height', 'weight', 'cholesterol',
       'systolic_blood_pressure', 'diastolic_blood_pressure', 'glucose',
       'bone_density', 'activity', 'smoke_status', 'medical_history',
       'family_medical_history', 'sleep_pattern', 'edu_level', 'mean_working',
       'stress_score'],
      dtype='object')
