In [1]:
# 1. 라이브러리 임포트
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# 2. 데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# 3. 라벨 컬럼 자동 탐지
label_col = None
for col in train.columns:
    if train[col].nunique() == 2 and train[col].dtype in [np.int64, np.int32, np.int8]:
        label_col = col
        print(f"🔍 라벨 컬럼 자동 감지됨: {label_col}")
        break
if label_col is None:
    raise ValueError("⚠️ 이진 분류용 라벨 컬럼을 찾을 수 없습니다.")

# 4. 피처/타겟 분리
X = train.drop(columns=[label_col])
y = train[label_col]
X_test = test.copy()

# 5. 범주형 변수 안전하게 인코딩
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    X[col], _ = pd.factorize(X[col])
    X_test[col], _ = pd.factorize(X_test[col])  # 따로 factorize해서 오류 방지

# 6. 데이터 정규화 (선택 사항)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# 7. 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

# 8. LightGBM 하이퍼파라미터 튜닝
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [-1, 10, 20]
}

# 그리드 서치를 사용하여 최적 파라미터 찾기
grid_search = GridSearchCV(estimator=lgb.LGBMClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3, scoring='f1', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 9. 최적 파라미터 출력
print(f"최적 파라미터: {grid_search.best_params_}")

# 10. 모델 학습
model = grid_search.best_estimator_
model.fit(X_train, y_train)

# 11. 검증 F1 점수 출력
val_pred = model.predict(X_val)
score = f1_score(y_val, val_pred)
print(f"✅ 검증 F1 Score: {score:.4f}")

# 12. 예측 및 제출파일 저장
submission['label'] = model.predict(X_test_scaled)
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv 저장 완료")


🔍 라벨 컬럼 자동 감지됨: Cancer
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1388
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463
최적 파라미터: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'num_leaves': 100}
[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1388
[Li



✅ 검증 F1 Score: 0.2884
✅ submission.csv 저장 완료


In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score

# 데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# 라벨 컬럼 자동 감지
label_col = None
for col in train.columns:
    if train[col].nunique() == 2 and train[col].dtype in [np.int64, np.int32, np.int8]:
        label_col = col
        print(f"🔍 라벨 컬럼 자동 감지됨: {label_col}")
        break
if label_col is None:
    raise ValueError("⚠️ 라벨 컬럼을 찾을 수 없습니다.")

# 데이터 분리
X = train.drop(columns=[label_col])
y = train[label_col]
X_test = test.copy()

# 범주형 변수 인코딩 (pd.factorize 사용)
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    X[col], _ = pd.factorize(X[col])
    X_test[col], _ = pd.factorize(X_test[col])  # 테스트 데이터의 새로운 범주 처리

# 특성 엔지니어링 (수치형 변수 곱셈)
num_cols = X.select_dtypes(include=np.number).columns
for i in range(len(num_cols)):
    for j in range(i + 1, len(num_cols)):
        X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
        X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]

# Stratified K-Fold 설정
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# LightGBM 모델 초기화
lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=-1)

# 하이퍼파라미터 그리드
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [-1, 5]
}

# GridSearchCV 설정
grid_search = GridSearchCV(lgbm, param_grid, scoring='f1', cv=skf, verbose=1)

# GridSearchCV 실행
grid_search.fit(X, y)

# 최적 파라미터 및 점수 출력
print("최적 파라미터:", grid_search.best_params_)
print("✅ 검증 F1 Score:", grid_search.best_score_)

# 최적 모델로 예측
best_model = grid_search.best_estimator_
test_preds = best_model.predict(X_test)

# 결과 반영 및 저장
submission['label'] = test_preds.astype(int)
submission.to_csv("submission_fe.csv", index=False)
print("✅ submission_fe.csv 저장 완료")

🔍 라벨 컬럼 자동 감지됨: Cancer


  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 6972, number of negative: 51134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 58106, number of used features: 120
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119988 -> initscore=-1.992548
[LightGBM] [Info] Start training from score -1.992548
[LightGBM] [Info] Number of positive: 6973, number of negative: 51133
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17802
[LightGBM] [Info] Number of data points in the train set: 58106, number of used features: 120
[LightGBM] [Info] [

In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score

# 데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# 라벨 컬럼 자동 감지
label_col = None
for col in train.columns:
    if train[col].nunique() == 2 and train[col].dtype in [np.int64, np.int32, np.int8]:
        label_col = col
        print(f"🔍 라벨 컬럼 자동 감지됨: {label_col}")
        break
if label_col is None:
    raise ValueError("⚠️ 라벨 컬럼을 찾을 수 없습니다.")

# 데이터 분리
X = train.drop(columns=[label_col])
y = train[label_col]
X_test = test.copy()

# 범주형 변수 인코딩 (pd.factorize 사용)
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    X[col], _ = pd.factorize(X[col])
    X_test[col], _ = pd.factorize(X_test[col])  # 테스트 데이터의 새로운 범주 처리

# 특성 엔지니어링 (수치형 변수 곱셈)
num_cols = X.select_dtypes(include=np.number).columns
for i in range(len(num_cols)):
    for j in range(i + 1, len(num_cols)):
        X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
        X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]

# Stratified K-Fold 설정
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# LightGBM 모델 초기화
lgbm = lgb.LGBMClassifier(random_state=42, n_jobs=-1)

# 하이퍼파라미터 그리드
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [-1, 5]
}

# GridSearchCV 설정
grid_search = GridSearchCV(lgbm, param_grid, scoring='f1', cv=skf, verbose=1)

# GridSearchCV 실행
grid_search.fit(X, y)

# 최적 파라미터 및 점수 출력
print("최적 파라미터:", grid_search.best_params_)
print("✅ 검증 F1 Score:", grid_search.best_score_)

# 최적 모델로 예측
best_model = grid_search.best_estimator_
test_preds = best_model.predict(X_test)

# 결과 반영 및 저장
submission['label'] = test_preds.astype(int)
submission.to_csv("/data/submission_fe.csv", index=False)
print("✅ submission_fe.csv 저장 완료")

🔍 라벨 컬럼 자동 감지됨: Cancer


  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]
  X_test[f'{num_cols[i]}*{num_cols[j]}'] = X_test[num_cols[i]] * X_test[num_cols[j]]
  X[f'{num_cols[i]}*{num_cols[j]}'] = X[num_cols[i]] * X[num_cols[j]]


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 6972, number of negative: 51134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 58106, number of used features: 120
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119988 -> initscore=-1.992548
[LightGBM] [Info] Start training from score -1.992548
[LightGBM] [Info] Number of positive: 6973, number of negative: 51133
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17802
[LightGBM] [Info] Number of data points in the train set: 58106, number of used features: 120
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120005 -> initscore=-1.992385
[Li

OSError: Cannot save file into a non-existent directory: '\data'