# Import

In [3]:
import sys
import pandas as pd
import numpy as np
import sklearn
import optuna
import os
import json
import catboost
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


# Data Organization

In [12]:
months = ["07", "08", "09", "10", "11", "12"]
categories = ["회원정보", "신용정보", "승인매출정보", "청구입금정보", "잔액정보", "채널정보", "마케팅정보", "성과정보"]
data_types = ["train", "test"]

def merge_monthly_data(data_type, category):
    merged_list = []
    for month in months:
        file_name = f"./{data_type}/{category}/2018{month}_{data_type}_{category}.parquet"
        try:
            df = pd.read_parquet(file_name, engine="pyarrow")
            merged_list.append(df)
            print(f"✅ {file_name} 변환 완료")
        except FileNotFoundError:
            print(f"⚠️ 파일 없음: {file_name}")
    if merged_list:
        merged_df = pd.concat(merged_list, ignore_index=True)
        output_file = f"./{data_type}/{category}/{data_type}_{category}.csv"
        merged_df.to_csv(output_file, index=False)
        print(f"✅ {output_file} 저장 완료 (Shape: {merged_df.shape})")
    else:
        print(f"❌ {data_type}_{category} 데이터 없음")

for data_type in data_types:
    for category in categories:
        merge_monthly_data(data_type, category)

✅ ./train/회원정보/201807_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/201808_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/201809_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/201810_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/201811_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/201812_train_회원정보.parquet 변환 완료
✅ ./train/회원정보/train_회원정보.csv 저장 완료 (Shape: (2400000, 78))
✅ ./train/신용정보/201807_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/201808_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/201809_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/201810_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/201811_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/201812_train_신용정보.parquet 변환 완료
✅ ./train/신용정보/train_신용정보.csv 저장 완료 (Shape: (2400000, 42))
✅ ./train/승인매출정보/201807_train_승인매출정보.parquet 변환 완료
✅ ./train/승인매출정보/201808_train_승인매출정보.parquet 변환 완료
✅ ./train/승인매출정보/201809_train_승인매출정보.parquet 변환 완료
✅ ./train/승인매출정보/201810_train_승인매출정보.parquet 변환 완료
✅ ./train/승인매출정보/201811_train_승인매출정보.parquet 변환 완료
✅ ./train/승인매출정보/201812_train_승인매출정보.parquet 변환 완료
✅ ./train/승인

# Data Transform


In [13]:
file_names = [
    "train_회원정보.csv",
    "train_신용정보.csv",
    "train_승인매출정보.csv",
    "train_청구입금정보.csv",
    "train_잔액정보.csv",
    "train_채널정보.csv",
    "train_마케팅정보.csv",
    "train_성과정보.csv"
]

df = pd.read_csv(f"./train/{categories[0]}/{file_names[0]}")
for idx, file in enumerate(file_names[1:], start=2):
    print(f"\n🔹 병합 중: {file} ({idx}/{len(file_names)})")
    temp_df = pd.read_csv(f"./train/{categories[idx-1]}/{file}")
    df = df.merge(temp_df, how="left", on=["ID", "기준년월"])
    print(f"✅ 병합 후 크기: {df.shape}")

output_file = "./train/base_train.csv"
df.to_csv(output_file, index=False)
print(f"\n✅ 최종 데이터 저장 완료: {output_file}")
print(f"🧾 최종 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")


🔹 병합 중: train_신용정보.csv (2/8)
✅ 병합 후 크기: (2400000, 118)

🔹 병합 중: train_승인매출정보.csv (3/8)


  temp_df = pd.read_csv(f"./train/{categories[idx-1]}/{file}")


✅ 병합 후 크기: (2400000, 522)

🔹 병합 중: train_청구입금정보.csv (4/8)
✅ 병합 후 크기: (2400000, 566)

🔹 병합 중: train_잔액정보.csv (5/8)
✅ 병합 후 크기: (2400000, 646)

🔹 병합 중: train_채널정보.csv (6/8)
✅ 병합 후 크기: (2400000, 749)

🔹 병합 중: train_마케팅정보.csv (7/8)
✅ 병합 후 크기: (2400000, 811)

🔹 병합 중: train_성과정보.csv (8/8)
✅ 병합 후 크기: (2400000, 858)

✅ 최종 데이터 저장 완료: ./train/base_train.csv
🧾 최종 데이터 크기: 2400000행, 858열


In [14]:
file_names = [
    "test_회원정보.csv",
    "test_신용정보.csv",
    "test_승인매출정보.csv",
    "test_청구입금정보.csv",
    "test_잔액정보.csv",
    "test_채널정보.csv",
    "test_마케팅정보.csv",
    "test_성과정보.csv"
]

df = pd.read_csv(f"./test/{categories[0]}/{file_names[0]}")
for idx, file in enumerate(file_names[1:], start=2):
    print(f"\n🔹 병합 중: {file} ({idx}/{len(file_names)})")
    temp_df = pd.read_csv(f"./test/{categories[idx-1]}/{file}")
    df = df.merge(temp_df, how="left", on=["ID", "기준년월"])
    print(f"✅ 병합 후 크기: {df.shape}")

output_file = "./test/base_test.csv"
df.to_csv(output_file, index=False)
print(f"\n✅ 최종 데이터 저장 완료: {output_file}")
print(f"🧾 최종 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")


🔹 병합 중: test_신용정보.csv (2/8)
✅ 병합 후 크기: (600000, 117)

🔹 병합 중: test_승인매출정보.csv (3/8)


  temp_df = pd.read_csv(f"./test/{categories[idx-1]}/{file}")


✅ 병합 후 크기: (600000, 521)

🔹 병합 중: test_청구입금정보.csv (4/8)
✅ 병합 후 크기: (600000, 565)

🔹 병합 중: test_잔액정보.csv (5/8)
✅ 병합 후 크기: (600000, 645)

🔹 병합 중: test_채널정보.csv (6/8)
✅ 병합 후 크기: (600000, 748)

🔹 병합 중: test_마케팅정보.csv (7/8)
✅ 병합 후 크기: (600000, 810)

🔹 병합 중: test_성과정보.csv (8/8)
✅ 병합 후 크기: (600000, 857)

✅ 최종 데이터 저장 완료: ./test/base_test.csv
🧾 최종 데이터 크기: 600000행, 857열


# Data Preprocessing - 1

In [15]:
file_names = [
    "train_회원정보.csv",
    "train_신용정보.csv",
    "train_승인매출정보.csv",
    "train_청구입금정보.csv",
    "train_잔액정보.csv",
    "train_채널정보.csv",
    "train_마케팅정보.csv",
    "train_성과정보.csv"
]

df = pd.read_csv(f"./train/{categories[0]}/{file_names[0]}")
original_shape = df.shape

for idx, file in enumerate(file_names[1:], start=2):    
    print(f"\n🔹 병합 진행 중: {file} (파일 {idx} / {len(file_names)})")
    temp_df = pd.read_csv(f"./train/{categories[idx-1]}/{file}")
    df = df.merge(temp_df, how="left", on=["ID", "기준년월"])
    print(f"✅ 병합 후 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")

    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        print(f"📌 제거된 모든 값이 동일한 칼럼: {constant_cols}")
        df = df.drop(columns=constant_cols)
    else:
        print("📌 모든 값이 동일한 칼럼 없음")

    col_groups = {}
    for col in df.columns:
        for key in col_groups:
            if df[col].equals(df[key]):
                col_groups[key].append(col)
                break
        else:
            col_groups[col] = [col]

    duplicate_cols = [col for group in col_groups.values() for col in group[1:]]
    if duplicate_cols:
        print(f"📌 제거된 중복 칼럼: {duplicate_cols}")
        df = df.drop(columns=duplicate_cols)
    else:
        print("📌 중복 칼럼 없음")

    if 'ID' in df.columns and df.columns.str.contains('ID').sum() > 1:
        df = df.loc[:, ~df.columns.str.contains('ID', case=False)].join(df[['ID']])

    if '기준년월' in df.columns and df.columns.str.contains('기준년월').sum() > 1:
        df = df.loc[:, ~df.columns.str.contains('기준년월', case=False)].join(df[['기준년월']])

    print(f"🔹 {file} 처리 완료. 현재 데이터 크기: {df.shape[0]}행, {df.shape[1]}열")

new_shape = df.shape
output_file = "./train/base_clean_train.csv"
df.to_csv(output_file, index=False)

print(f"\n✅ 원래 데이터 크기: {original_shape[0]}행, {original_shape[1]}열")
print(f"✅ 병합 후 최종 데이터 크기: {new_shape[0]}행, {new_shape[1]}열")
print(f"\n✅ 최종 데이터 저장 완료: {output_file}")


🔹 병합 진행 중: train_신용정보.csv (파일 2 / 8)
✅ 병합 후 데이터 크기: 2400000행, 118열
📌 제거된 모든 값이 동일한 칼럼: ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M', '시장연체상환여부_R3M']
📌 제거된 중복 칼럼: ['청구금액_기본연회비_B0M', '청구금액_제휴연회비_B0M']
🔹 train_신용정보.csv 처리 완료. 현재 데이터 크기: 2400000행, 106열

🔹 병합 진행 중: train_승인매출정보.csv (파일 3 / 8)


  temp_df = pd.read_csv(f"./train/{categories[idx-1]}/{file}")


✅ 병합 후 데이터 크기: 2400000행, 510열
📌 제거된 모든 값이 동일한 칼럼: ['이용건수_부분무이자_B0M', '이용금액_부분무이자_B0M', '여유_여행이용금액', '납부_렌탈료이용금액', '납부_유선방송이용금액', '납부_건강연금이용금액', '할부건수_부분_3M_R12M', '할부건수_부분_6M_R12M', '할부건수_부분_14M_R12M', '할부금액_부분_3M_R12M', 'RP건수_유선방송_B0M', 'RP건수_건강_B0M', 'RP후경과월_유선방송', 'RP후경과월_건강', '증감_RP건수_유선방송_전월', '증감_RP건수_건강_전월', '이용개월수_당사페이_R6M', '이용금액_당사페이_R6M', '이용금액_당사기타_R6M', '이용건수_당사페이_R6M', '이용건수_당사기타_R6M', '이용금액_당사페이_R3M', '이용금액_당사기타_R3M', '이용건수_당사페이_R3M', '이용건수_당사기타_R3M', '이용금액_당사페이_B0M', '이용금액_당사기타_B0M', '이용건수_당사페이_B0M', '이용건수_당사기타_B0M', '승인거절건수_입력오류_B0M', '승인거절건수_기타_B0M']
📌 제거된 중복 칼럼: ['이용횟수_연체_B0M', '할부건수_부분_12M_R12M']
🔹 train_승인매출정보.csv 처리 완료. 현재 데이터 크기: 2400000행, 477열

🔹 병합 진행 중: train_청구입금정보.csv (파일 4 / 8)
✅ 병합 후 데이터 크기: 2400000행, 521열
📌 제거된 모든 값이 동일한 칼럼: ['대표결제방법코드']
📌 중복 칼럼 없음
🔹 train_청구입금정보.csv 처리 완료. 현재 데이터 크기: 2400000행, 520열

🔹 병합 진행 중: train_잔액정보.csv (파일 5 / 8)
✅ 병합 후 데이터 크기: 2400000행, 600열
📌 제거된 모든 값이 동일한 칼럼: ['카드론잔액_최종경과월', '최종연체개월수_R15M', 'RV잔액이월횟수_R6M', 'RV잔액이월횟수_R3M', '연체잔액_

In [16]:
test_file_names = [
    "test_회원정보.csv",
    "test_신용정보.csv",
    "test_승인매출정보.csv",
    "test_청구입금정보.csv",
    "test_잔액정보.csv",
    "test_채널정보.csv",
    "test_마케팅정보.csv",
    "test_성과정보.csv"
]

test_df = pd.read_csv(f"./test/{categories[0]}/{test_file_names[0]}")
test_original_shape = test_df.shape

for idx, file in enumerate(test_file_names[1:], start=2):
    print(f"\n🔹 병합 진행 중: {file} (파일 {idx} / {len(test_file_names)})")
    temp_df = pd.read_csv(f"./test/{categories[idx-1]}/{file}")
    test_df = test_df.merge(temp_df, how="left", on=["ID", "기준년월"])
    print(f"✅ 병합 후 데이터 크기: {test_df.shape[0]}행, {test_df.shape[1]}열")

train_df = pd.read_csv("./train/base_clean_train.csv", nrows=1)
train_columns = train_df.columns
test_columns_to_keep = [col for col in test_df.columns if col in train_columns]

test_df = test_df[test_columns_to_keep]
test_final_shape = test_df.shape
test_output_file = "./test/base_clean_test.csv"
test_df.to_csv(test_output_file, index=False)

print(f"\n✅ 원래 test 데이터 크기: {test_original_shape[0]}행, {test_original_shape[1]}열")
print(f"✅ 병합 후 최종 test 데이터 크기: {test_final_shape[0]}행, {test_final_shape[1]}열")
print(f"\n✅ 최종 test 데이터 저장 완료: {test_output_file}")

train_col_set = set(train_columns)
test_col_set = set(test_df.columns)
if train_col_set == test_col_set:
    print("\n✅ train과 test의 컬럼이 완전히 일치합니다!")
else:
    train_only_cols = train_col_set - test_col_set
    test_only_cols = test_col_set - train_col_set
    print(f"\n⚠️ train과 test의 컬럼이 다릅니다!")
    print(f"🔹 train에만 있는 컬럼 ({len(train_only_cols)}개): {train_only_cols}")
    print(f"🔹 test에만 있는 컬럼 ({len(test_only_cols)}개): {test_only_cols}")


🔹 병합 진행 중: test_신용정보.csv (파일 2 / 8)
✅ 병합 후 데이터 크기: 600000행, 117열

🔹 병합 진행 중: test_승인매출정보.csv (파일 3 / 8)


  temp_df = pd.read_csv(f"./test/{categories[idx-1]}/{file}")


✅ 병합 후 데이터 크기: 600000행, 521열

🔹 병합 진행 중: test_청구입금정보.csv (파일 4 / 8)
✅ 병합 후 데이터 크기: 600000행, 565열

🔹 병합 진행 중: test_잔액정보.csv (파일 5 / 8)
✅ 병합 후 데이터 크기: 600000행, 645열

🔹 병합 진행 중: test_채널정보.csv (파일 6 / 8)
✅ 병합 후 데이터 크기: 600000행, 748열

🔹 병합 진행 중: test_마케팅정보.csv (파일 7 / 8)
✅ 병합 후 데이터 크기: 600000행, 810열

🔹 병합 진행 중: test_성과정보.csv (파일 8 / 8)
✅ 병합 후 데이터 크기: 600000행, 857열

✅ 원래 test 데이터 크기: 600000행, 77열
✅ 병합 후 최종 test 데이터 크기: 600000행, 745열

✅ 최종 test 데이터 저장 완료: ./test/base_clean_test.csv

⚠️ train과 test의 컬럼이 다릅니다!
🔹 train에만 있는 컬럼 (1개): {'Segment'}
🔹 test에만 있는 컬럼 (0개): set()


# Modeling - 1

In [1]:
import torch
print("torch:", torch.__version__)
print("torch.version.cuda:", torch.version.cuda)   # 12.1 또는 11.8이어야 함 (None이면 CPU 빌드)
print("cuda available:", torch.cuda.is_available())

torch: 2.8.0+cu126
torch.version.cuda: 12.6
cuda available: True


In [4]:
train = pd.read_csv('./train/base_clean_train.csv')
test = pd.read_csv('./test/base_clean_test.csv')

ab_ids = train[train['Segment'].isin(['A', 'B'])]['ID'].unique()
train = train[~train['ID'].isin(ab_ids)].copy()

label_encoder = LabelEncoder()
train['Segment'] = label_encoder.fit_transform(train['Segment'])

X = train.drop(columns=['Segment', 'ID'])
y = train['Segment']
X_test = test.drop(columns=['ID'])

cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

best_params = {
    "bootstrap_type": "Bayesian",
    "learning_rate": 0.2997682904093563,
    "l2_leaf_reg": 9.214022161348987,
    "random_strength": 7.342192789415524,
    "bagging_temperature": 0.11417356499443036,
    "border_count": 251,
    "iterations": 1500,
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1",
    "task_type": "GPU",
    "verbose": 100,
    "random_seed": 42,
    "depth": 8,
    "class_weights": [2, 1, 1]
}

n_classes = len(np.unique(y))
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
all_test_probs = np.zeros((X_test.shape[0], n_classes))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"🚀 Fold {fold+1} training...")
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_valid_fold, y_valid_fold = X.iloc[valid_idx], y.iloc[valid_idx]
    model = CatBoostClassifier(**best_params)
    model.fit(X_train_fold, y_train_fold, cat_features=cat_features)
    fold_probs = model.predict_proba(X_test)
    all_test_probs += fold_probs

    # for 루프 안, 마지막에 추가
    if fold == kf.get_n_splits() - 1:  # 마지막 fold일 때
        model.save_model("./models/segment_model_last.cbm")

avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=range(n_classes))
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs.drop(columns='ID').values.argmax(axis=1)
segment_mapping = {0: 'C', 1: 'D', 2: 'E'}
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

submission = pd.DataFrame({'ID': mean_probs['ID'], 'Segment': mean_probs['Segment']})
submission.to_csv('./test/base_catboost_kfold.csv', index=False)
print("✅ CatBoost + 10-Fold CV 예측 완료 및 저장 🎯")

  train = pd.read_csv('./train/base_clean_train.csv')
  test = pd.read_csv('./test/base_clean_test.csv')


🚀 Fold 1 training...
0:	learn: 0.8425386	total: 579ms	remaining: 14m 27s
100:	learn: 0.9045248	total: 20.6s	remaining: 4m 44s
200:	learn: 0.9178476	total: 39.8s	remaining: 4m 17s
300:	learn: 0.9269867	total: 59.1s	remaining: 3m 55s
400:	learn: 0.9341938	total: 1m 18s	remaining: 3m 34s
500:	learn: 0.9401372	total: 1m 37s	remaining: 3m 14s
600:	learn: 0.9451065	total: 1m 57s	remaining: 2m 55s
700:	learn: 0.9495277	total: 2m 16s	remaining: 2m 35s
800:	learn: 0.9533706	total: 2m 36s	remaining: 2m 16s
900:	learn: 0.9568719	total: 2m 55s	remaining: 1m 56s
1000:	learn: 0.9595244	total: 3m 14s	remaining: 1m 36s
1100:	learn: 0.9621529	total: 3m 33s	remaining: 1m 17s
1200:	learn: 0.9646592	total: 3m 52s	remaining: 58s
1300:	learn: 0.9668975	total: 4m 12s	remaining: 38.6s
1400:	learn: 0.9691693	total: 4m 31s	remaining: 19.2s
1499:	learn: 0.9712015	total: 4m 51s	remaining: 0us
🚀 Fold 2 training...
0:	learn: 0.8425332	total: 532ms	remaining: 13m 17s
100:	learn: 0.9043988	total: 20.8s	remaining: 4m 

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc1 in position 10: invalid start byte

# Data Preprocessing - 2

In [2]:
train = pd.read_csv('./train/base_clean_train.csv')
test = pd.read_csv('./test/base_clean_test.csv')

train_A = train[train['Segment'] == 'A']
cols_to_check = [col for col in train.columns if col not in ['ID', 'Segment']]

def is_fixed_column(df, col):
    return df[col].nunique() == 1

fixed_columns_A = {col: train_A[col].iloc[0] for col in cols_to_check if is_fixed_column(train_A, col)}
fixed_cols = list(fixed_columns_A.keys())
print(f"📦 고정된 칼럼 {len(fixed_cols)}개 제거할 예정입니다.")

matching_ids_train = train.copy()
for col, value in fixed_columns_A.items():
    matching_ids_train = matching_ids_train[matching_ids_train[col] == value]
matching_ids_train_list = matching_ids_train.groupby('ID').filter(lambda x: len(x) == 6)['ID'].unique()

matching_ids_test = test.copy()
for col, value in fixed_columns_A.items():
    matching_ids_test = matching_ids_test[matching_ids_test[col] == value]
matching_ids_test_list = matching_ids_test.groupby('ID').filter(lambda x: len(x) == 6)['ID'].unique()

train_filtered = train[train['ID'].isin(matching_ids_train_list)].drop(columns=fixed_cols)
test_filtered = test[test['ID'].isin(matching_ids_test_list)].drop(columns=fixed_cols)

print(f"🚀 최종 train 데이터 shape: {train_filtered.shape}")
print(f"🚀 최종 test 데이터 shape: {test_filtered.shape}")
train_filtered.to_csv('./train/train_vips_A.csv', index=False)
test_filtered.to_csv('./test/test_vips_A.csv', index=False)

  train = pd.read_csv('./train/base_clean_train.csv')
  test = pd.read_csv('./test/base_clean_test.csv')


📦 고정된 칼럼 83개 제거할 예정입니다.
🚀 최종 train 데이터 shape: (249594, 663)
🚀 최종 test 데이터 shape: (62586, 662)


In [3]:
train = pd.read_csv('./train/base_clean_train.csv')
test = pd.read_csv('./test/base_clean_test.csv')

train_B = train[train['Segment'] == 'B']
cols_to_check = [col for col in train.columns if col not in ['ID', 'Segment']]

def is_fixed_column(df, col):
    return df[col].nunique() == 1

fixed_columns_B = {col: train_B[col].iloc[0] for col in cols_to_check if is_fixed_column(train_B, col)}
fixed_cols = list(fixed_columns_B.keys())
print(f"📦 고정된 칼럼 {len(fixed_cols)}개 제거할 예정입니다.")

matching_ids_train = train.copy()
for col, value in fixed_columns_B.items():
    matching_ids_train = matching_ids_train[matching_ids_train[col] == value]
matching_ids_train_list = matching_ids_train.groupby('ID').filter(lambda x: len(x) == 6)['ID'].unique()

matching_ids_test = test.copy()
for col, value in fixed_columns_B.items():
    matching_ids_test = matching_ids_test[matching_ids_test[col] == value]
matching_ids_test_list = matching_ids_test.groupby('ID').filter(lambda x: len(x) == 6)['ID'].unique()

train_filtered = train[train['ID'].isin(matching_ids_train_list)].drop(columns=fixed_cols)
test_filtered = test[test['ID'].isin(matching_ids_test_list)].drop(columns=fixed_cols)

print(f"🚀 최종 train 데이터 shape: {train_filtered.shape}")
print(f"🚀 최종 test 데이터 shape: {test_filtered.shape}")
train_filtered.to_csv('./train/train_vips_B.csv', index=False)
test_filtered.to_csv('./test/test_vips_B.csv', index=False)

  train = pd.read_csv('./train/base_clean_train.csv')
  test = pd.read_csv('./test/base_clean_test.csv')


📦 고정된 칼럼 121개 제거할 예정입니다.
🚀 최종 train 데이터 shape: (127746, 625)
🚀 최종 test 데이터 shape: (31998, 624)


# Modeling - 2

In [4]:
train = pd.read_csv('./train/train_vips_A.csv')
test = pd.read_csv('./test/test_vips_A.csv')

label_encoder = LabelEncoder()
train['Segment'] = label_encoder.fit_transform(train['Segment'])

X = train.drop(columns=['Segment', 'ID'])
y = train['Segment']
X_test = test.drop(columns=['ID'])

cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

params = {
    'iterations': 2000,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'verbose': 100,
    'random_seed': 42,
    'task_type': 'GPU',
    'class_weights': [20, 50, 2, 1, 1],
}

n_classes = 5
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print(f"\n🚀 단일 Model Run 시작")
all_test_probs = np.zeros((X_test.shape[0], n_classes))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"📂 Fold {fold + 1}")
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_valid_fold, y_valid_fold),
        cat_features=cat_features,
        early_stopping_rounds=100,
        use_best_model=True
    )
    test_probs = model.predict_proba(X_test)
    all_test_probs += test_probs

avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=[0, 1, 2, 3, 4])
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs[[0, 1, 2, 3, 4]].idxmax(axis=1)
segment_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

a_ids = mean_probs.loc[mean_probs['Segment'] == 'A', 'ID'].tolist()
print(f"\n✅ A로 분류된 ID 수 = {len(a_ids)}개")
print(f"🔎 A ID: {a_ids[:50]}")


🚀 단일 Model Run 시작
📂 Fold 1
0:	learn: 1.5334477	test: 1.5335813	best: 1.5335813 (0)	total: 98.9ms	remaining: 3m 17s
100:	learn: 0.6723557	test: 0.6874511	best: 0.6874511 (100)	total: 3.06s	remaining: 57.5s
200:	learn: 0.6042565	test: 0.6257129	best: 0.6257129 (200)	total: 6.02s	remaining: 53.9s
300:	learn: 0.5677055	test: 0.5937705	best: 0.5937705 (300)	total: 8.97s	remaining: 50.6s
400:	learn: 0.5423898	test: 0.5712051	best: 0.5712051 (400)	total: 11.9s	remaining: 47.6s
500:	learn: 0.5212180	test: 0.5518802	best: 0.5518802 (500)	total: 14.9s	remaining: 44.6s
600:	learn: 0.5028475	test: 0.5353221	best: 0.5353221 (600)	total: 17.9s	remaining: 41.7s
700:	learn: 0.4873043	test: 0.5214620	best: 0.5214620 (700)	total: 20.9s	remaining: 38.7s
800:	learn: 0.4745080	test: 0.5107807	best: 0.5107807 (800)	total: 23.8s	remaining: 35.7s
900:	learn: 0.4623019	test: 0.5001477	best: 0.5001477 (900)	total: 26.8s	remaining: 32.7s
1000:	learn: 0.4513416	test: 0.4911058	best: 0.4911058 (1000)	total: 29.8s

In [None]:
train = pd.read_csv('./train/train_vips_B.csv')
test = pd.read_csv('./test/test_vips_B.csv')

label_encoder = LabelEncoder()
train['Segment'] = label_encoder.fit_transform(train['Segment'])

X = train.drop(columns=['Segment', 'ID'])
y = train['Segment']
X_test = test.drop(columns=['ID'])

cat_features = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 8,
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'verbose': 100,
    'random_seed': 42,
    'task_type': 'GPU',
    'class_weights': [10, 10, 1, 1, 1],
}

n_classes = 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"\n🚀 단일 Model Run 시작")
all_test_probs = np.zeros((X_test.shape[0], n_classes))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"📂 Fold {fold + 1}")
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_valid_fold, y_valid_fold),
        cat_features=cat_features,
        early_stopping_rounds=100,
        use_best_model=True
    )
    test_probs = model.predict_proba(X_test)
    all_test_probs += test_probs

avg_test_probs = all_test_probs / kf.get_n_splits() 
prob_df = pd.DataFrame(avg_test_probs, columns=[0, 1, 2, 3, 4])
prob_df['ID'] = test['ID'].values

mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs[[0, 1, 2, 3, 4]].idxmax(axis=1)
segment_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

b_ids = mean_probs.loc[mean_probs['Segment'] == 'B', 'ID'].tolist()
print(f"\n✅ B로 분류된 ID 수 = {len(b_ids)}개")
print(f"🔎 B ID: {b_ids[:5]}")


🚀 단일 Model Run 시작
📂 Fold 1
0:	learn: 1.5620331	test: 1.5624593	best: 1.5624593 (0)	total: 49.6ms	remaining: 49.5s
100:	learn: 0.7242639	test: 0.7455356	best: 0.7455356 (100)	total: 5.09s	remaining: 45.3s
200:	learn: 0.6362751	test: 0.6708864	best: 0.6708864 (200)	total: 10.1s	remaining: 40.3s
300:	learn: 0.5875421	test: 0.6309730	best: 0.6309730 (300)	total: 15.3s	remaining: 35.4s
400:	learn: 0.5506352	test: 0.6004816	best: 0.6004816 (400)	total: 20.4s	remaining: 30.4s
500:	learn: 0.5212347	test: 0.5778563	best: 0.5778563 (500)	total: 25.4s	remaining: 25.3s
600:	learn: 0.4936263	test: 0.5553482	best: 0.5553482 (600)	total: 30.5s	remaining: 20.2s
700:	learn: 0.4698285	test: 0.5361675	best: 0.5361675 (700)	total: 35.5s	remaining: 15.2s
800:	learn: 0.4478868	test: 0.5185214	best: 0.5185214 (800)	total: 40.6s	remaining: 10.1s
900:	learn: 0.4280761	test: 0.5032189	best: 0.5032189 (900)	total: 45.7s	remaining: 5.02s
999:	learn: 0.4094466	test: 0.4881741	best: 0.4881741 (999)	total: 50.7s	re

# Submission

In [6]:
base_df = pd.read_csv('./test/base_catboost_kfold.csv')
base_df.loc[base_df['ID'].isin(a_ids), 'Segment'] = 'A'
base_df.loc[base_df['ID'].isin(b_ids), 'Segment'] = 'B'
base_df.to_csv('./test/final_catboost.csv', index=False)

print(f"✅ Segment가 'A'로 수정된 {len(a_ids)}개 ID 반영 완료")
print(f"✅ Segment가 'B'로 수정된 {len(b_ids)}개 ID 반영 완료")
print("🎯 최종 결과 저장 완료: final_catboost.csv")

✅ Segment가 'A'로 수정된 38개 ID 반영 완료
✅ Segment가 'B'로 수정된 1개 ID 반영 완료
🎯 최종 결과 저장 완료: final_catboost.csv


## Model Evaluation

In [2]:
import json, pandas as pd

# test_error.tsv에서 최소값(=최적 반복 시 검증 지표)
df = pd.read_csv("catboost_info/test_error.tsv", sep="\t")
best_iter = int(df['MultiClass'].idxmin())     # eval_metric이 다르면 컬럼명 바꾸기
best_val  = float(df.loc[best_iter, 'MultiClass'])
print("best_iter:", best_iter, "best_val:", best_val)

# catboost_training.json에서 베스트 정보
with open("catboost_info/catboost_training.json") as f:
    log = json.load(f)
print("best_iteration:", log.get("best_iteration"))
print("best_scores:", log.get("best_scores"))  # {'learn': {...}, 'validation': {...}}

best_iter: 999 best_val: 0.485408788
best_iteration: None
best_scores: None


In [3]:
import pandas as pd

df = pd.read_csv("catboost_info/test_error.tsv", sep="\t")
metric = df.columns[1]  # 첫 컬럼은 iter, 두 번째가 지표

min_metrics = {"MultiClass", "Logloss", "RMSE", "MAE"}
if metric in min_metrics:
    best_iter = int(df[metric].idxmin())
else:
    best_iter = int(df[metric].idxmax())

best_val = float(df.loc[best_iter, metric])
print(f"metric={metric}  best_iter={best_iter}  best_val={best_val:.6f}")


metric=MultiClass  best_iter=999  best_val=0.485409
