In [None]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [None]:
# Read the two training extracts (file names suggest months 07-10 and 11-12).
train_data, train_data2 = (
    pd.read_csv(csv_name)
    for csv_name in ('train07to10.csv', 'train11to12.csv')
)

In [None]:
# Read the two test extracts that mirror the training files.
test_data, test_data2 = (
    pd.read_csv(csv_name)
    for csv_name in ('test07to10.csv', 'test11to12.csv')
)

In [None]:
# Stack both training extracts row-wise. ignore_index=True rebuilds a unique
# 0..n-1 index; without it each file keeps its own 0-based index and the
# combined frame ends up with duplicated index labels.
train_df = pd.concat([train_data, train_data2], ignore_index=True)

In [None]:
# Stack both test extracts row-wise. ignore_index=True rebuilds a unique
# 0..n-1 index; without it each file keeps its own 0-based index and the
# combined frame ends up with duplicated index labels.
test_df = pd.concat([test_data, test_data2], ignore_index=True)

In [None]:
# Every column except the row identifier and the target is a model feature.
non_feature_cols = ("ID", "Segment")
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

# Work on copies so later in-place encoding never touches train_df.
y = train_df["Segment"].copy()
X = train_df[feature_cols].copy()

# Encode the target labels as integers.
le_target = LabelEncoder()
le_target.fit(y)
y_encoded = le_target.transform(y)

In [None]:
# Label-encode every object-dtype feature, fitting on the training frame and
# applying the same mapping to the test frame.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # fitted encoder per column

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    # Labels present only in the test data get codes after the training
    # classes so that transform() does not raise on them.
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        # Sort before appending: set iteration order of strings changes
        # between interpreter runs (hash randomisation), which would make the
        # codes of unseen labels differ from one kernel restart to the next.
        # key=str keeps the sort well-defined if the values are of mixed type.
        le_train.classes_ = np.append(
            le_train.classes_, sorted(unseen_labels_val, key=str)
        )
    X_test[col] = le_train.transform(X_test[col])

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target because the classes are heavily imbalanced; an unstratified draw can
# leave the rarest classes badly under-represented in the validation fold.
# NOTE(review): rows sharing an ID can still land on both sides of the split;
# consider a group-wise split if that matters for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [None]:
# Keep only the training feature columns, in the same order as X.
X_test = X_test.loc[:, X.columns]

In [None]:
# Custom macro-F1 evaluation function for XGBoost
def xgb_f1_score(y_pred, dtrain):
    """Compute macro-averaged F1 from per-class scores.

    Parameters
    ----------
    y_pred : array-like
        Per-class scores, either flat (n_rows * n_classes values) or already
        shaped (n_rows, n_classes).
    dtrain : object exposing ``get_label()``
        Container holding the true integer labels for the scored rows.

    Returns
    -------
    tuple
        ``('f1_macro', score)`` where ``score`` is the macro F1 of the
        arg-max class predictions.

    NOTE(review): this ``(predictions, data)`` signature returning a
    ``(name, value)`` pair is the native-API custom-metric shape. The
    scikit-learn wrapper documents a ``(y_true, y_pred)`` callable returning
    a float for its constructor ``eval_metric`` -- confirm before passing
    this function there.
    """
    y_true = dtrain.get_label()
    # One row of scores per label; the class count is inferred from the data
    # instead of being hard-coded to 5.
    class_scores = np.asarray(y_pred).reshape(len(y_true), -1)
    y_pred_labels = class_scores.argmax(axis=1)
    return 'f1_macro', f1_score(y_true, y_pred_labels, average='macro')

In [None]:
# Show how the original class names map to the encoded integers.
class_mapping = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))
print(class_mapping)  # {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}


# 2. Row counts per class, used to derive inverse-frequency class weights.
class_counts = {
    'A': 972,
    'B': 144,
    'C': 127590,
    'D': 349242,
    'E': 1922052
}
# The largest class gets weight 1; rarer classes are up-weighted in proportion.
reference = class_counts['E']
weights = {le_target.transform([k])[0]: reference / v for k, v in class_counts.items()}

# 3. Per-row weights for the rows that are actually trained on. They must be
# computed from y_train so that they line up one-to-one with X_train.
sample_weights = compute_sample_weight(class_weight=weights, y=y_train)


def macro_f1_error(y_true, y_score, sample_weight=None):
    """Return ``1 - macro F1`` as a cost for XGBoost's scikit-learn wrapper.

    A callable ``eval_metric`` given to the estimator constructor is called
    as ``metric(y_true, y_score)`` and is minimised during early stopping, so
    the score is returned as an error (lower is better).

    Parameters
    ----------
    y_true : array-like of shape (n_rows,)
        True encoded class labels.
    y_score : array-like
        Per-class scores, shaped (n_rows, n_classes) or flat.
    sample_weight : array-like, optional
        Row weights, forwarded to ``f1_score`` when supplied.

    Returns
    -------
    float
        ``1.0 - f1_score(..., average='macro')``.
    """
    y_true = np.asarray(y_true).astype(int)
    class_scores = np.asarray(y_score).reshape(len(y_true), -1)
    y_pred_labels = class_scores.argmax(axis=1)
    return 1.0 - f1_score(
        y_true, y_pred_labels, average='macro', sample_weight=sample_weight
    )


# 4. Train the XGBoost model.
model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(le_target.classes_),
    eval_metric=macro_f1_error,
    # Early stopping is configured on the estimator; passing it to fit() is
    # deprecated and no longer accepted by newer XGBoost releases.
    early_stopping_rounds=10,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    scale_pos_weight=1,  # imbalance is handled through sample weights; keep fixed
    tree_method='hist',  # faster histogram-based tree construction
    random_state=42
)

# Fit on the training split only. Fitting on the full X would include the
# validation rows, so the early-stopping metric would be measured on data the
# model has already seen.
model.fit(
    X_train, y_train,
    sample_weight=sample_weights,
    eval_set=[(X_val, y_val)],
    verbose=True
)

In [None]:
# Make sure the identifier is not fed to the model. X_test has normally been
# reduced to the training feature columns already, in which case 'ID' is gone
# and a strict drop would raise KeyError; errors='ignore' makes this cell safe
# to run (and re-run) either way. Reassigning instead of inplace=True avoids
# mutating a frame that was produced by a column selection.
X_test = X_test.drop(columns=['ID'], errors='ignore')

In [None]:
# Predict an encoded class for every test row.
y_test_pred = model.predict(X_test)

# Map the encoded predictions back to the original segment names.
y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# Attach the row-level predictions to a copy, leaving test_df untouched.
test_data1 = test_df.assign(pred_label=y_test_pred_labels)

In [None]:
def _most_frequent_label(labels):
    """Return the label that occurs most often in ``labels``."""
    return labels.value_counts().idxmax()


# Collapse the row-level predictions to one label per ID by majority vote.
submission = (
    test_data1
    .groupby("ID")["pred_label"]
    .agg(_most_frequent_label)
    .reset_index()
)

submission.columns = ["ID", "Segment"]

In [None]:
# Write the submission next to the notebook, the same way the input CSVs are
# read relative to the working directory. A hard-coded absolute path into one
# user's Downloads folder fails on any other machine.
SUBMISSION_PATH = 'xgboost_submit7.csv'
submission.to_csv(SUBMISSION_PATH, index=False)