# **데이터 로드**

Train / Test 데이터를 로드합니다.


In [5]:
#------------------------------------------------
# 데이터 로드
#------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

plt.rcParams["figure.dpi"] = 120

TRAIN_CSV = "/content/drive/MyDrive/dataset/train.csv"
TEST_CSV = "/content/drive/MyDrive/dataset/test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")

display(train.head())
display(test.head())

train shape: (30858, 10)
test shape: (13225, 9)


Unnamed: 0,ID,age,gender,tenure,frequent,payment_interval,subscription_type,contract_length,after_interaction,support_needs
0,TRAIN_00000,54.0,F,47.0,22.0,8.0,member,90,25.0,0
1,TRAIN_00001,30.0,M,16.0,15.0,5.0,vip,360,23.0,0
2,TRAIN_00002,29.0,M,8.0,30.0,21.0,plus,30,21.0,0
3,TRAIN_00003,38.0,F,38.0,23.0,10.0,vip,90,6.0,0
4,TRAIN_00004,25.0,F,52.0,3.0,17.0,member,30,1.0,2


Unnamed: 0,ID,age,gender,tenure,frequent,payment_interval,subscription_type,contract_length,after_interaction
0,TEST_00000,18.0,M,40.0,6.0,15.0,member,30,18.0
1,TEST_00001,40.0,M,41.0,23.0,0.0,member,90,16.0
2,TEST_00002,59.0,F,30.0,1.0,21.0,member,360,25.0
3,TEST_00003,38.0,M,2.0,10.0,0.0,member,30,18.0
4,TEST_00004,30.0,M,28.0,21.0,20.0,member,360,28.0


Unnamed: 0_level_0,count
support_needs,Unnamed: 1_level_1
0,14297
1,8297
2,8264


# **간단한 데이터 확인 (EDA 생략)**
- 수치형 변수 기초통계량 표시
- 범주형 변수 유형, 개수 표시
- 타깃 변수 "support_needs" 개수 출력 : 불균형 존재
- 결측치 비율 확인

In [13]:
#------------------------------------------------
# 간단한 데이터 확인
#------------------------------------------------

# 수치형 변수 기초통계량 확인
print(f"\n>>기초통계량")
display(train.describe().T)

# 범주형 변수 확인
cat_cols = train[['subscription_type', 'contract_length', 'support_needs']]

for col in cat_cols:
  print(f"\n>>{col}")
  display(train[col].value_counts())

# 타깃 변수 개수 (불균형)

print(f"\n>>타깃 변수 개수")
display(train["support_needs"].value_counts().T)

# 결측치 확인
print(f"\n>>결측치 확인")
def missing_ratio(df):
  return (df.isna().mean()*100).round(2).rename("missing_ratio")

info = pd.DataFrame({
    "Datatype" : train.dtypes,
    "Unique" : train.nunique(),
    "Missing Ratio" : missing_ratio(train)
})

display(info.sort_values(by="Missing Ratio", ascending=False))
display(train.describe().T)


>>기초통계량


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,30858.0,39.428479,12.423977,18.0,29.0,39.0,48.0,65.0
tenure,30858.0,31.250275,17.403133,1.0,16.0,32.0,46.0,60.0
frequent,30858.0,15.803876,8.612354,1.0,9.0,16.0,23.0,30.0
payment_interval,30858.0,12.947145,8.302702,0.0,6.0,12.0,19.0,30.0
contract_length,30858.0,186.64301,143.948206,30.0,90.0,90.0,360.0,360.0
after_interaction,30858.0,14.546147,8.599893,1.0,7.0,14.0,22.0,30.0
support_needs,30858.0,0.804492,0.832419,0.0,0.0,1.0,2.0,2.0



>>subscription_type


Unnamed: 0_level_0,count
subscription_type,Unnamed: 1_level_1
plus,10481
vip,10405
member,9972



>>contract_length


Unnamed: 0_level_0,count
contract_length,Unnamed: 1_level_1
360,12419
90,12257
30,6182



>>support_needs


Unnamed: 0_level_0,count
support_needs,Unnamed: 1_level_1
0,14297
1,8297
2,8264



>>타깃 변수 개수


Unnamed: 0_level_0,count
support_needs,Unnamed: 1_level_1
0,14297
1,8297
2,8264



>>결측치 확인


Unnamed: 0,Datatype,Unique,Missing Ratio
ID,object,30858,0.0
age,float64,48,0.0
gender,object,2,0.0
tenure,float64,60,0.0
frequent,float64,30,0.0
payment_interval,float64,31,0.0
subscription_type,object,3,0.0
contract_length,int64,3,0.0
after_interaction,float64,30,0.0
support_needs,int64,3,0.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,30858.0,39.428479,12.423977,18.0,29.0,39.0,48.0,65.0
tenure,30858.0,31.250275,17.403133,1.0,16.0,32.0,46.0,60.0
frequent,30858.0,15.803876,8.612354,1.0,9.0,16.0,23.0,30.0
payment_interval,30858.0,12.947145,8.302702,0.0,6.0,12.0,19.0,30.0
contract_length,30858.0,186.64301,143.948206,30.0,90.0,90.0,360.0,360.0
after_interaction,30858.0,14.546147,8.599893,1.0,7.0,14.0,22.0,30.0
support_needs,30858.0,0.804492,0.832419,0.0,0.0,1.0,2.0,2.0


# **전처리 및 모델 정의**
### 전처리
* 수치형 변수 /범주형 변수 구분 ; 범주형 변수도 정수형인 경우가 있어 직접 열 선택하여 구분
* Yeo-Johnson 파워 변환, 2차 다항 변환
* 결측치 없으므로 따로 처리하지 않음

### Focal Loss 사용
* 데이터의 클래스 불균형을 다루기 위한 Focal Loss 사용

### MLP 모델 설계
* 3개의 Hidden Layer로 구성
* Dropout, Early Stopping 적용하여 과적합 방지
* Adam Optimizer 사용

In [None]:
#------------------------------------------------
# 전처리 + 모델 정의 및 학습 준비
#------------------------------------------------
import copy
import numpy as np
import pandas as pd
import torch.nn.functional as F

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import torch, torch.nn as nn, torch.optim as optim
from tqdm.auto import tqdm
from collections import Counter

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ----- (1) 전처리 정의 -----
num_cols = ['age', 'tenure', 'frequent', 'payment_interval', 'after_interaction']
cat_cols = ['gender', 'subscription_type', 'contract_length']

num_pipeline = Pipeline(steps=[
    ("yeo",  PowerTransformer(method="yeo-johnson", standardize=False)),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scal", StandardScaler())
])

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, min_frequency=0.01)

preprocess_tpl = ColumnTransformer(
    transformers=[("num", num_pipeline, num_cols), ("cat", ohe, cat_cols)],
    remainder="drop", sparse_threshold=0.0
)

# ----- (2) 데이터 분리 -----
X = train.drop(columns=['support_needs']).copy()
y = train['support_needs'].astype(int).copy()
classes_sorted = np.sort(y.unique())
n_classes = len(classes_sorted)

# ----- Focal Loss Class 정의 -----
class FocalLoss(nn.Module):
    """
    Multi-class Focal Loss with optional class-wise alpha (tensor) and label smoothing.
    - logits: (N, C)
    - target: (N,) long
    """
    def __init__(self, gamma: float = 2.0, alpha: torch.Tensor | float | None = None,
                 reduction: str = "mean", label_smoothing: float = 0.0):
        super().__init__()
        self.gamma = gamma
        # alpha: None | scalar | tensor(num_classes,)
        if isinstance(alpha, torch.Tensor):
            self.register_buffer("alpha", alpha)
        else:
            self.alpha = alpha  # scalar or None
        assert reduction in ("none", "mean", "sum")
        self.reduction = reduction
        self.label_smoothing = float(label_smoothing)

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # log-probs and probs
        log_probs = F.log_softmax(logits, dim=1)
        probs     = log_probs.exp()

        # base CE with optional label smoothing
        if self.label_smoothing > 0.0:
            n_classes = logits.size(1)
            with torch.no_grad():
                true_dist = torch.zeros_like(log_probs)
                true_dist.fill_(self.label_smoothing / n_classes)
                true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.label_smoothing)
            ce = -(true_dist * log_probs).sum(dim=1)
        else:
            # standard NLL for speed when no smoothing
            ce = F.nll_loss(log_probs, target, reduction="none"

        # p_t and focal factor on the true class
        pt = probs.gather(1, target.unsqueeze(1)).squeeze(1)
        focal_factor = (1.0 - pt).clamp(min=1e-6).pow(self.gamma)

        loss = focal_factor * ce

        # class-wise alpha
        if isinstance(self.alpha, torch.Tensor):
            alpha_t = self.alpha[target]
            loss = loss * alpha_t
        elif isinstance(self.alpha, (float, int)):
            loss = loss * float(self.alpha)

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

# ----- 하이퍼파라미터 ------

max_epochs = 200
batch_size = 256
lr = 1e-3
patience = 20
hidden_dims = (256, 256, 128)
dropout_ratio = 0.2
neg_slope = 0.01

# ----- (3) 모델 정의 -----
class MLP(nn.Module):
    def __init__(self, input_dim, n_classes, hidden_dims=hidden_dims, dropout_ratio=dropout_ratio, neg_slope=neg_slope):
        super().__init__()
        layers, prev = [], input_dim
        for h in hidden_dims:
            layers += [nn.Linear(prev, h), nn.GELU(), nn.LayerNorm(h), nn.Dropout(dropout_ratio)]
            prev = h
        layers += [nn.Linear(prev, n_classes)]
        self.net = nn.Sequential(*layers)
        # He initialization
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, a=neg_slope, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)

# ----- (4) CV + 앙상블 준비 -----
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_metrics = []
# OOF 저장용 (각 샘플에 대해 그 샘플을 보지 않은 폴드 모델의 로짓)
oof_logits = np.zeros((len(X), n_classes), dtype=np.float32)
# 앙상블용 폴드별 아티팩트 저장
fold_artifacts = []  # dict(state_dict, input_dim, preprocess)

def mc_dropout_logits(model, X_t, T=1):
    """MC Dropout 추론 도우미: T=1이면 일반 추론과 동일"""
    if T <= 1:
        model.eval()
        with torch.no_grad():
            return model(X_t).detach().cpu().numpy()
    outs = []
    with torch.no_grad():
        model.train()  # Dropout 활성
        for _ in range(T):
            outs.append(model(X_t).detach().cpu().numpy())
    return np.mean(outs, axis=0)

# **학습**
### 5 K-fold
* 매 Fold마다 랜덤으로 Train / Validation set 분리
* 5 folds
* Train/validation 비율 1:4

### SMOTE 적용
* 클래스 불균형을 완화하기 위해 적용
* 상대적으로 적은 1, 2 클래스를 오버샘플링

In [None]:
#------------------------------------------------
# 학습 (SMOTE 적용)
#------------------------------------------------
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    print(f"===== Fold {fold} =====")
    # --- 전처리 ---
    preprocess = clone(preprocess_tpl)
    X_tr_df, X_va_df = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr,   y_va     = y.iloc[tr_idx].to_numpy(), y.iloc[va_idx].to_numpy()

    preprocess.fit(X_tr_df)
    Xt_tr = preprocess.transform(X_tr_df)
    Xt_va = preprocess.transform(X_va_df)
    input_dim = Xt_tr.shape[1]

    # --- SMOTE 오버샘플링 (훈련 폴드에만) ---
    # 소수 클래스 샘플 수가 적을 때 k_neighbors를 자동으로 낮춤
    binc = np.bincount(y_tr)
    min_cnt = binc[binc > 0].min()

    if min_cnt < 2:
        # SMOTE가 불가(최소 2개 필요) → 임시로 RandomOverSampler 사용
        resampler = RandomOverSampler(sampling_strategy="auto", random_state=42)
    else:
        # k_neighbors < 소수 클래스 표본 수
        k_for_smote = min(5, max(1, min_cnt - 1))
        resampler = SMOTETomek(
            sampling_strategy="auto",
            smote=SMOTE(k_neighbors=k_for_smote, random_state=42),  # ★ 인스턴스로 전달
            random_state=42
            # tomek=TomekLinks(n_jobs=-1)  # 필요시 명시
        )

    Xt_tr_res, y_tr_res = resampler.fit_resample(Xt_tr, y_tr)

    print(" - 클래스 분포(전):", Counter(y_tr))
    print(" - 클래스 분포(후):", Counter(y_tr_res))

    # --- 텐서 ---
    Xtr_t = torch.tensor(Xt_tr_res, dtype=torch.float32).to(device)
    ytr_t = torch.tensor(y_tr_res,  dtype=torch.long).to(device)
    Xva_t = torch.tensor(Xt_va,     dtype=torch.float32).to(device)
    yva_t = torch.tensor(y_va,      dtype=torch.long).to(device)

    train_ds = torch.utils.data.TensorDataset(Xtr_t, ytr_t)
    train_ld = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)

    # (선택) 클래스 가중치는 SMOTE와 중복 보상될 수 있어 끄는 것을 권장
    # cw = compute_class_weight(class_weight='balanced', classes=classes_sorted, y=y_tr_res)
    # class_weights = torch.tensor(cw, dtype=torch.float32).to(device)

    # --- 모델/손실/옵티마이저 ---
    model = MLP(input_dim, n_classes, dropout_ratio=dropout_ratio).to(device)
    criterion = FocalLoss(
        gamma=2.0,
        alpha=None,
        label_smoothing=0.05,
        reduction="mean"
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    # --- Early Stopping ---
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in tqdm(range(1, max_epochs+1), desc=f"Fold {fold} Training"):
        model.train()
        running_loss = 0.0
        for xb, yb in train_ld:
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * xb.size(0)
        train_loss = running_loss / len(train_ds)

        # Validate (검증셋에는 SMOTE 미적용)
        model.eval()
        with torch.no_grad():
            val_logits = model(Xva_t)
            val_loss = criterion(val_logits, yva_t).item()

        # Early stopping
        if val_loss < best_val_loss - 1e-6:
            best_val_loss = val_loss
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"\n[Fold {fold}] Early stopping at epoch {epoch} (best val loss={best_val_loss:.4f})")
                break

    # --- 베스트 로드 & OOF 로짓 저장 ---
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        va_logits = model(Xva_t).detach().cpu().numpy()
    oof_logits[va_idx] = va_logits

    va_pred = va_logits.argmax(1)
    va_acc  = accuracy_score(y_va, va_pred)
    va_f1 = f1_score(y_va, va_pred, average="macro")
    fold_metrics.append({
        "fold": fold,
        "val_acc": va_acc,
        "val_loss": best_val_loss,
        "val_f1_macro": va_f1
    })

    # --- 폴드 아티팩트 저장(앙상블용) ---
    fold_artifacts.append({
        "state_dict": copy.deepcopy(model.state_dict()),
        "input_dim": input_dim,
        "preprocess": copy.deepcopy(preprocess)
    })


# ----- (5) OOF 성능 집계 -----
oof_pred = oof_logits.argmax(1)
oof_acc  = accuracy_score(y, oof_pred)
oof_f1   = f1_score(y, oof_pred, average="macro")

cv_acc   = np.mean([m["val_acc"] for m in fold_metrics])
cv_std   = np.std([m["val_acc"] for m in fold_metrics])

cv_f1    = np.mean([m["val_f1_macro"] for m in fold_metrics])
cv_f1_std= np.std([m["val_f1_macro"] for m in fold_metrics])

print(f"\nFold-wise Val Acc: {cv_acc:.4f} ± {cv_std:.4f}")
print(f"Fold-wise Val Macro F1: {cv_f1:.4f} ± {cv_f1_std:.4f}")

print(f"OOF Acc (one-model-per-fold): {oof_acc:.4f}")
print(f"OOF Macro F1 (one-model-per-fold): {oof_f1:.4f}")
print(fold_metrics)

# **성능 기록**
하이퍼파라미터, 활성함수 종류 등 속성을 변경하고 얻은 결과값을 저장, 가장 성능이 높은 조합 탐색

In [None]:
keys = ['max_epochs','batch_size','learning rate','patience','hidden_dims',
        'dropout_ratio','Val Acc','Val Std','Val Macro F1','Val Macro F1 Std']
vals = [max_epochs, batch_size, lr, patience, hidden_dims,
        dropout_ratio, cv_acc, cv_std, cv_f1, cv_f1_std]

score_dict = {k: [] for k in keys}

In [None]:
vals = [max_epochs, batch_size, lr, patience, hidden_dims,
        dropout_ratio, cv_acc, cv_std, cv_f1, cv_f1_std]
for k, v in zip(keys, vals):
    score_dict[k].append(v)

score_df = pd.DataFrame.from_dict(score_dict, orient='columns')
score_df.round(2)
display(score_df)

Unnamed: 0,max_epochs,batch_size,learning rate,patience,hidden_dims,dropout_ratio,Val Acc,Val Std,Val Macro F1,Val Macro F1 Std
0,200,256,0.001,20,"(256, 256, 128)",0.2,0.498477,0.005344,0.481419,0.003519
1,200,256,0.001,20,"(256, 256, 128)",0.2,0.498477,0.005344,0.481419,0.003519
2,200,256,0.001,20,"(256, 256, 128, 128)",0.2,0.498866,0.006756,0.484545,0.005455
3,200,256,0.001,20,"(512, 256, 128)",0.05,0.495982,0.009871,0.476826,0.002975
4,200,256,0.001,20,"(256, 256, 128)",0.05,0.493713,0.005663,0.47352,0.004245
5,200,256,0.001,20,"(256, 256, 128)",0.2,0.503889,0.009406,0.483823,0.003533
6,200,128,0.001,20,"(256, 256, 128)",0.2,0.504893,0.007447,0.480684,0.005039
7,200,512,0.001,20,"(256, 256, 128)",0.2,0.499935,0.00457,0.483105,0.004386
8,200,256,0.001,20,"(256, 256, 128)",0.5,0.50282,0.008846,0.483487,0.004016
9,200,256,0.003,20,"(256, 256, 128)",0.2,0.505509,0.007675,0.474959,0.007625


# **TEST 예측 및 제출 파일 생성**
각 폴드 전처리로 test를 변환 → 폴드별 모델로 로짓 → 평균


In [None]:
# ----- (6) test 앙상블 예측 -----
T_MC = 20

Xt_test_list = []
for art in fold_artifacts:
    Xt_te_fold = art["preprocess"].transform(test)
    Xt_test_list.append(torch.tensor(Xt_te_fold, dtype=torch.float32).to(device))

test_logits_list = []
for (art, Xte_t) in zip(fold_artifacts, Xt_test_list):
    # 폴드별 모델 인스턴스 생성 및 가중치 로드 (입력차원은 폴드마다 다를 수 있음)
    model = MLP(art["input_dim"], n_classes, dropout_ratio=dropout_ratio).to(device)
    model.load_state_dict(art["state_dict"])
    # 로짓 예측 (MC Dropout 옵션)
    logits_np = mc_dropout_logits(model, Xte_t, T=T_MC)
    test_logits_list.append(logits_np)

# 폴드 평균(필수) + MC Dropout 평균(선택) 결과
test_logits_mean = np.mean(test_logits_list, axis=0)
test_pred = test_logits_mean.argmax(1)

submission = pd.DataFrame({"ID": test["ID"], "support_needs": test_pred})
submission.to_csv("submission_ensemble.csv", index=False)
print("✅ submission_ensemble.csv 저장 완료 (K-Fold 로짓 평균 앙상블"
      + (f" + MC Dropout x{T_MC}" if T_MC>1 else "") + ")")

✅ submission_ensemble.csv 저장 완료 (K-Fold 로짓 평균 앙상블 + MC Dropout x20)
