In [2]:
import pandas as pd

# Input locations for the competition data.
# NOTE(review): these are machine-specific absolute paths; adjust them when
# running the notebook on another machine.
TRAIN_PATH = r"C:\Users\eunse\OneDrive\바탕 화면\KUBIG 2025\프로젝트\open\train.csv"
TEST_PATH = r"C:\Users\eunse\OneDrive\바탕 화면\KUBIG 2025\프로젝트\open\test.csv"


def load_csv(file_path, label):
    """Read a CSV file, print a five-row preview and return the DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        label: Short name shown in the preview banner (e.g. "train.csv").

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Any other error raised while reading the file.

    A diagnostic message is printed before an error is re-raised. Re-raising
    (rather than swallowing the error) stops the notebook here instead of
    letting a later cell fail with a NameError on the missing DataFrame.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print("❌ 오류: 해당 경로에서 파일을 찾을 수 없습니다.")
        print(f"경로를 다시 확인해주세요: {file_path}")
        raise
    except Exception as e:
        print(f"❌ 파일을 불러오는 중 다른 오류가 발생했습니다: {e}")
        raise

    # Show the first five rows to confirm the load worked.
    print("✅ 파일을 성공적으로 불러왔습니다.")
    print(f"--- {label} 데이터 샘플 ---")
    print(df.head())
    return df


train_df = load_csv(TRAIN_PATH, "train.csv")
test_df = load_csv(TEST_PATH, "test.csv")

# The original cell left `file_path` bound to the test path; keep that name alive.
file_path = TEST_PATH


✅ 파일을 성공적으로 불러왔습니다.
--- train.csv 데이터 샘플 ---
            ID   age gender  tenure  frequent  payment_interval  \
0  TRAIN_00000  54.0      F    47.0      22.0               8.0   
1  TRAIN_00001  30.0      M    16.0      15.0               5.0   
2  TRAIN_00002  29.0      M     8.0      30.0              21.0   
3  TRAIN_00003  38.0      F    38.0      23.0              10.0   
4  TRAIN_00004  25.0      F    52.0       3.0              17.0   

  subscription_type  contract_length  after_interaction  support_needs  
0            member               90               25.0              0  
1               vip              360               23.0              0  
2              plus               30               21.0              0  
3               vip               90                6.0              0  
4            member               30                1.0              2  
✅ 파일을 성공적으로 불러왔습니다.
--- test.csv 데이터 샘플 ---
           ID   age gender  tenure  frequent  payment_interval  \
0  T

In [3]:
import os, random, time
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score# 0) 재현성을 위한 시드 고정 (optional)
# -----------------
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed passed to every generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# ================== ✅ 1. 심화 피처 엔지니어링 함수 정의 ==================
def feature_engineering_advanced(df, is_train=True, agg_maps=None):
    """Add group-relative features built from per-group mean/std statistics.

    For every grouping column (gender, subscription_type, contract_length) and
    every numeric column (age, tenure, frequent, after_interaction) two
    features are appended:

    * ``{col}_diff_from_{group}_mean`` -- value minus the group mean
    * ``{col}_norm_by_{group}``        -- that difference divided by
      (group std + epsilon)

    Args:
        df: Input frame containing the grouping and numeric columns.
        is_train: When True the group statistics are computed from ``df`` and
            returned; when False the statistics in ``agg_maps`` are applied.
        agg_maps: Dict ``{group_col: stats_frame}`` as returned by a previous
            ``is_train=True`` call. Required when ``is_train`` is False and
            ignored (recomputed) when ``is_train`` is True.

    Returns:
        ``(featured_df, agg_maps)`` when ``is_train`` is True, otherwise only
        ``featured_df``. The returned frame has a fresh RangeIndex because it
        is produced by ``pd.merge``.

    Raises:
        ValueError: If ``is_train`` is False and ``agg_maps`` is None.

    NaN handling: the generated features are filled with 0 in both modes. In
    training mode a group with a single row has an undefined (NaN) std, which
    previously leaked NaN into the ``*_norm_by_*`` columns while the
    validation/test path replaced the same situation with 0. For
    ``is_train=False`` every remaining NaN in the frame is also filled with
    0, as before.
    """
    if not is_train and agg_maps is None:
        raise ValueError("agg_maps must be provided when is_train=False")

    df_copy = df.copy()
    epsilon = 1e-6  # keeps the division finite when a group's std is 0

    # Categorical columns to group by, and numeric columns to summarise.
    group_cols = ["gender", "subscription_type", "contract_length"]
    agg_cols = ["age", "tenure", "frequent", "after_interaction"]

    if is_train:
        # Compute the per-group statistics from the training data and keep
        # them so the same values can be applied to validation/test data.
        agg_maps = {}
        for group_col in group_cols:
            agg_map = df_copy.groupby(group_col)[agg_cols].agg(['mean', 'std']).reset_index()
            # Flatten the two-level columns, e.g. ('age', 'mean') -> 'age_mean'.
            agg_map.columns = [f"{col[0]}_{col[1]}" if col[1] else col[0] for col in agg_map.columns]
            agg_maps[group_col] = agg_map

    generated_cols = []
    for group_col, agg_map in agg_maps.items():
        df_copy = pd.merge(df_copy, agg_map, on=group_col, how="left")
        for col in agg_cols:
            diff_col = f'{col}_diff_from_{group_col}_mean'
            norm_col = f'{col}_norm_by_{group_col}'
            df_copy[diff_col] = df_copy[col] - df_copy[f'{col}_mean']
            df_copy[norm_col] = (df_copy[col] - df_copy[f'{col}_mean']) / (df_copy[f'{col}_std'] + epsilon)
            generated_cols += [diff_col, norm_col]
            # The raw statistics are only helpers; drop them so the next
            # group's merge can reuse the same column names.
            df_copy = df_copy.drop(columns=[f'{col}_mean', f'{col}_std'])

    if is_train:
        # Same fill value the validation/test path uses for these features.
        df_copy[generated_cols] = df_copy[generated_cols].fillna(0)
        return df_copy, agg_maps

    # Groups unseen during training produce NaN after the left merge.
    df_copy = df_copy.fillna(0)
    return df_copy
# =========================================================================

# Runtime configuration shared by the cells below.
TARGET = "support_needs"
CAT_COLS = ["gender", "subscription_type"]
# (NUM_COLS is defined further down, once the engineered features exist.)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# 2) Split before anything else so validation rows cannot leak into the
#    statistics computed on the training rows.
# -----------------
tr_df, va_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df[TARGET],
    random_state=42,
)


# ================== 2. Apply the advanced feature engineering ==================
# Group statistics are computed on the training split and kept in `agg_maps` ...
tr_df_featured, agg_maps = feature_engineering_advanced(tr_df, is_train=True)

# ... and then applied unchanged to the validation split.
va_df_featured = feature_engineering_advanced(va_df, is_train=False, agg_maps=agg_maps)

# Numeric columns to scale: the raw ones plus every generated feature
# (generated names contain "diff" or "norm").
original_num_cols = ["age", "tenure", "frequent", "payment_interval", "contract_length", "after_interaction"]
new_num_cols = [
    name
    for name in tr_df_featured.columns
    if any(tag in name for tag in ("diff", "norm"))
]
NUM_COLS = original_num_cols + new_num_cols

# From here on the split frames refer to the feature-enriched versions.
tr_df, va_df = tr_df_featured, va_df_featured
# =========================================================================


# -----------------
# 3) 전처리기 클래스 (수정 없음)
# -----------------
class Preprocessor:
    """Scale numeric columns and one-hot encode categorical columns.

    Args:
        num_cols: Numeric column names. Defaults to the module-level
            ``NUM_COLS`` as it is bound when ``fit_transform_train`` runs.
        cat_cols: Categorical column names. Defaults to the module-level
            ``CAT_COLS`` at fit time.
        target: Name of the label column. Defaults to the module-level
            ``TARGET`` at fit time.

    The column lists are snapshotted when the preprocessor is fitted, so
    rebinding the globals afterwards (a later cell reassigns ``NUM_COLS``)
    cannot change which columns a fitted instance transforms.
    """

    def __init__(self, num_cols=None, cat_cols=None, target=None):
        self.scaler = StandardScaler()
        self.cat_cols_fit = None  # dummy-column names seen during fit
        # Explicit overrides; None means "use the globals at fit time".
        self._num_cols_arg = num_cols
        self._cat_cols_arg = cat_cols
        self._target_arg = target
        # Resolved at fit time.
        self.num_cols = None
        self.cat_cols = None
        self.target = None

    def fit_transform_train(self, df):
        """Fit the scaler and the dummy-column layout on ``df``.

        Returns:
            ``(X, y)`` where ``X`` is the scaled numeric block followed by the
            one-hot block, and ``y`` is the target column's values.
        """
        self.num_cols = list(NUM_COLS if self._num_cols_arg is None else self._num_cols_arg)
        self.cat_cols = list(CAT_COLS if self._cat_cols_arg is None else self._cat_cols_arg)
        self.target = TARGET if self._target_arg is None else self._target_arg

        Xnum = self.scaler.fit_transform(df[self.num_cols])
        Xcat = pd.get_dummies(df[self.cat_cols], drop_first=False)
        self.cat_cols_fit = Xcat.columns.tolist()
        X = np.hstack([Xnum, Xcat.values])
        y = df[self.target].values
        return X, y

    def transform_val_or_test(self, df, has_target=True):
        """Transform ``df`` using the statistics and columns learned at fit time.

        Args:
            df: Frame containing the fitted numeric/categorical columns.
            has_target: When False, ``y`` is returned as None.

        Returns:
            ``(X, y)`` with the same column layout as the training matrix.

        Raises:
            RuntimeError: If called before ``fit_transform_train``.
        """
        if self.cat_cols_fit is None:
            raise RuntimeError("Preprocessor must be fitted with fit_transform_train() first")

        Xnum = self.scaler.transform(df[self.num_cols])
        Xcat = pd.get_dummies(df[self.cat_cols], drop_first=False)
        # Align to the training layout: categories unseen here become
        # all-zero columns, categories unseen in training are dropped.
        Xcat = Xcat.reindex(columns=self.cat_cols_fit, fill_value=0)
        X = np.hstack([Xnum, Xcat.values])
        y = df[self.target].values if has_target else None
        return X, y

# Fit the preprocessor on the training split only, then reuse the fitted
# statistics for the validation split (has_target defaults to True).
prep = Preprocessor()
X_tr, y_tr = prep.fit_transform_train(df=tr_df)
X_va, y_va = prep.transform_val_or_test(df=va_df)


In [4]:
# 4) PyTorch Dataset / DataLoader (수정 없음)
# -----------------
class CSNDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional integer labels.

    Args:
        X: Array-like of features, converted to a float32 tensor.
        y: Optional array-like of labels, converted to a long tensor. When
            omitted, items are returned as the feature row alone.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        features = self.X[i]
        if self.y is not None:
            return features, self.y[i]
        return features

# Training batches are reshuffled every epoch; validation order is fixed.
train_loader = DataLoader(
    dataset=CSNDataset(X_tr, y_tr),
    batch_size=512,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=CSNDataset(X_va, y_va),
    batch_size=1024,
    shuffle=False,
)

# -----------------
# 5) Model definition
# -----------------
# Network input width and number of distinct labels in the training split.
input_dim = X_tr.shape[1]
num_classes = len(np.unique(y_tr))

class WideMLP(nn.Module):
    """MLP of equally wide Linear -> BatchNorm1d -> ReLU -> Dropout blocks.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output logits.
        hidden: Width of every hidden layer.
        layers: Number of hidden blocks. ``0`` yields a plain linear
            classifier on the raw inputs.
        pdrop: Dropout probability used after every block.

    Raises:
        ValueError: If ``layers`` is negative.
    """

    def __init__(self, input_dim, num_classes, hidden=512, layers=4, pdrop=0.2):
        super().__init__()
        if layers < 0:
            raise ValueError(f"layers must be >= 0, got {layers}")

        dims = [input_dim] + [hidden] * layers
        blocks = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            blocks += [
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(pdrop),
            ]
        self.backbone = nn.Sequential(*blocks)
        # Size the head from the backbone's real output width. It equals
        # `hidden` whenever layers >= 1 and `input_dim` when layers == 0
        # (previously the head was hard-wired to `hidden` and broke that case).
        self.head = nn.Linear(dims[-1], num_classes)

    def forward(self, x):
        """Return the class logits for a batch of feature rows ``x``."""
        return self.head(self.backbone(x))

model = WideMLP(
    input_dim=input_dim,
    num_classes=num_classes,
    hidden=512,
    layers=4,
    pdrop=0.2,
)
model = model.to(DEVICE)

# -----------------
# 6) Loss function, optimizer, scheduler
# -----------------
# Per-class weights: total samples / (number of classes * class count), so
# rarer classes receive a proportionally larger weight.
classes, counts = np.unique(y_tr, return_counts=True)
alpha = counts.sum() / (len(classes) * counts)
alpha = alpha.astype(np.float32)
alpha = torch.tensor(alpha, device=DEVICE)

class FocalLoss(nn.Module):
    """Multi-class focal loss: ``-alpha[t] * (1 - p_t)**gamma * log(p_t)``.

    Args:
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        alpha: Optional 1-D tensor of per-class weights indexed by the target
            class; ``None`` disables class weighting.
        reduction: ``"mean"`` (default), ``"sum"`` or ``"none"`` (per-sample
            losses).

    Raises:
        ValueError: If ``reduction`` is not one of the three supported values.
            Previously every value other than ``"mean"`` silently behaved
            like ``"sum"``.
    """

    _REDUCTIONS = ("mean", "sum", "none")

    def __init__(self, gamma=2.0, alpha=None, reduction="mean"):
        super().__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, logits, target):
        """Compute the loss.

        Args:
            logits: Tensor indexed as (batch, classes); softmax is taken over
                dim 1.
            target: Long tensor of class indices, one per batch row.

        Returns:
            A scalar tensor for ``"mean"``/``"sum"``, or a tensor with one
            loss per row for ``"none"``.
        """
        # One log-softmax pass; p_t is recovered from log p_t instead of
        # running a second softmax.
        log_prob = torch.log_softmax(logits, dim=1)
        log_pt = log_prob.gather(1, target.unsqueeze(1)).squeeze(1)
        pt = log_pt.exp()

        loss = -(1 - pt).pow(self.gamma) * log_pt
        if self.alpha is not None:
            # Move the weights next to the logits so indexing cannot fail on
            # a device mismatch.
            loss = self.alpha.to(logits.device)[target] * loss

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

# Class-weighted focal loss, AdamW, and a cosine schedule with warm restarts
# (first cycle 10 epochs, each following cycle twice as long).
criterion = FocalLoss(gamma=2.0, alpha=alpha, reduction="mean")
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-4,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
    eta_min=1e-6,
)

# -----------------
# 7) 조기 종료 (수정 없음)
# -----------------
class EarlyStopping:
    """Track a monitored value and flag when it has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``stop`` is set to True.
        mode: ``"min"`` if lower is better; any other value is treated as
            "higher is better".
        min_delta: Margin by which the value must beat the best so far to
            count as an improvement.

    Attributes:
        best: Best value seen so far (None before the first call).
        count: Current run of non-improving calls.
        stop: Set to True once ``count`` reaches ``patience``; never reset.
    """

    def __init__(self, patience=15, mode="min", min_delta=0.0):
        self.patience = patience
        self.mode = mode
        self.min_delta = min_delta
        self.best = None
        self.count = 0
        self.stop = False

    def __call__(self, current):
        # The first observation only initialises the baseline.
        if self.best is None:
            self.best = current
            return

        if self.mode == "min":
            improved = current < self.best - self.min_delta
        else:
            improved = current > self.best + self.min_delta

        if not improved:
            self.count += 1
            if self.count >= self.patience:
                self.stop = True
            return

        self.best = current
        self.count = 0

# Stop once the validation loss has failed to improve for 15 consecutive epochs.
early_stop = EarlyStopping(patience=15, mode="min", min_delta=0.0)

# -----------------
# 8) Training loop
# -----------------
# Trains for at most `epochs` epochs, evaluates on the validation loader after
# every epoch, keeps a CPU copy of the weights with the lowest validation
# loss, and restores those weights once the loop ends.
epochs = 150
best_val_loss = float("inf")
best_state = None

print("🔥 모델 학습을 시작합니다...")
for epoch in range(1, epochs+1):
    # ---- training pass ----
    model.train()
    tr_loss, tr_correct, tr_total = 0.0, 0, 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by
        # tr_total below gives a per-sample average (assumes `criterion`
        # returns a batch mean -- TODO confirm against its definition).
        tr_loss += loss.item() * len(xb)
        tr_correct += (out.argmax(1) == yb).sum().item()
        tr_total += len(xb)
        # The scheduler is stepped after every batch with a fractional epoch:
        # completed epochs plus the share of this epoch's samples seen so far.
        scheduler.step(epoch - 1 + (tr_total / len(train_loader.dataset)))

    tr_loss /= tr_total; tr_acc = tr_correct / tr_total

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    va_loss, va_total, va_correct = 0.0, 0, 0
    all_preds, all_true = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            out = model(xb)
            loss = criterion(out, yb)
            va_loss += loss.item() * len(xb)
            pred = out.argmax(1)
            va_correct += (pred == yb).sum().item()
            va_total += len(xb)
            # Collect predictions and labels on the CPU for the F1 computation.
            all_preds.append(pred.cpu().numpy()); all_true.append(yb.cpu().numpy())
    va_loss /= va_total; va_acc = va_correct / va_total
    y_true = np.concatenate(all_true); y_pred = np.concatenate(all_preds)
    # Macro F1 is only reported; checkpointing and early stopping below both
    # use the validation loss.
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    print(f"Epoch {epoch:03d} | TrainLoss {tr_loss:.4f} | TrainAcc {tr_acc:.4f} | "
          f"ValLoss {va_loss:.4f} | ValAcc {va_acc:.4f} | MacroF1 {macro_f1:.4f} | LR {optimizer.param_groups[0]['lr']:.2e}")

    if va_loss < best_val_loss:
        best_val_loss = va_loss
        # Detached CPU clones, so later optimizer steps cannot modify the snapshot.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    early_stop(va_loss)
    if early_stop.stop:
        print(f"> {epoch} Epoch에서 조기 종료. Best ValLoss: {best_val_loss:.4f}")
        break

# Restore the weights from the epoch with the lowest validation loss.
if best_state is not None:
    model.load_state_dict(best_state)


🔥 모델 학습을 시작합니다...
Epoch 001 | TrainLoss 0.5072 | TrainAcc 0.4181 | ValLoss 0.4656 | ValAcc 0.4576 | MacroF1 0.4404 | LR 2.93e-04
Epoch 002 | TrainLoss 0.4785 | TrainAcc 0.4385 | ValLoss 0.4668 | ValAcc 0.4606 | MacroF1 0.4404 | LR 2.71e-04
Epoch 003 | TrainLoss 0.4718 | TrainAcc 0.4424 | ValLoss 0.4631 | ValAcc 0.4718 | MacroF1 0.4457 | LR 2.38e-04
Epoch 004 | TrainLoss 0.4662 | TrainAcc 0.4567 | ValLoss 0.4636 | ValAcc 0.4614 | MacroF1 0.4503 | LR 1.97e-04
Epoch 005 | TrainLoss 0.4622 | TrainAcc 0.4606 | ValLoss 0.4579 | ValAcc 0.4689 | MacroF1 0.4591 | LR 1.50e-04
Epoch 006 | TrainLoss 0.4574 | TrainAcc 0.4665 | ValLoss 0.4581 | ValAcc 0.4770 | MacroF1 0.4605 | LR 1.04e-04
Epoch 007 | TrainLoss 0.4530 | TrainAcc 0.4688 | ValLoss 0.4553 | ValAcc 0.4749 | MacroF1 0.4612 | LR 6.26e-05
Epoch 008 | TrainLoss 0.4522 | TrainAcc 0.4725 | ValLoss 0.4554 | ValAcc 0.4783 | MacroF1 0.4613 | LR 2.96e-05
Epoch 009 | TrainLoss 0.4510 | TrainAcc 0.4738 | ValLoss 0.4555 | ValAcc 0.4765 | MacroF1 0.46

In [5]:
# 9) 전체 데이터로 재학습 (수정 없음)
# -----------------
# Set to False to skip the retrain and predict with the model selected on the
# validation split instead.
retrain_for_submit = True
if retrain_for_submit:
    print("\n🔥 전체 데이터로 모델을 다시 학습합니다...")
    # Recompute the group statistics on the full training data.
    full_train_featured, full_agg_maps = feature_engineering_advanced(train_df, is_train=True)

    # Rebuild NUM_COLS from the full-train feature set.
    full_new_num_cols = [col for col in full_train_featured.columns if 'diff' in col or 'norm' in col]
    NUM_COLS = original_num_cols + full_new_num_cols

    full_prep = Preprocessor()
    X_full, y_full = full_prep.fit_transform_train(full_train_featured)
    full_loader = DataLoader(CSNDataset(X_full, y_full), batch_size=512, shuffle=True)

    # Fresh model: the weights learned on the split are not reused.
    model = WideMLP(X_full.shape[1], num_classes, hidden=512, layers=4, pdrop=0.2).to(DEVICE)

    # Class weights derived from the full label vector itself, rather than
    # from the `classes` array left behind by the train-split cell.
    classes_full, counts_full = np.unique(y_full, return_counts=True)
    alpha_full = (counts_full.sum() / (len(classes_full) * counts_full)).astype(np.float32)
    alpha_full = torch.tensor(alpha_full, device=DEVICE)

    criterion = FocalLoss(gamma=2.0, alpha=alpha_full, reduction="mean")
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

    model.train()
    # Fixed epoch budget: there is no validation set left to early-stop on.
    warm_epochs = 30
    for e in range(1, warm_epochs+1):
        tr_loss, tr_correct, tr_total = 0.0, 0, 0
        for xb, yb in full_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            tr_loss += loss.item() * len(xb)
            tr_correct += (out.argmax(1) == yb).sum().item()
            tr_total += len(xb)
            # Per-batch scheduler step with a fractional epoch value.
            scheduler.step(e - 1 + (tr_total / len(full_loader.dataset)))
        print(f"[FullTrain] Epoch {e:02d} | Loss {tr_loss/tr_total:.4f} | Acc {tr_correct/tr_total:.4f}")

    # The prediction cell reads `prep`; point it at the full-data preprocessor.
    prep = full_prep
else:
    # Without a retrain, `model`, `prep` and `NUM_COLS` still describe the
    # train-split run. Reuse its group statistics so the prediction cell finds
    # `full_agg_maps` instead of failing with a NameError.
    full_agg_maps = agg_maps



🔥 전체 데이터로 모델을 다시 학습합니다...
[FullTrain] Epoch 01 | Loss 0.5072 | Acc 0.4104
[FullTrain] Epoch 02 | Loss 0.4822 | Acc 0.4318
[FullTrain] Epoch 03 | Loss 0.4732 | Acc 0.4385
[FullTrain] Epoch 04 | Loss 0.4649 | Acc 0.4540
[FullTrain] Epoch 05 | Loss 0.4590 | Acc 0.4590
[FullTrain] Epoch 06 | Loss 0.4570 | Acc 0.4652
[FullTrain] Epoch 07 | Loss 0.4522 | Acc 0.4711
[FullTrain] Epoch 08 | Loss 0.4511 | Acc 0.4708
[FullTrain] Epoch 09 | Loss 0.4494 | Acc 0.4751
[FullTrain] Epoch 10 | Loss 0.4502 | Acc 0.4728
[FullTrain] Epoch 11 | Loss 0.4536 | Acc 0.4692
[FullTrain] Epoch 12 | Loss 0.4489 | Acc 0.4794
[FullTrain] Epoch 13 | Loss 0.4468 | Acc 0.4835
[FullTrain] Epoch 14 | Loss 0.4427 | Acc 0.4850
[FullTrain] Epoch 15 | Loss 0.4414 | Acc 0.4870
[FullTrain] Epoch 16 | Loss 0.4401 | Acc 0.4938
[FullTrain] Epoch 17 | Loss 0.4377 | Acc 0.4950
[FullTrain] Epoch 18 | Loss 0.4376 | Acc 0.4977
[FullTrain] Epoch 19 | Loss 0.4362 | Acc 0.4971
[FullTrain] Epoch 20 | Loss 0.4355 | Acc 0.4971
[FullTrain] E

In [6]:
# 10) test.csv 예측 및 제출 파일 저장 (수정 없음)
# -----------------
print("\n🔥 test.csv 데이터로 예측을 수행하고 제출 파일을 생성합니다...")
# Apply the stored group statistics to the test data.
test_df_featured = feature_engineering_advanced(test_df, is_train=False, agg_maps=full_agg_maps)

id_col = "ID"
X_te, _ = prep.transform_val_or_test(test_df_featured, has_target=False)
test_loader = DataLoader(CSNDataset(X_te), batch_size=1024, shuffle=False)

# Predict batch by batch without gradient tracking and join the class indices.
model.eval()
with torch.no_grad():
    preds = np.concatenate(
        [model(xb.to(DEVICE)).argmax(1).cpu().numpy() for xb in test_loader]
    )

# IDs come from the untouched test frame; predictions are in loader order,
# which is unshuffled.
sub = pd.DataFrame({"ID": test_df[id_col], "support_needs": preds.astype(int)})

# A timestamped file name keeps earlier submissions from being overwritten.
stamp = time.strftime("%Y%m%d_%H%M%S")
out_path = "submission_" + stamp + ".csv"
sub.to_csv(out_path, index=False)
print(f"✅ 제출 파일 저장 완료: {out_path}")


🔥 test.csv 데이터로 예측을 수행하고 제출 파일을 생성합니다...
✅ 제출 파일 저장 완료: submission_20250818_165750.csv
