In [15]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
train_data = pd.read_csv('train07to10.csv')
train_data2 = pd.read_csv('train11to12.csv')
test_data = pd.read_csv('test07to10.csv')
test_data2 = pd.read_csv('test11to12.csv')

In [3]:
train_df = pd.concat([train_data, train_data2])
test_df = pd.concat([test_data, test_data2])

In [4]:
train_df.columns

Index(['기준년월', '_2순위카드이용금액', '이용카드수_신용체크', '마케팅동의여부', '이용카드수_신용', '수신거부여부_메일',
       '동의여부_한도증액안내', '이용카드수_체크', '보유여부_해외겸용_본인', '보유여부_해외겸용_신용_본인',
       ...
       '증감율_이용건수_일시불_분기', '잔액_신판ca평균한도소진율_r3m', '변동률_RV일시불평잔', '변동률_할부평잔',
       '증감율_이용건수_체크_분기', '변동률_일시불평잔', '증감율_이용건수_신판_분기', '증감율_이용건수_CA_분기',
       '잔액_신판ca최대한도소진율_r3m', '혜택수혜율_R3M'],
      dtype='object', length=431)

In [5]:
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# 타깃 라벨 인코딩
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

In [6]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])

In [7]:
X_test = X_test[X.columns]

In [8]:
X = X.fillna(0)
X_test = X_test.fillna(0)

In [9]:
# TabNet expects numpy arrays
X_train_np = X.values
X_test_np = X_test.values
y_train_np = y_encoded

In [10]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_np, y_train_np, test_size=0.2, stratify=y_train_np
)

In [16]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_sub, y_train_sub)

MemoryError: Unable to allocate 6.14 GiB for an array with shape (1920000, 429) and data type float64

In [11]:
# 클래스별 가중치 텐서
# 클래스 불균형 보정
class_counts = {
    'A': 972,
    'B': 144,
    'C': 127590,
    'D': 349242,
    'E': 1922052
}
reference = class_counts['E']
class_weights = {le_target.transform([k])[0]: reference / v for k, v in class_counts.items()}

sample_weights = np.array([class_weights[y] for y in y_train_np])
class_weights = torch.tensor([reference / class_counts[cls] for cls in le_target.classes_], dtype=torch.float)

# loss 함수 설정
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [12]:
# TabNet 학습
clf = TabNetClassifier(
    n_d=64,
    n_a=64,
    n_steps=5,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),
    scheduler_params={"step_size": 10, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',
    verbose=1,
    seed=42
)



In [13]:
def custom_f1_score(y_true, y_pred):
    y_pred_classes = y_pred.argmax(axis=1)  # 확률 → 클래스
    return f1_score(y_true, y_pred_classes, average='macro')

In [14]:
clf.fit(
    X_resampled, y_resampled,
    eval_set=[(X_val, y_val)],
    max_epochs=50,
    patience=10,
    eval_metric=['accuracy'],
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=4,
)

TypeError: issubclass() arg 1 must be a class