In [1]:
import os
import wget
from pathlib import Path
# Source URL of the raw dataset.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Dataset name; also used as the local file stem.
dataset_name = 'census-income'
# Local target path: <current working directory>/data/<dataset_name>.csv
out = Path(f"{os.getcwd()}/data/{dataset_name}.csv")
# Make sure the data directory exists before anything is written into it.
out.parent.mkdir(parents = True, exist_ok = True)
# Download only when there is no local copy yet.
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

File already exists.


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load the data.
# NOTE(review): the column names printed by this cell ('39', ' State-gov', ...)
# show that the first data record is being used as the header row, which is why
# the target column is literally named ' <=50K'. Downstream cells depend on
# that name, so it is kept as-is here.
train = pd.read_csv(out)
target = ' <=50K'

# Seed for the train/valid/test assignment so the split is reproducible
# across kernel restarts.
SPLIT_SEED = 0

# Assign every row to train/valid/test (80/10/10) unless a "Set" column
# already exists.
if "Set" not in train.columns:
    split_rng = np.random.default_rng(SPLIT_SEED)
    train["Set"] = split_rng.choice(["train", "valid", "test"], p = [.8, .1, .1], size = (train.shape[0], ))

# Row labels per split. These are computed BEFORE the encoding loop below,
# because that loop label-encodes the "Set" column as well.
train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index
test_indices  = train[train.Set == "test" ].index

# Number of unique values per column.
nunique = train.nunique()
# dtype per column.
types = train.dtypes

categorical_columns = []
categorical_dims =  {}
for col in train.columns:
    # Treat a column as categorical when it holds objects (strings) or has
    # fewer than 200 distinct values: fill missing values with the
    # "VV_likely" placeholder and label-encode it. This applies to every
    # column, including the target and "Set".
    if types[col] == 'object' or nunique[col] < 200:
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
        print(col, train[col].nunique(), l_enc.classes_)
    # Otherwise impute missing values with the mean of the training rows.
    # Only this column is filled; calling fillna on the whole frame would
    # push this column's mean into the NaNs of every other column.
    else:
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

39 73 [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
 90]
 State-gov 9 [' ?' ' Federal-gov' ' Local-gov' ' Never-worked' ' Private'
 ' Self-emp-inc' ' Self-emp-not-inc' ' State-gov' ' Without-pay']
 Bachelors 16 [' 10th' ' 11th' ' 12th' ' 1st-4th' ' 5th-6th' ' 7th-8th' ' 9th'
 ' Assoc-acdm' ' Assoc-voc' ' Bachelors' ' Doctorate' ' HS-grad'
 ' Masters' ' Preschool' ' Prof-school' ' Some-college']
 13 16 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
 Never-married 7 [' Divorced' ' Married-AF-spouse' ' Married-civ-spouse'
 ' Married-spouse-absent' ' Never-married' ' Separated' ' Widowed']
 Adm-clerical 15 [' ?' ' Adm-clerical' ' Armed-Forces' ' Craft-repair' ' Exec-managerial'
 ' Farming-fishing' ' Handlers-cleaners' ' Machine-op-inspct'
 ' Other-service' ' Priv-house-serv' ' Prof-specialty' ' Protective-serv'
 ' Sa

In [3]:
# Replace the label-encoded target codes with readable class names.
# A single replace() maps both codes at once and returns a new column, instead
# of writing strings into an integer column through .loc (which relies on
# silent dtype upcasting). Re-running the cell is a no-op because the codes
# 0/1 are no longer present afterwards.
train[target] = train[target].replace({0: "not_wealthy", 1: "wealthy"})

In [15]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
# Input features: every column except the split marker and the target.
unused_feat = ['Set']
features = [name for name in train.columns if name != target and name not in unused_feat]

# Categorical features: position inside `features` and number of classes,
# collected in a single pass so the two lists stay aligned.
cat_positions = [(pos, name) for pos, name in enumerate(features) if name in categorical_columns]
cat_idxs = [pos for pos, _ in cat_positions]
cat_dims = [categorical_dims[name] for _, name in cat_positions]

# TabNet embedding / optimisation settings.
tabnet_params = dict(
    cat_idxs = cat_idxs,
    cat_dims = cat_dims,
    cat_emb_dim = 1,
    optimizer_fn = torch.optim.Adam,
    optimizer_params = dict(lr = 2e-2),
    # Learning-rate scheduler settings passed to StepLR.
    scheduler_params = dict(step_size = 50, gamma = 0.9),
    scheduler_fn = torch.optim.lr_scheduler.StepLR,
    mask_type = 'entmax',  # alternative: "sparsemax"
)
clf = TabNetClassifier(**tabnet_params)

Device used : cpu


In [16]:
# Materialise the feature matrix and target vector once, then slice per split.
# The split indices are row labels; they are used positionally here, which
# assumes `train` still has its default 0..n-1 index — TODO confirm if the
# frame is ever re-indexed upstream.
feature_matrix = train[features].values
target_values = train[target].values

# Training data
X_train = feature_matrix[train_indices]
y_train = target_values[train_indices]
# Validation data
X_valid = feature_matrix[valid_indices]
y_valid = target_values[valid_indices]
# Test data
X_test = feature_matrix[test_indices]
y_test = target_values[test_indices]

# Number of epochs: shortened to 2 when the CI environment variable is set.
max_epochs = 100 if not os.getenv("CI", False) else 2

# Train twice with the same classifier object and record the validation AUC
# history of each run (per the original note: warm start = False).
save_history = []
for _ in range(2):
    clf.fit(
            X_train = X_train, y_train = y_train,
            eval_set = [(X_train, y_train), (X_valid, y_valid)], eval_name = ['train', 'valid'], eval_metric = ['auc'],
            max_epochs = max_epochs, patience = 20,
            batch_size = 1024, virtual_batch_size = 128,
            num_workers = 0, weights = 1, drop_last = False
            )
    save_history.append(clf.history["valid_auc"])

# Both runs must yield exactly the same validation AUC history.
# np.array_equal also returns False (instead of raising or broadcasting) when
# the two histories have different lengths, e.g. if early stopping differs.
assert np.array_equal(save_history[0], save_history[1]), "valid_auc history differs between the two training runs"

epoch 0  | loss: 0.66866 | train_auc: 0.73637 | valid_auc: 0.7241  |  0:00:03s
epoch 1  | loss: 0.51214 | train_auc: 0.82026 | valid_auc: 0.81876 |  0:00:06s
epoch 2  | loss: 0.44143 | train_auc: 0.85685 | valid_auc: 0.85582 |  0:00:10s
epoch 3  | loss: 0.41206 | train_auc: 0.88625 | valid_auc: 0.88699 |  0:00:12s
epoch 4  | loss: 0.3921  | train_auc: 0.90057 | valid_auc: 0.8991  |  0:00:15s
epoch 5  | loss: 0.38398 | train_auc: 0.90618 | valid_auc: 0.90566 |  0:00:18s
epoch 6  | loss: 0.38255 | train_auc: 0.91013 | valid_auc: 0.91065 |  0:00:20s
epoch 7  | loss: 0.37016 | train_auc: 0.91536 | valid_auc: 0.91569 |  0:00:23s
epoch 8  | loss: 0.36502 | train_auc: 0.91576 | valid_auc: 0.91524 |  0:00:25s
epoch 9  | loss: 0.3585  | train_auc: 0.91998 | valid_auc: 0.9203  |  0:00:29s
epoch 10 | loss: 0.35864 | train_auc: 0.92316 | valid_auc: 0.92419 |  0:00:32s
epoch 11 | loss: 0.35099 | train_auc: 0.92182 | valid_auc: 0.92011 |  0:00:34s
epoch 12 | loss: 0.35332 | train_auc: 0.92525 | vali

KeyboardInterrupt: 