In [1]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep.datasets import load_adult

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = load_adult(as_frame=True)
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop("income", axis=1, inplace=True)
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.income_label)

  with resources.path(


In [3]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_label
34348,27,Private,150080,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,0,0,40,United-States,0
11089,63,Private,286990,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,1
35216,43,Self-emp-inc,150533,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Other-relative,White,Male,0,0,50,United-States,1
16986,44,Private,63042,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Female,0,0,50,United-States,1
7691,30,Private,172748,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,0


In [9]:
# 将不同的数据进行分类处理
wide_cols = [
    "education",
    "relationship",
    "workclass",
    "occupation",
    "native-country",
    "gender",
]
crossed_cols = [("education", "occupation"), ("native-country", "occupation")]

cat_embed_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital-gain",
    "capital-loss",
    "native-country",
]
continuous_cols = ["age", "hours-per-week"]
target = "income_label"

In [11]:
target = df_train[target].values

KeyError: "None of [Index([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       ...\n       0, 0, 0, 0, 0, 0, 1, 0, 1, 0],\n      dtype='int32', length=39073)] are in the [columns]"

In [15]:
#判断grand truth的分布
print(target)
print("target的形状:", target.shape)
print("target中1的数量:", np.sum(target == 1))
print("target中0的数量:", np.sum(target == 0))

[0 0 0 ... 0 1 0]
target的形状: (39073,)
target中1的数量: 9349
target中0的数量: 29724


In [16]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = wide_preprocessor.fit_transform(df_train) #此处的转换要参考论文


In [19]:
print(X_wide)
print(len(X_wide))
print(X_wide.shape)


[[  1  17  23 ...  89  91 309]
 [  2  18  24 ...  90  92 310]
 [  3  17  23 ...  89  93 311]
 ...
 [  1  20  25 ...  89 102 324]
 [ 15  20  23 ...  89 170 315]
 [  5  19  23 ...  90 101 316]]
39073
(39073, 8)


WidePreprocessor(wide_cols=['education', 'relationship', 'workclass', 'occupation', 'native-country', 'gender'], crossed_cols=[('education', 'occupation'), ('native-country', 'occupation')])

In [20]:
tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols, continuous_cols=continuous_cols  # type: ignore[arg-type]
)
X_tab = tab_preprocessor.fit_transform(df_train)




In [22]:
print(X_tab)
print(tab_preprocessor.column_idx)
print(tab_preprocessor.cat_embed_input)


[[ 1  1  1 ...  1 32 40]
 [ 2  2  2 ...  1 49 20]
 [ 1  3  1 ...  1 32 38]
 ...
 [ 3  1  4 ...  1 51  8]
 [ 1 15  4 ...  1 47 48]
 [ 1  5  1 ...  1 33 40]]
{'workclass': 0, 'education': 1, 'marital-status': 2, 'occupation': 3, 'relationship': 4, 'race': 5, 'gender': 6, 'capital-gain': 7, 'capital-loss': 8, 'native-country': 9, 'age': 10, 'hours-per-week': 11}
[('workclass', 9, 5), ('education', 16, 8), ('marital-status', 7, 5), ('occupation', 15, 7), ('relationship', 6, 4), ('race', 5, 4), ('gender', 2, 2), ('capital-gain', 119, 23), ('capital-loss', 98, 21), ('native-country', 42, 13)]


In [23]:
# build the model
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deeptabular=tab_mlp)

In [24]:
# train and validate
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=5,
    batch_size=256,
)

epoch 1: 100%|██████████| 153/153 [02:44<00:00,  1.08s/it, loss=0.453, metrics={'acc': 0.7905}] 
epoch 2: 100%|██████████| 153/153 [02:05<00:00,  1.22it/s, loss=0.352, metrics={'acc': 0.8356}] 
epoch 3: 100%|██████████| 153/153 [02:34<00:00,  1.01s/it, loss=0.322, metrics={'acc': 0.8514}] 
epoch 4: 100%|██████████| 153/153 [02:12<00:00,  1.15it/s, loss=0.305, metrics={'acc': 0.8604}] 
epoch 5: 100%|██████████| 153/153 [02:08<00:00,  1.19it/s, loss=0.294, metrics={'acc': 0.8652}] 


In [25]:
# predict on test
X_wide_te = wide_preprocessor.transform(df_test)
X_tab_te = tab_preprocessor.transform(df_test)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

predict: 100%|██████████| 39/39 [02:17<00:00,  3.52s/it]  


In [26]:
# Option 1: this will also save training history and lr history if the
# LRHistory callback is used
trainer.save(path="model_weights", save_state_dict=True)

In [None]:
#预训练参数可以直接用来进行推理或者是微调
# From here in advance, Option 1 or 2 are the same. I assume the user has
# prepared the data and defined the new model components:
# 1. Build the model
model_new = WideDeep(wide=wide, deeptabular=tab_mlp)
model_new.load_state_dict(torch.load("model_weights/wd_model.pt"))

# 2. Instantiate the trainer
trainer_new = Trainer(model_new, objective="binary")

# 3. Either start the fit or directly predict
preds = trainer_new.predict(X_wide=X_wide, X_tab=X_tab, batch_size=32)