# 三+四、模型调优-神经网络的实现及训练过程的优化

In [47]:
!pip install pytorch-tabnet



In [48]:
import numpy as np
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

In [49]:
def reduce_mem_usage(df):
    '''
    遍历DataFrame的所有列并修改它们的数据类型以减少内存使用
    :param df: 需要处理的数据集
    :return:
    '''
    start_mem = df.memory_usage().sum() / 1024 ** 2  # 记录原数据的内存大小
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:  # 这里只过滤了object格式，如果代码中还包含其他类型，要一并过滤
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':  # 如果是int类型的话,不管是int64还是int32,都加入判断
                # 依次尝试转化成in8,in16,in32,in64类型,如果数据大小没溢出,那么转化
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:  # 不是整形的话,那就是浮点型
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:  # 如果不是数值型的话,转化成category类型
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2    # 看一下转化后的数据的内存大小
    print('Memory usage after optimization is {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))  # 看一下压缩比例
    return df

In [96]:
train_data = reduce_mem_usage(pd.read_csv('/content/drive/My Drive/final-ml/train_final.csv'))
test_data = reduce_mem_usage(pd.read_csv('/content/drive/My Drive/final-ml/test_final.csv'))

Memory usage of dataframe is 55.69 MB
Memory usage after optimization is 8.11 MB
Decreased by 85.4%
Memory usage of dataframe is 55.69 MB
Memory usage after optimization is 8.11 MB
Decreased by 85.4%


In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [97]:
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

## a）TabNet的实现

In [98]:
# 准备数据
DL_train, DL_test = train_data.copy(), test_data.copy()
X_train, Y_train = DL_train.drop(columns='loan_status').values, DL_train['loan_status'].values.astype(int)
X_test, Y_test = DL_test.drop(columns='loan_status').values, DL_test['loan_status'].values.astype(int)

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_eval, Y_train, Y_eval = train_test_split(X_train, Y_train, test_size=0.3, random_state=4)

In [None]:
# 构建模型
tn = TabNetClassifier()
tn.fit(X_train, Y_train, 
       eval_set=[(X_eval, Y_eval)], 
      eval_metric = ['logloss'])
preds = tn.predict(X_test)

Device used : cpu
epoch 0  | loss: 0.43578 | val_0_logloss: 2.71898 |  0:00:02s
epoch 1  | loss: 0.24606 | val_0_logloss: 1.04697 |  0:00:04s
epoch 2  | loss: 0.23373 | val_0_logloss: 1.33253 |  0:00:06s
epoch 3  | loss: 0.22719 | val_0_logloss: 0.93691 |  0:00:08s
epoch 4  | loss: 0.22218 | val_0_logloss: 0.60842 |  0:00:11s
epoch 5  | loss: 0.21623 | val_0_logloss: 0.40839 |  0:00:13s
epoch 6  | loss: 0.21465 | val_0_logloss: 0.30493 |  0:00:15s
epoch 7  | loss: 0.21237 | val_0_logloss: 0.24692 |  0:00:17s
epoch 8  | loss: 0.21144 | val_0_logloss: 0.23349 |  0:00:20s
epoch 9  | loss: 0.20993 | val_0_logloss: 0.21921 |  0:00:22s
epoch 10 | loss: 0.20732 | val_0_logloss: 0.21032 |  0:00:24s
epoch 11 | loss: 0.20827 | val_0_logloss: 0.21315 |  0:00:26s
epoch 12 | loss: 0.20504 | val_0_logloss: 0.21277 |  0:00:28s
epoch 13 | loss: 0.20515 | val_0_logloss: 0.21061 |  0:00:30s
epoch 14 | loss: 0.20577 | val_0_logloss: 0.21539 |  0:00:33s
epoch 15 | loss: 0.20461 | val_0_logloss: 0.20968 | 

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, preds)

0.91602

### TabNet + tricks

#### 半监督预训练

In [None]:
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer

tn_pre = TabNetPretrainer(
        optimizer_fn = torch.optim.Adam,
        optimizer_params = dict(lr=2e-2),
        mask_type='entmax'
)

tn_pre.fit(
    X_train = X_train,
    eval_set = [X_eval],
    pretraining_ratio=0.8
)

tn1 = TabNetClassifier(
        optimizer_fn=torch.optim.Adadelta,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={'step_size':10, 'gamma': 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax'
)

tn1.fit(
    X_train=X_train, y_train=Y_train,
    eval_set=[(X_eval, Y_eval)],
    eval_metric=['logloss'],
    from_unsupervised=tn_pre,
    max_epochs=200
)

In [23]:
from sklearn.metrics import accuracy_score
preds = tn1.predict(X_test)
accuracy_score(Y_test, preds)

0.80452

##### fine-tune阶段选择不同的优化器，max_epochs=100， init_lr=2e-2 情形下，分别的准确率
- Adam：0.91512
- SGD：0.80452
- Adadelta：0.80452
- 初步结论：Adam能够保证适当速度收敛，而SGD和Adadelta实在太慢。但当使用Adam得到最佳准确率时，并pretrain和finetune均未训练满100个epoch就停下了，所以再增加epochs也无法进一步提升准确率。故考虑改变预训练比例

In [44]:
def TabNet_trick(op, pretrain_ratio, pretrain_epoch, finetune_epoch, finetune_lr):
  tn_pre = TabNetPretrainer(
        optimizer_fn = torch.optim.Adam,
        optimizer_params = dict(lr=2e-2),
        mask_type='entmax'
        )

  tn_pre.fit(
      X_train = X_train,
      eval_set = [X_eval],
      pretraining_ratio=pretrain_ratio,
      max_epochs=pretrain_epoch
      )
  
  if op == 'Adam':
    opt = torch.optim.Adam
  elif op == 'SGD':
    opt = torch.optim.SGD
  elif op == 'Adadelta':
    opt = torch.optim.Adadelta


  tn1 = TabNetClassifier(
          optimizer_fn=opt,
          optimizer_params=dict(lr=finetune_lr),
          scheduler_params={'step_size':10, 'gamma': 0.9},
          scheduler_fn=torch.optim.lr_scheduler.StepLR,
          mask_type='sparsemax'
          )

  tn1.fit(
      X_train=X_train, y_train=Y_train,
      eval_set=[(X_eval, Y_eval)],
      eval_metric=['logloss'],
      from_unsupervised=tn_pre,
      max_epochs=finetune_epoch
      )
  
  preds = tn1.predict(X_test)
  acc = accuracy_score(Y_test, preds)
  
  print('Using {0}, pretrain max_epochs: {1}, pretrain ratio: {2}, finetune max epochs: {3}, the model on Test set obtains accuracy of {4}'\
        .format(op, pretrain_epoch, pretrain_ratio, finetune_epoch, acc))

In [41]:
(0.91684 - 0.91602) / 0.91602

0.0008951769612017674

当pretrain_epoch, finetune_epoch都为100，pretrain_ratio为0.9，优化器为Adam时，模型的准确率在0.91684，比最初结果（0.91602）提高0.09%.

## b) DNN



In [61]:
from tensorflow.keras import layers
from tensorflow.keras import models
import tensorflow.keras as K

init = K.initializers.glorot_uniform(seed=1)
inputs = layers.Input(shape = (145,))
x = layers.Dense(200, kernel_initializer = init, activation='relu')(inputs)
x = layers.Dense(100, kernel_initializer = init, activation='relu')(x)
x = layers.Dense(50, kernel_initializer = init, activation='relu')(x)
outputs = layers.Dense(2, kernel_initializer = init, activation='softmax')(x)
model = models.Model(inputs, outputs)

In [75]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
Y_test = to_categorical(Y_test, 2).astype(int)
Y_train = to_categorical(Y_train, 2).astype(int)

model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(), metrics=['accuracy'])

In [76]:
bs, epo = 256, 100
model.fit(X_train, Y_train, batch_size = bs, epochs = epo, shuffle=True, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f334e2d5210>

In [77]:
res = model.evaluate(X_test, Y_test, verbose=0)
print('Evaluation on test data: loss = %0.6f, accuracy = %0.2f%% \n' % (res[0], res[1]*100))

Evaluation on test data: loss = 0.216902, accuracy = 91.25% 



## c) 模型集成

将DNN模型与前一章节的三个ML模型进行集成，比较准确率是否有提升

In [110]:
# ML model
from sklearn.ensemble import VotingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from tensorflow import losses
from tensorflow.keras import Sequential

xgb_bst = XGBClassifier(max_depth = 3,
                     learning_rate = 0.1,
                     n_estimators = 202,
                     silent=False,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma = 7,
                     min_child_weight=5,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     reg_lambda = 0.2,
                     seed=7)

rf_bst = RF(criterion = 'gini', 
               max_depth = 13, 
               min_samples_split = 70, 
               n_estimators = 70,
           max_features = 'sqrt', random_state = 10)

gbdt_bst = GBDT(n_estimators=51,
            learning_rate = 0.1,
            max_depth = 1,
             min_samples_leaf = 3,
             subsample=0.5,
            n_iter_no_change = 500,
            validation_fraction=0.7,
            random_state = 0)

# DNN model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
def DNN_model():
  model = Sequential([
    layers.Dense(200, activation='relu', input_shape=(145,)),
    layers.Dense(100, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(2, activation='softmax')
  ])
  model.compile(optimizer=optimizers.SGD(), loss=losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
  return model

dnn = KerasClassifier(build_fn = DNN_model, epochs = 100, verbose=0)
dnn._estimator_type = "classifier"

In [111]:
hd_vote = VotingClassifier(estimators=[   ('rf', rf_bst), 
                                          ('gbdt', gbdt_bst), 
                                          ('xgb', xgb_bst), 
                                          ('dnn', dnn)], 
                                          voting='hard')

sf_vote = VotingClassifier(estimators=[   ('rf', rf_bst), 
                                          ('gbdt', gbdt_bst), 
                                          ('xgb', xgb_bst), 
                                          ('dnn', dnn)], 
                                          voting='soft', 
                                          weights = [2,1,3, 3])

In [105]:
Y_train = train_data['loan_status'].values.astype(int)
Y_test = test_data['loan_status'].values.astype(int)


In [112]:
hd_vote.fit(X_train, Y_train)
sf_vote.fit(X_train, Y_train)
hd_preds = hd_vote.predict(X_test)
sf_preds = sf_vote.predict(X_test)
print('hard vote, acc: %0.2f%%' % (accuracy_score(Y_test, hd_preds) * 100))
print('soft vote, acc: %0.2f%%' % (accuracy_score(Y_test, sf_preds) * 100))




hard vote, acc: 91.50%
soft vote, acc: 19.55%


#### 本章小结
- 使用了TabNet对结构化数据进行建模，并通过半监督预训练、Adam优化提高了模型的准确率
- 搭建DNN对结构化数据进行建模，但结果未超过baseline
- 将DNN和上一章节的三个机器学习模型进行投票选举融合，得到模型的准确率为91.50%，相比单一模型均有提升，但soft vote结果较差。