In [2]:
import pandas as pd
import numpy as np
import hyperopt
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
import catboost as cab
from catboost import CatBoostClassifier, Pool, cv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from category_encoders.target_encoder import TargetEncoder

import warnings
warnings.filterwarnings('ignore') # 关闭警告
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [3]:

# 导入数据
train_data = pd.read_csv("./dataset/train.csv")
test_data = pd.read_csv("./dataset/test_A.csv")
drop_index = ((train_data.isnull()) + (train_data == "?")).mean(axis=1).sort_values(ascending=False).head(100).index
train_data = train_data.drop(index = drop_index).reset_index().drop(columns=['index'])

# 数据备份
df_train = train_data.copy()
df_test = test_data.copy()

# 将所有“？”值设置为np.nan
df1_train = pd.DataFrame(np.where(df_train == "?", np.nan, df_train), columns = df_train.columns)
df1_test = pd.DataFrame(np.where(df_test == "?", np.nan, df_test), columns = df_test.columns)

# 所有类别特征名称列表
cat_fea_names = ['MON_12_CUST_CNT_PTY_ID', 'WTHR_OPN_ONL_ICO', 'LGP_HLD_CARD_LVL', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']

# 将类别特征中缺失的值都设为"oth"
for name in cat_fea_names: 
    df1_train[name][df1_train[name].isnull()] = "oth"
    df1_test[name][df1_test[name].isnull()] = 'oth'

# 缺失值过多的需要删除的特征名称
drop_fea_list = ["AGN_CUR_YEAR_WAG_AMT", "AGN_CUR_YEAR_AMT", "AGN_CNT_RCT_12_MON", "CUR_YEAR_PUB_TO_PRV_TRX_PTY_CNT", "AGN_AGR_LATEST_AGN_AMT"]

# 删除特征后的数据
df2_train = df1_train.drop(columns = drop_fea_list)
df2_test = df1_test.drop(columns = drop_fea_list)

# 剩下的类别特征
cat_fea_names = list(set(cat_fea_names) - set(drop_fea_list))

# 切分特征和标签
X_train, y_train = df1_train.drop(columns = ["LABEL", "CUST_UID"]), df1_train.LABEL
X_test = df1_test.drop(columns = ["CUST_UID"])

for i in X_train.columns:
    if i not in cat_fea_names:
        X_train[i] = X_train[i].astype("float")
        X_test[i] = X_test[i].astype("float")
y_train = y_train.astype("int")

FileNotFoundError: [Errno 2] No such file or directory: './dataset/train.csv'

In [8]:
# 中位数填补连续特征值
def median_impute(X_train, y_train, X_test, cat_fea_names):
    continue_fea_names = list(set(X_train.columns) - set(cat_fea_names))
    X_train_cat, X_test_cat = X_train.loc[:,cat_fea_names], X_test.loc[:,cat_fea_names]
    impute = SimpleImputer(missing_values=np.nan, strategy='median')
    min_max = MinMaxScaler()
    X_train_cont = pd.DataFrame(impute.fit_transform(X_train.loc[:,continue_fea_names]), columns=continue_fea_names)
    X_test_cont = pd.DataFrame(impute.fit_transform(X_test.loc[:,continue_fea_names]), columns=continue_fea_names)
    X_train, X_test = pd.concat([X_train_cont, X_train_cat], axis=1), pd.concat([X_test_cont, X_test_cat], axis=1)
    
    for i in X_train.columns: # 转换数据类型，保证送入模型前的正确性
        if i not in cat_fea_names: # 对连续型特征设为float型
            X_train[i] = X_train[i].astype("float")
            X_test[i] = X_test[i].astype("float")
    y_train = y_train.astype("int")
    return X_train, X_test
X_train, X_test = median_impute(X_train, y_train, X_test, cat_fea_names)

# TS编码
encoder = TargetEncoder(cols=cat_fea_names, 
                        handle_unknown='value',  
                        handle_missing='value').fit(X_train,y_train) # 在训练集上训练
X_train = encoder.transform(X_train) # 转换训练集
X_test = encoder.transform(X_test) # 转换测试集
    
# 创建模型输入格式数据
X_train_dict, X_test_dict = {}, {}
for i in X_train.columns:
    X_train_dict[i] = np.array(X_train[i])
    X_test_dict[i] = np.array(X_test[i])
y_train = np.array(y_train)

In [9]:
# 创建输入参数和数据
continue_fea_names = list(set(X_train.columns) - set(cat_fea_names))
feature_columns = [DenseFeat(name) for name in continue_fea_names]
feature_columns.extend([SparseFeat(name='LGP_HLD_CARD_LVL', vocabulary_size=7, embedding_dim=2),
SparseFeat(name='WTHR_OPN_ONL_ICO', vocabulary_size=3, embedding_dim=2),
SparseFeat(name='NB_CTC_HLD_IDV_AIO_CARD_SITU', vocabulary_size=7, embedding_dim=2),
SparseFeat(name='MON_12_CUST_CNT_PTY_ID', vocabulary_size=2, embedding_dim=2)])

In [10]:
X_train.shape

(39900, 49)

In [47]:
earlyStopping = EarlyStopping(monitor='val_logloss', patience=200, verbose=1, mode='max')
modelCheckpoint = ModelCheckpoint(filepath="./trained_dcn_fintech", monitor='val_logloss', save_best_only=True, mode = 'max', verbose=1)

# model_dcn = DCN(linear_feature_columns=feature_columns, dnn_feature_columns=feature_columns, \
#            cross_num=2, cross_parameterization='vector', dnn_hidden_units=(64, 64, 32, 16), l2_reg_linear=0.00001, \
#             l2_reg_embedding=0.01, l2_reg_cross=0.01, l2_reg_dnn=0.01, init_std=0.001, seed=1024, \
#             dnn_dropout=0, dnn_activation='relu', dnn_use_bn=True, task='binary', device="cuda:0", gpus=None)

# model_dcn.compile('adam', 'binary_crossentropy', metrics=['auc'])

# model_dcn = model_dcn.fit(X_train_dict, y_train, batch_size=256, epochs=1000, verbose=2, initial_epoch=0, validation_split=0.2,\
#          validation_data=None, shuffle=True, callbacks = [earlyStopping, modelCheckpoint])


model_deepfm = DeepFM(linear_feature_columns=feature_columns, dnn_feature_columns=feature_columns, use_fm=True, \
            dnn_hidden_units=(64, 64, 32, 16), l2_reg_linear=0.00001, l2_reg_embedding=0.01, l2_reg_dnn=0.01, init_std=0.001, 
            seed=1024, dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False, task='binary', device="cuda:0", gpus=None)

model_deepfm.compile('adam', 'binary_crossentropy', metrics=['auc', 'logloss'])

model_deepfm_history = model_deepfm.fit(X_train_dict, y_train, batch_size=256, epochs=1, verbose=2, initial_epoch=0, validation_split=0.2,\
         validation_data=None, shuffle=True, callbacks = [earlyStopping, modelCheckpoint])

res = model_deepfm.evaluate(X_train_dict, y_train, batch_size=256)

cuda:0
Train on 31920 samples, validate on 7980 samples, 125 steps per epoch
Epoch 1/1
2s - loss:  0.5498 - auc:  0.6653 - logloss:  0.5496 - val_auc:  0.6767 - val_logloss:  0.5299
Epoch 00001: val_logloss improved from -inf to 0.52994, saving model to ./trained_dcn_fintech


In [43]:
res

{'auc': 0.679141918652987, 'logloss': 0.5323812632055053}

In [19]:
import hyperopt
def object_model(params):
    earlyStopping = EarlyStopping(monitor='val_auc', patience=20, verbose=params['verbose'], mode='max')
    modelCheckpoint = ModelCheckpoint(filepath="./trained_dcn_fintech", monitor='val_auc', save_best_only=True, mode = 'max', verbose=params['verbose'])
    model_deepfm = DeepFM(linear_feature_columns=feature_columns, dnn_feature_columns=feature_columns, use_fm=True, \
                dnn_hidden_units=params['dnn_hidden_units'], l2_reg_linear=0.00001, l2_reg_embedding=0.01, l2_reg_dnn=0.01, init_std=0.001, 
                seed=1024, dnn_dropout=0, dnn_activation='relu', dnn_use_bn=True, task='binary', device="cuda:0", gpus=None)

    model_deepfm.compile('adam', 'binary_crossentropy', metrics=['auc'])

    model_deepfm_history = model_deepfm.fit(X_train_dict, y_train, batch_size=256, epochs=200, verbose=params['verbose'], initial_epoch=0, validation_split=0.2,\
             validation_data=None, shuffle=True, callbacks = [earlyStopping, modelCheckpoint])
    res = model_deepfm.evaluate(X_train_dict, y_train, batch_size=256)
    
    return -res['auc']
    
ctr_params = {
        'dnn_hidden_units': hyperopt.hp.choice('dnn_hidden_units', ((64,64,32,32),(128,64,32,16),(64,64,64,64)))
#         ,'dnn_dropout': hyperopt.hp.uniform('dnn_dropout', 0.2, 0.5)
        ,'verbose':0
        }

algo = hyperopt.partial(hyperopt.tpe.suggest, n_startup_jobs=20)
best_params = hyperopt.fmin(object_model, ctr_params, algo , max_evals = 10)
print("_best_params: ", best_params)

 10%|█         | 1/10 [04:32<40:56, 272.92s/trial, best loss: -0.854275520992938]


KeyboardInterrupt: 