In [1]:
import pandas as pd
import numpy as np
import time
import sys
sys.path.append('tools/tensorflow-DeepFM-master/')

import config
import DataReader
from DeepFM import DeepFM




import tensorflow as tf

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import log_loss,roc_auc_score


<font color=#0099ff size=5 face="黑体">固定函数</font>

In [2]:
def timestamp_datetime(value):
    """Return the local-time hour of day for a Unix timestamp.

    Parameters
    ----------
    value : int or float
        Seconds since the epoch, as accepted by ``time.localtime``.

    Returns
    -------
    int
        The ``tm_hour`` field of ``time.localtime(value)``; the result
        therefore depends on the timezone of the machine running the code.
    """
    # Only the hour is needed, so no formatted date string is built.
    return time.localtime(value).tm_hour

def process(df):
    """Add the derived feature columns to ``df`` in place and return it.

    Columns added, in this order:
      * ``item_category_list_bin_1`` / ``item_category_list_bin_2`` - the
        entries at positions 1 and 2 of the ';'-separated
        ``item_category_list`` string, or -1 when the list is too short.
      * ``missing_feat`` - per-row count of values equal to -1 across the
        columns that were present on entry and are not in ``config.KEYS``.
      * ``context_timestamp_hour`` - ``timestamp_datetime`` applied to
        ``context_timestamp``.
    """
    # Snapshot the column list first so the columns added below are not
    # included in the -1 count.
    feature_cols = [name for name in df.columns if name not in config.KEYS]

    def _category_at(position):
        # Build an extractor for one position of the ';'-separated list.
        def pick(raw):
            parts = raw.split(';')
            return parts[position] if len(parts) > position else -1
        return pick

    for position in (1, 2):
        column = 'item_category_list_bin_%d' % position
        df[column] = df['item_category_list'].apply(_category_at(position))

    is_missing = (df[feature_cols] == -1).values
    df["missing_feat"] = np.sum(is_missing, axis=1)

    df['context_timestamp_hour'] = df['context_timestamp'].map(timestamp_datetime)
    return df

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [3]:
# Read the space-separated train/test tables and add the derived columns.
dfTrain = process(pd.read_table(config.TRAIN_FILE, sep=' '))
dfTest = process(pd.read_table(config.TEST_FILE, sep=' '))

# Candidate input columns: those listed in neither config.KEYS nor
# config.IGNORE_COLS.
featInput = [
    col for col in dfTrain.columns
    if col not in config.KEYS and col not in config.IGNORE_COLS
]

'''TrainX = dfTrain[featInput].values
TestX = dfTest[featInput].values
Trainy = dfTrain[config.LABEL].values'''

'TrainX = dfTrain[featInput].values\nTestX = dfTest[featInput].values\nTrainy = dfTrain[config.LABEL].values'

<font color=#0099ff size=5 face="黑体">生成稀疏特征</font>

In [4]:
# Build the feature dictionary from both frames, then parse each frame with
# a DataParser that shares that dictionary.
fd = DataReader.FeatureDictionary(
    dfTrain=dfTrain,
    dfTest=dfTest,
    numeric_cols=config.NUMERIC_COLS,
    ignore_cols=config.IGNORE_COLS,
    active_bound=1,
)
data_parser = DataReader.DataParser(feat_dict=fd)

# With has_label=True the third return value is the label vector; without it
# the third return value is bound to ids_test (used as instance ids below).
Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

<font color=#0099ff size=5 face="黑体">设置模型参数</font>

In [7]:
# Keyword arguments forwarded verbatim to DeepFM(**dfm_params).
dfm_params = dict(
    use_fm=True,
    use_deep=True,
    embedding_size=8,
    dropout_fm=[1.0, 1.0],
    deep_layers=[32, 32],
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layers_activation=tf.nn.relu,
    epoch=20,
    batch_size=1024,
    learning_rate=0.001,
    optimizer_type="adam",
    batch_norm=1,
    batch_norm_decay=0.995,
    l2_reg=0.01,
    verbose=True,
    eval_metric=log_loss,
    random_seed=config.RANDOM_SEED,
)

# Sizes that come from the parsed data rather than from hand-picked values.
dfm_params.update(
    feature_size=fd.feat_dim,
    field_size=len(Xi_train[0]),
)
print(dfm_params["feature_size"])

2158


In [8]:
# K-fold cross-validation: fit one DeepFM per fold and average the per-fold
# predictions on the test set.
#
# KFold.split only needs the number of rows, so the frame is passed directly
# instead of `dfTrain.values`, which would build a full array copy of the
# table just to be counted.
folds = list(
    KFold(
        n_splits=config.NUM_SPLITS,
        shuffle=True,
        random_state=config.RANDOM_SEED,
    ).split(dfTrain)
)


def _get(x, l):
    """Return the elements of sequence ``x`` at the positions listed in ``l``."""
    return [x[i] for i in l]


# Running sum of test predictions, one column, one row per test instance.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

for train_idx, valid_idx in folds:
    Xi_train_, Xv_train_, y_train_ = (
        _get(Xi_train, train_idx),
        _get(Xv_train, train_idx),
        _get(y_train, train_idx),
    )
    Xi_valid_, Xv_valid_, y_valid_ = (
        _get(Xi_train, valid_idx),
        _get(Xv_train, valid_idx),
        _get(y_train, valid_idx),
    )

    # A fresh model is created for every fold.
    dfm = DeepFM(**dfm_params)
    dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

    y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

# Turn the accumulated sum into the mean over folds.
y_test_meta /= float(len(folds))

#params: 25948
[1] train-result=0.1147, valid-result=0.1198 [18.9 s]
[2] train-result=0.1007, valid-result=0.1070 [18.5 s]
[3] train-result=0.0961, valid-result=0.1003 [17.9 s]
[4] train-result=0.0910, valid-result=0.0944 [17.7 s]
[5] train-result=0.0898, valid-result=0.0931 [18.5 s]
[6] train-result=0.0886, valid-result=0.0918 [17.8 s]
[7] train-result=0.0878, valid-result=0.0917 [17.9 s]
[8] train-result=0.0877, valid-result=0.0916 [18.2 s]
[9] train-result=0.0878, valid-result=0.0916 [17.9 s]
[10] train-result=0.0876, valid-result=0.0917 [17.8 s]
[11] train-result=0.0874, valid-result=0.0917 [18.1 s]
[12] train-result=0.0873, valid-result=0.0916 [18.3 s]
[13] train-result=0.0875, valid-result=0.0915 [18.4 s]
[14] train-result=0.0872, valid-result=0.0917 [19.3 s]
[15] train-result=0.0872, valid-result=0.0916 [18.4 s]
[16] train-result=0.0872, valid-result=0.0917 [18.1 s]
[17] train-result=0.0873, valid-result=0.0917 [19.2 s]
[18] train-result=0.0870, valid-result=0.0920 [18.8 s]
[19]

In [12]:
# Assemble the submission table: one row per test instance with its
# fold-averaged prediction.
submit = pd.DataFrame(
    {
        'instance_id': ids_test,
        'predicted_score': y_test_meta[:, 0],
    }
)

# Space-separated, no index column, '\n' line endings.
submit.to_csv(
    '../../Submission/advertisement/FM_deep_0321.txt',
    sep=" ",
    index=False,
    line_terminator='\n',
)