In [1]:
import pandas as pd
import numpy as np
import time
import sys
sys.path.append('tools/tensorflow-DeepFM-master/')

import config
import DataReader
from DeepFM import DeepFM




import tensorflow as tf

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import log_loss,roc_auc_score


<font color=#0099ff size=5 face="黑体">固定函数</font>

In [2]:
# Column groupings for the time-based features: every entity key is paired
# once with 'day' and once with 'day' + 'hour', in that order.
timeFeatList = [
    [entity, 'day'] + finer
    for entity in ('user_id', 'item_id', 'shop_id', 'item_brand_id', 'item_city_id')
    for finer in ([], ['hour'])
]

In [3]:
def timestamp_datetime(value):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    The conversion goes through ``time.localtime``, so the result is
    expressed in the local timezone of the machine running the notebook.
    """
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Add a within-group chronological rank column to ``df``.

    Rows are grouped by the columns in ``featList`` and ranked by
    ``context_timestamp`` inside each group (ties are ordered by appearance,
    ``method='first'``). The rank is stored in a new column ``featName``.

    ``df`` is modified in place and also returned.
    """
    timestamps_by_group = df.groupby(featList)['context_timestamp']
    df[featName] = timestamps_by_group.rank(method='first')
    return df

def process(df):
    """Derive the engineered feature columns on ``df`` and return it.

    Columns added (by assignment on the passed frame):
      * ``time``  - ``context_timestamp`` formatted by ``timestamp_datetime``
      * ``day`` / ``hour`` - integers sliced out of the ``time`` string
      * one rank column per entry of ``timeFeatList`` (named by joining the
        group's column names with ``_``), computed by ``time_feat``
      * ``item_property_list_clean`` - the ``;``-separated properties,
        de-duplicated and sorted
      * ``item_category_list_bin0..2`` - the i-th ``;``-separated category,
        or ``-1`` when the list is shorter
      * ``missing_feat`` - per-row count of cells equal to ``-1``
    """
    def _category_at(raw, position):
        # Split once and fall back to -1 when the level does not exist.
        pieces = raw.split(';')
        return pieces[position] if len(pieces) > position else -1

    def _dedupe_sorted(raw):
        return ';'.join(sorted(set(str(raw).split(';'))))

    df['time'] = df['context_timestamp'].apply(timestamp_datetime)
    df['day'] = df['time'].apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].apply(lambda stamp: int(stamp[11:13]))

    for group_cols in timeFeatList:
        df = time_feat(df, group_cols, '_'.join(group_cols))

    df['item_property_list_clean'] = df['item_property_list'].apply(_dedupe_sorted)

    for level in range(3):
        df['item_category_list_bin%d'%level] = df['item_category_list'].apply(
            lambda raw: _category_at(raw, level)
        )

    # Count the -1 sentinels across all columns of each row.
    df["missing_feat"] = (df == -1).values.sum(axis=1)
    return df

#def string_process(dfTrain,dfTest):
    

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [4]:
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
# Drop exact duplicate rows, then rebuild a contiguous 0..n-1 index. Without the
# reset, the index keeps gaps where duplicates were removed, so any later code
# that treats `dfTrain.index` values as row positions would select the wrong
# rows (or run past the end of a positional container).
dfTrain = dfTrain.drop_duplicates().reset_index(drop=True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

# Apply the same feature engineering to the train and test frames.
dfTrain = process(dfTrain)
dfTest = process(dfTest)


# Candidate input columns: everything that is neither a key column nor ignored.
featInput = [c for c in dfTrain.columns if c not in config.KEYS and c not in config.IGNORE_COLS]


<font color=#0099ff size=5 face="黑体">生成稀疏特征</font>

In [5]:
# Build the feature dictionary from both the train and the test frame, using the
# numeric / ignored column lists from `config`.
# NOTE(review): the meaning of `active_bound=1` is defined inside DataReader and
# is not visible here - confirm before changing it.
fd = DataReader.FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           active_bound=1)
data_parser = DataReader.DataParser(feat_dict=fd)
# Parse each frame into three parallel outputs. With has_label=True the third
# output is bound to `y_train`; without it, to `ids_test`.
# NOTE(review): presumably Xi holds feature indices and Xv feature values - confirm in DataReader.
Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

<font color=#0099ff size=5 face="黑体">设置模型参数</font>

In [9]:
# Keyword arguments for the DeepFM constructor (unpacked with ** when the model is created).
dfm_params = dict(
    # Model components.
    use_fm=True,
    use_deep=True,
    embedding_size=8,
    dropout_fm=[1.0, 1.0],
    # Deep part: two hidden layers; dropout_deep carries one more entry than deep_layers.
    deep_layers=[32, 32],
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layers_activation=tf.nn.relu,
    # Optimisation settings.
    epoch=20,
    batch_size=2048,
    learning_rate=0.001,
    optimizer_type="adam",
    batch_norm=1,
    batch_norm_decay=0.995,
    l2_reg=0.01,
    # Reporting / reproducibility.
    verbose=True,
    eval_metric=log_loss,
    random_seed=config.RANDOM_SEED,
)
# Sizes derived from the parsed data: feature dictionary dimension and the
# number of entries in one parsed training row.
dfm_params.update(
    feature_size=fd.feat_dim,
    field_size=len(Xi_train[0]),
)
print(dfm_params["feature_size"])

186270


In [10]:
### Split the training set by date: days before the 24th train, the 24th validates.
# Use row *positions*, not index labels: `_get` subscripts the parsed sequences
# with plain integers, and index labels only coincide with positions when
# dfTrain carries a default 0..n-1 index (not guaranteed once rows have been
# dropped). np.flatnonzero on the boolean mask yields positions in all cases.
train_idx = np.flatnonzero((dfTrain['day']<24).values)
valid_idx = np.flatnonzero((dfTrain['day']==24).values)

#train_idx = list(train_idx)+appendList

# Pick the elements of sequence `x` at the positions listed in `l`.
_get = lambda x,l:[x[i] for i in l]
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

# NOTE(review): in the run recorded with this notebook the validation loss
# bottoms out around epoch 4 while the training loss keeps falling - consider
# fewer epochs or early stopping.
dfm = DeepFM(**dfm_params)
dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

#params: 1687068
[1] train-result=0.1310, valid-result=0.1259 [39.2 s]
[2] train-result=0.0980, valid-result=0.0931 [34.8 s]
[3] train-result=0.0813, valid-result=0.0876 [37.0 s]
[4] train-result=0.0716, valid-result=0.0875 [35.3 s]
[5] train-result=0.0598, valid-result=0.0882 [38.4 s]
[6] train-result=0.0532, valid-result=0.0894 [36.9 s]
[7] train-result=0.0514, valid-result=0.0905 [37.0 s]
[8] train-result=0.0470, valid-result=0.0924 [38.4 s]
[9] train-result=0.0413, valid-result=0.0936 [35.5 s]
[10] train-result=0.0409, valid-result=0.0928 [36.5 s]
[11] train-result=0.0398, valid-result=0.0969 [35.2 s]
[12] train-result=0.0364, valid-result=0.0937 [36.3 s]
[13] train-result=0.0379, valid-result=0.0940 [38.5 s]
[14] train-result=0.0341, valid-result=0.0986 [37.8 s]
[15] train-result=0.0348, valid-result=0.0967 [38.5 s]
[16] train-result=0.0304, valid-result=0.1049 [34.8 s]
[17] train-result=0.0337, valid-result=0.1015 [35.1 s]
[18] train-result=0.0308, valid-result=0.0993 [35.2 s]
[1

KeyboardInterrupt: 

In [None]:
# Debug inspection: print the built-in help for the model's `embeddings` attribute.
help(dfm.embeddings)

In [None]:
# Debug inspection: echo the model's `y_second_order` attribute as the cell output.
dfm.y_second_order

In [None]:
# Score the validation split with the current model and report its log loss
# against the validation labels.
result_valid_ = dfm.predict( Xi_valid_, Xv_valid_)
print(log_loss(y_valid_, result_valid_))

In [None]:
# Pair each validation label with its predicted score, then show the highest score.
df = pd.DataFrame({'target':y_valid_,'score':result_valid_})
df['score'].max()

In [None]:
# Order rows from highest to lowest score, in place.
df.sort_values('score',ascending=False,inplace=True)
# reset_index without drop=True inserts the previous index as an extra leading
# column, so column positions shift by one.
# NOTE(review): this cell is not idempotent - each re-run inserts another index
# column, which changes what positional column lookups on `df` return.
df.reset_index(inplace=True)

In [None]:
# Mean of the column at position 1 over the first 20 rows of `df`.
# NOTE(review): which column sits at position 1 depends on the frame's column
# order at this point - confirm it is the intended one; selecting the column by
# name would be unambiguous.
df.iloc[:20,1].mean()

In [None]:
# Single-column accumulator with one row per test sample, filled with the
# current model's test-set predictions.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)

In [None]:
# Rebuild the single-column accumulator from the current model's test-set
# predictions and show the largest predicted value.
# NOTE(review): the first two statements also appear verbatim in another cell of
# this notebook; one of the two copies is likely redundant.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)
y_test_meta[:,0].max()

In [None]:
# K-fold cross-validation: fit one model per fold and average the test predictions.
# KFold.split only uses the number of samples in its argument, so a cheap
# placeholder of the right length is passed instead of materializing the whole
# mixed-dtype frame with `dfTrain.values`. The resulting folds are identical.
folds = list(KFold(n_splits=config.NUM_SPLITS,shuffle=True,random_state=config.RANDOM_SEED).split(np.arange(dfTrain.shape[0])))
# Pick the elements of sequence `x` at the positions listed in `l`.
_get = lambda x,l:[x[i] for i in l]

# Running sum of per-fold test predictions, one row per test sample.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

for i,(train_idx, valid_idx) in enumerate(folds):
    # The fold indices are row positions, which is what `_get` expects.
    Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
    Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

    # A fresh model per fold, evaluated on that fold's held-out part during fitting.
    dfm = DeepFM(**dfm_params)
    dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

    y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)

# Turn the sum over folds into the mean prediction.
y_test_meta /= float(len(folds))

In [None]:
# Assemble the submission table (test ids + the scores held in y_test_meta) and
# write it as a space-separated file without the index column.
# NOTE(review): newer pandas releases spell the `line_terminator` keyword
# `lineterminator` - confirm against the installed version. The output path is
# hard-coded relative to the working directory.
submit = pd.DataFrame({'instance_id':ids_test,'predicted_score':y_test_meta[:,0]})
submit.to_csv('../../Submission/advertisement/FM_deep_0322.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Sanity check: summary statistics of the submitted scores.
submit['predicted_score'].describe()

In [None]:
# Scratch calculation: natural logarithm of 0.9058.
# NOTE(review): the origin of this constant is not recorded in the notebook.
np.log(0.9058)

In [None]:
# Scratch calculation.
# NOTE(review): the meaning of 2.15 and 18371 is not recorded in the notebook.
2.15/18371


In [None]:
# Sanity check: (rows, columns) of the submission table.
submit.shape