In [1]:
#coding=utf-8
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score

def load_data():
    df = pd.read_csv("Data/train.csv.gz")
    return df

def oneHotEncoder(array_1d):
    label = LabelEncoder().fit_transform(array_1d)
    label = label.reshape(len(label), 1)
    one_hot = OneHotEncoder(sparse=False).fit_transform(label)
    return one_hot

def minMaxScale(array_2d):
    return MinMaxScaler().fit_transform(array_2d)

def preprocess(data):
    cat_list =[f for f in data.columns]
    for c in cat_list:
        data[c] = LabelEncoder().fit_transform(list(data[c].values))

    return data

def auc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

def eval_matric(y_true, y_prob):
    print(sum(y_true)/ len(y_true))
    print(sum([i>0.5 for i in y_prob])/ len(y_true))

    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
        gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    print("gini:", gini)
    return gini

In [2]:
from deepctr.models import DeepFM, xDeepFM, DCN
from deepctr.inputs import  SparseFeat, DenseFeat,get_fixlen_feature_names

In [6]:
print("---loading and preprocessing the data---")
data = load_data()
data = data.set_index("id")
target = data['target']
data.drop(['target'], axis=1, inplace=True)
#data = preprocess(data)

---loading and preprocessing the data---


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

from deepctr.models import DeepFM
from deepctr.inputs import  SparseFeat, DenseFeat,get_fixlen_feature_names

def recognize_feature(data, label_encoder = False):
    sparse_features = []
    dense_features = []
    for f in data.columns:
        if data[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(data[f].values))
            data[f] = lbl.transform(list(data[f].values))
            sparse_features.append(f)
        elif f.find('cat') >=0 and f.find('bin') <0:
            lbl = LabelEncoder()
            lbl.fit(list(data[f].values))
            data[f] = lbl.transform(list(data[f].values))
            sparse_features.append(f)
        elif data[f].dtype not in ['float16','float32','float64']:
            if(len(data[f].unique()) < 100 and f.find('bin') <0):
                lbl = LabelEncoder()
                lbl.fit(list(data[f].values))
                data[f] = lbl.transform(list(data[f].values))
                sparse_features.append(f)
    print("sparse : unique sum ", sum([len(data[f].unique()) for f in sparse_features]))
        
    dense_features = list(set(data.columns.tolist()) - set(sparse_features))
    return data, sparse_features, dense_features

def hash_encoding(data, sparse_features):
    return ;
def one_hot_for_sparse(data, sparse_features):
    for f in sparse_features:
        one_hot = pd.get_dummies(data[f], prefix =f, dummy_na = True)
        data.drop(f , axis = 1, inplace=True)
        data = data.join(one_hot)
    return data
def scalar_for_dense(data, dense_features):
    for f in dense_features:
        scaler = MinMaxScaler()
        data[f] = scaler.fit_transform(data[f].values.reshape(-1,1))
    return data
    

data, sparse_features, dense_features = recognize_feature(data)
print(len(dense_features),len(sparse_features))

sparse_label_dict = dict()
for f in sparse_features:
    sparse_label_dict[f] = data[f].max()

print(data.shape)
#data = one_hot_for_sparse(data, sparse_features)
#data = scalar_for_dense(data, dense_features)
print(data.shape)

sparse : unique sum  376
27 30
(595212, 57)
(595212, 57)


In [None]:
print(sparse_label_dict)

In [8]:
train, test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=42)

In [53]:
from keras import optimizers
from keras import backend as K
from keras.models import Sequential
from keras.layers import Input,Dense, concatenate,Dropout,BatchNormalization,Activation,Flatten,Add
from keras.layers import RepeatVector, merge, Subtract, Lambda, Multiply, Embedding, Concatenate, Reshape
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.engine.topology import Layer

class Added_Weights(Layer):
    def __init__(self, use_bias = False, **kwargs):
        self.use_bias = use_bias
        
        super(Added_Weights, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[1], input_shape[2]),
                                      initializer='uniform',  # TODO: Choose your initializer
                                      trainable=True)
        
        if(self.use_bias):
            self.bias = self.add_weight(name='bias',
                                        shape=(1, input_shape[2]),
                                        initializer='uniform',  # TODO: Choose your initializer
                                        trainable=True)
        else:
            self.bias = self.add_weight(name='bias',
                                        shape=(1, input_shape[2]),
                                        initializer='zeros',
                                        trainable=False)
        
        super(Added_Weights, self).build(input_shape)

    def call(self, x, **kwargs):
        # Implicit broadcasting occurs here.
        # Shape x: (BATCH_SIZE, N, M)
        # Shape kernel: (N, M)
        # Shape output: (BATCH_SIZE, N, M)
        #if self.use_bias:
        #if self.use_bias:
        return x * self.kernel + self.bias
        

    def compute_output_shape(self, input_shape):
        return input_shape
    
class DeepFM():
    def __init__(self, sparse_features, dense_features, sparse_label_dict, hidden_layer, embed_dim):
        self.sparse_features = sparse_features
        self.dense_features = dense_features
        self.sparse_label_dict = sparse_label_dict
        self.hidden_layer = hidden_layer
        self.embed_dim = embed_dim
        
    def fit(self, train, test, y_train, y_test):
        cat_input = []
        cat_output = []        
        for col in self.sparse_features:
            input = Input(shape= (1,))
            cat_input.append(input)
            emb = Embedding(sparse_label_dict[col], self.embed_dim, input_length =1 ,trainable = True)(input)
            cat_output.append(emb)
         
        cat_output = Concatenate(axis=1)(cat_output)
        
        first_order = Added_Weights(use_bias = True)(cat_output)
        first_order = Flatten()(first_order)
        
        # 需要使用lambda 层封装Backend 的函数操作
        first_order = Lambda(lambda x: K.sum(x, axis =1, keepdims=True))(first_order)
        
        # cat_output shape : s *k, keras 需要把这个list 进行concat 为一个tensor
        # 然后fatten 为一个weight，然后在sum，或者是直接sum, w * x ,w 是tf.variable
        
        # second order for sparse features with fixed dim
        # vx * vx - vx, vx shape: (1, k)
        vx = Added_Weights()(cat_output)
        sum_square = Lambda(lambda x: K.sum(x, axis =1))(vx)
        sum_square = Multiply()([sum_square, sum_square])
        square_sum = Multiply()([vx, vx])
        square_sum = Lambda(lambda x: K.sum(x, axis =1))(square_sum)
        second_order = Subtract()([sum_square, square_sum])
        second_order = Lambda(lambda x: K.sum(x/2, axis =1, keepdims=True))(second_order)
        print(second_order.shape)
        '''
        dense_input = []
        for col in self.dense_features:
            input = Input(shape = (1, ))
            dense_input.append(input)
        dense_input = Concatenate(axis=1)(dense_input)
        '''
        dense_input = Input(shape = (len(self.dense_features), ))
        
        dnn_input = Concatenate(axis=1)([Flatten()(cat_output), dense_input])
        #dnn_input = dense_input 
        dnn_output = dnn_input 
        for layer in self.hidden_layer:
            dnn_output  = BatchNormalization()(dnn_output)
            dnn_output  = Dense(layer, activation='relu')(dnn_output)
            dnn_output  = Dropout(0.2)(dnn_output)
        dnn_output = Dense(1, activation='linear')(dnn_output)
        
        #output  = Concatenate(axis=1)([first_order, second_order, dnn_output])
        output  = Add()([first_order, second_order, dnn_output])
        output = Dense(1, activation='sigmoid')(output)
        model = Model(inputs = cat_input + [dense_input], outputs=output)
        print("---starting the training---")
        model.compile(
            optimizer="adam",
            loss='binary_crossentropy',
            metrics=["accuracy"]
        )
        #print(model.summary())
        model.fit([train[f] for f in self.sparse_features] + [train[self.dense_features]], y_train, nb_epoch=50, batch_size=1000)
        loss, accuracy = model.evaluate([test[f] for f in self.sparse_features] +  [test[self.dense_features]], y_test)
        print('\n', 'test accuracy:', accuracy)
        y_pred = model.predict([test[f] for f in self.sparse_features] +  [test[self.dense_features]])
        print(sum(y_test))
        print(len(y_test))
        print("auc is ", roc_auc_score(y_test, y_pred))
        eval_matric(y_test, y_pred)

In [54]:
model = DeepFM(sparse_features, dense_features, sparse_label_dict, [2048, 1024, 100, 50] , 4)
model.fit(train, test, y_train, y_test)

(?, 1)
---starting the training---




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

 test accuracy: 0.9619488850422564
7202
196420
auc is  0.5707878363803417
0.03666632725791671
[0.00171062]




gini: [nan]


In [14]:
from keras import optimizers
from keras import backend as K
from keras.models import Sequential
from keras.layers import Input,Dense, concatenate,Dropout,BatchNormalization,Activation
from keras.layers import RepeatVector, merge, Subtract, Lambda, Multiply
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.engine.topology import Layer
            
def models(train, y_train, test, y_test):
    # wide
    wide = Sequential()
    wide = Input(shape=(train.shape[1],))

    # deep
    deep_data = Input(shape=(train.shape[1],))
    deep = Dense(1000, activation='relu')(deep_data)
    #deep = BatchNormalization()(deep)
    deep = Dropout(0.2)(deep)
    
    deep = Dense(100, activation='relu')(deep)
    #deep = BatchNormalization()(deep)
    deep = Dropout(0.2)(deep)

    deep = Dense(50, activation='relu')(deep)
    #deep = BatchNormalization()(deep)
    deep = Dropout(0.2)(deep)
    # wide & deep 
    #wide_deep = concatenate([wide, deep])
    wide_deep = deep
    #wide_deep = deep
    wide_deep = Dense(1, activation='sigmoid')(wide_deep)
    model = Model(inputs=[wide, deep_data], outputs=wide_deep)
    
    sgd = optimizers.SGD(lr=0.1)
    adam = optimizers.Adam(lr=0.000)
    
    print("---starting the training---")
    model.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=["accuracy"]
    )
    model.fit([train, train], y_train, nb_epoch=20, batch_size=1000)

    loss, accuracy = model.evaluate([test, test], y_test)
    print('\n', 'test accuracy:', accuracy)
    y_pred = model.predict([test,test])
    print(sum(y_test))
    print(len(y_test))
    print("auc is ", roc_auc_score(y_test, y_pred))
    eval_matric(y_test, y_pred)

In [15]:
d = one_hot_for_sparse(data, sparse_features)
#d = scalar_for_dense(data, dense_features)
train, test, y_train, y_test = train_test_split(d, target, test_size=0.33, random_state=42)

In [16]:
models(train, y_train, test, y_test)

---starting the training---




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

KeyboardInterrupt: 