In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding,Input,Concatenate,Flatten
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [2]:
tf.__version__

'2.3.1'

In [3]:
# 读取数据集
df_criteo_train = pd.read_csv("../dataset/criteo_small/criteo_small_train.csv")
df_criteo_test = pd.read_csv("../dataset/criteo_small/criteo_small_test.csv")

In [4]:
df_criteo_train.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,646682,1,2.0,1151,7.0,,1738.0,8.0,2.0,10.0,...,e5ba7672,836a67dd,21ddcdc9,5840adea,c0cd6339,,c7dc6720,7e60320b,7a402766,ba14bbcb
1,7023680,1,,1,,4.0,13633.0,42.0,25.0,11.0,...,e5ba7672,c24ac50d,,,8a0017b9,,32c7478e,60a197ae,,
2,12834824,0,0.0,19,9.0,11.0,5199.0,101.0,2.0,11.0,...,07c540c4,48dc5aca,,,5ec62a90,,32c7478e,3de93139,,


In [5]:
df_criteo_test.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,7668413,0,4.0,1,2.0,44.0,50.0,30.0,6.0,39.0,...,e5ba7672,e1b6ea80,21ddcdc9,5840adea,290c14f6,,c7dc6720,ded4aac9,2bf691b1,bdf46dce
1,41167459,0,0.0,-1,,,2636.0,30.0,36.0,4.0,...,e5ba7672,1bed0358,,,f1f2d80e,,423fab69,2f1dc62c,,
2,43891510,0,,18,1.0,5.0,13550.0,,0.0,22.0,...,e5ba7672,e4ca448c,,,d5a47947,,32c7478e,9117a34a,,


In [6]:
# for col_name in df_criteo_train.columns:
#     print(df_criteo_train.dtypes)
sparse_cols = [fea for fea in df_criteo_train.iloc[:,2:].columns if df_criteo_train[fea].dtypes == "object"]
dense_cols = [fea for fea in df_criteo_train.iloc[:,2:].columns if df_criteo_train[fea].dtypes != "object"]
label_col = ["0"]

In [7]:
# preprocessing of dense features
# 对连续数据填补空值
def process_feats(data, feats, dense=True, train=True, scaler=None, dict_label_encoder= None):
    """Process dense or sparse features

    Fill the NaN values, transformations.

    Args: 
        data: A Pandas DataFrame contains the dense features. 
        feats: Dense features' name in DataFrame.

    Returns:
        A Dataframe after fill the NaN values and transformations.  
    """
    
    if dense:
        # fill NaN values for Dense features
        data_copy = data.copy()
        data_copy = data_copy[feats].fillna(0) # fill NaN for Dense features as intger 0
        
        if train:
            scaler = StandardScaler()
            data_copy[feats] = scaler.fit_transform(data_copy[feats]) # standardscaler fit_transform 
        else:
            data_copy[feats] = scaler.transform(data_copy[feats]) # train set's standardscler transform only.
        
        return data_copy,scaler

        
    else: 
        # fill NaNN values for Sparse features
        data_copy = data.copy()
        data_copy = data_copy[feats].fillna("-1") # fill NaN for Sparse features as "-1"
        

        if train:
            dict_label_encoder = dict()
            for feat in feats:
                label_encoder = LabelEncoder()
                data_copy[feat] = label_encoder.fit_transform(data_copy[feat])
                dict_label_encoder[feat] = label_encoder
        else:
            for feat in feats:
                label_encoder = dict_label_encoder[feat]
                data_copy[feat] = label_encoder.transform(data_copy[feat])

        return data_copy,dict_label_encoder

train_data_sparse, dict_label_encoder = process_feats(data= pd.concat([df_criteo_train,df_criteo_test],axis=0),feats = sparse_cols, dense= False,train=True)
train_data_dense, scaler = process_feats(data=df_criteo_train,feats=dense_cols, dense= True, train= True)

test_data_sparse, dict_label_encoder = process_feats(data= df_criteo_test,feats = sparse_cols, dense= False,train=False,dict_label_encoder=dict_label_encoder)
test_data_dense, scaler = process_feats(data=df_criteo_test,feats=dense_cols, dense= True, train=False, scaler=scaler)

In [8]:
print(train_data_sparse.shape)
print(train_data_dense.shape)

print('\n')
print(test_data_sparse.shape)
print(test_data_dense.shape)

(20000, 26)
(15000, 13)


(5000, 26)
(5000, 13)


In [9]:
# Get Dense Input
def get_dense_input(feats):
    dense_inputs = []
    for feat in feats:
        dense_input = Input(shape=(1,),name=feat)
        dense_inputs.append(dense_input)
    dense_inputs = Concatenate(axis=1)(dense_inputs)
    return dense_inputs

# Get Sparse Innput
def get_sparse_input(data,feats,embedding_size = 8):
    sparse_inputs = []
    for feat in feats:
        sparse_input = Input(shape=(1,),name=feat)
        sparse_inputs.append(sparse_input)
    
    sparse_embeddings = []
    for i,sparse_input in enumerate(sparse_inputs):
        f = feats[i]
        voc_size = data[f].nunique()
        reg = tf.keras.regularizers.l2(0.7)
        embed = Embedding(
            voc_size+1,
            embedding_size,
            embeddings_regularizer=reg,

        )(sparse_input)
        embed = Flatten()(embed)
        sparse_embeddings.append(embed)
    sparse_embeddings = Concatenate(axis=1)(sparse_embeddings)
    return sparse_embeddings

def get_model_input(dense_input,sparse_input):
    model_input = Concatenate(axis=1)([dense_input,sparse_input])

    return model_input
# get dense input
dense_inputs = get_dense_input(dense_cols)
print(dense_inputs)

# get sparse embedding input
sparse_inputs = get_sparse_input(df_criteo_train,sparse_cols)
print(sparse_inputs)

# get model's input, by concatenate dense_input and sparse_input
model_input = get_model_input(dense_input=dense_inputs,sparse_input=sparse_inputs)
print(model_input)

Tensor("concatenate/concat:0", shape=(None, 13), dtype=float32)
Tensor("concatenate_1/concat:0", shape=(None, 208), dtype=float32)
Tensor("concatenate_2/concat:0", shape=(None, 221), dtype=float32)
