In [None]:
"""[summary]
Click Logs dataset

dataset: Criteo
    - train.txt: Training set. 10 days of click-through data, ordered chronologically.
    - test.txt: Test set. 1 day of ads to for testing your model predictions.
features:
    - Label: Target variable that indicates if an ad was clicked (1) or not (0)
    - I1-I13: A total of 13 columns of integer features (mostly count features).
    - C1-C26: A total of 26 columns of categorical features.        

"""

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

In [3]:
file = '../dataset/Criteo/train.txt'
embed_dim = 8
read_part = True
sample_num = 5_000_000
test_size = 0.2

names = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11',
             'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',
             'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22',
             'C23', 'C24', 'C25', 'C26']

In [4]:
df = pd.read_csv(file, sep='\t', 
                 iterator=True,
                 header=None,
                 names=names)
df = df.get_chunk(sample_num)

df

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,0,,339,4.0,,2803.0,,0.0,0.0,0.0,...,776ce399,642f2610,1d1eb838,b1252a9d,ba23d2b6,,423fab69,45ab94c8,2bf691b1,c84c4aec
4999996,0,,1,10.0,10.0,9015.0,50.0,1.0,10.0,49.0,...,776ce399,31d666f1,,,18647a37,,32c7478e,ff04358d,,
4999997,0,,0,28.0,4.0,8490.0,,0.0,4.0,4.0,...,1e88c74f,891589e7,5af15ee4,b1252a9d,646cab01,,32c7478e,d89aa9c1,9b3e8820,279f0ca1
4999998,0,0.0,39,,,1850.0,201.0,41.0,46.0,284.0,...,3486227d,c61e82d7,21ddcdc9,5840adea,99c09e97,,423fab69,335a6a1e,445bbe3b,3055b376


In [5]:
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]
features = sparse_features + dense_features

df[sparse_features] = df[sparse_features].fillna('-1')
df[dense_features] = df[dense_features].fillna(0)

df


Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,-1,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,-1,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,-1,-1,e587c466,ad3062eb,3a171ecb,3b183c5c,-1,-1
3,0,0.0,893,0.0,0.0,4392.0,0.0,0.0,0.0,0.0,...,1e88c74f,74ef3502,-1,-1,6b3a5ca6,-1,3a171ecb,9117a34a,-1,-1
4,0,3.0,-1,0.0,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,-1,-1,21c9516a,-1,32c7478e,b34f3128,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,0,0.0,339,4.0,0.0,2803.0,0.0,0.0,0.0,0.0,...,776ce399,642f2610,1d1eb838,b1252a9d,ba23d2b6,-1,423fab69,45ab94c8,2bf691b1,c84c4aec
4999996,0,0.0,1,10.0,10.0,9015.0,50.0,1.0,10.0,49.0,...,776ce399,31d666f1,-1,-1,18647a37,-1,32c7478e,ff04358d,-1,-1
4999997,0,0.0,0,28.0,4.0,8490.0,0.0,0.0,4.0,4.0,...,1e88c74f,891589e7,5af15ee4,b1252a9d,646cab01,-1,32c7478e,d89aa9c1,9b3e8820,279f0ca1
4999998,0,0.0,39,0.0,0.0,1850.0,201.0,41.0,46.0,284.0,...,3486227d,c61e82d7,21ddcdc9,5840adea,99c09e97,-1,423fab69,335a6a1e,445bbe3b,3055b376


In [6]:
 # Bin continuous data into intervals.
est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
df[dense_features] = est.fit_transform(df[dense_features])

df[dense_features]

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0
2,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,12.0,1.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0


In [7]:
for feat in sparse_features:
    le = LabelEncoder()
    df[feat] = le.fit_transform(df[feat])

df


Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,4492,259,3,30649,0,3,74157,75,37871
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,3212,259,1,385104,0,3,25457,75,28699
2,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,...,6,920,0,0,910234,11,3,22075,0,0
3,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,2109,0,0,425677,0,3,54247,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,687,0,0,133749,0,2,67132,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,1817,223,3,738341,0,4,26129,11,50235
4999996,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5,879,0,0,96594,0,2,95475,0,0
4999997,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,2480,719,3,398683,0,2,81037,52,9894
4999998,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4,3617,259,1,609569,0,4,19232,16,12016


In [9]:
from utils import sparseFeature

feature_columns = [sparseFeature(feat, int(df[feature].max()) + 1)
                        for feature in features]
    
feature_columns

[{'feature_name': 'C26', 'feature_num': 1396},
 {'feature_name': 'C26', 'feature_num': 549},
 {'feature_name': 'C26', 'feature_num': 1373639},
 {'feature_name': 'C26', 'feature_num': 406655},
 {'feature_name': 'C26', 'feature_num': 290},
 {'feature_name': 'C26', 'feature_num': 21},
 {'feature_name': 'C26', 'feature_num': 11862},
 {'feature_name': 'C26', 'feature_num': 607},
 {'feature_name': 'C26', 'feature_num': 3},
 {'feature_name': 'C26', 'feature_num': 53574},
 {'feature_name': 'C26', 'feature_num': 5173},
 {'feature_name': 'C26', 'feature_num': 1156254},
 {'feature_name': 'C26', 'feature_num': 3119},
 {'feature_name': 'C26', 'feature_num': 26},
 {'feature_name': 'C26', 'feature_num': 11689},
 {'feature_name': 'C26', 'feature_num': 833957},
 {'feature_name': 'C26', 'feature_num': 10},
 {'feature_name': 'C26', 'feature_num': 4710},
 {'feature_name': 'C26', 'feature_num': 2062},
 {'feature_name': 'C26', 'feature_num': 4},
 {'feature_name': 'C26', 'feature_num': 1015598},
 {'feature_n

In [10]:
train, test = train_test_split(df, test_size=test_size)

train.shape, test.shape

((4000000, 40), (1000000, 40))

In [11]:
train.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
1119619,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,389,259,3,156693,11,2,34369,1,63747
2287390,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5,2666,259,3,40999,0,11,26786,1,48983
1441693,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,3227,0,0,172310,0,2,52849,0,0
4173982,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,8,3051,259,3,469091,0,2,8878,2,11692
4582945,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,3212,259,1,385104,0,4,25457,78,28699


In [12]:
train_X = train[features].values.astype('int32')
train_y = train['label'].values.astype('int32')
test_X = test[features].values.astype('int32')
test_y = test['label'].values.astype('int32')

print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

(4000000, 39) (4000000,)
(1000000, 39) (1000000,)
