In [1]:
# ! pip install deepctr_torch
# ! pip install numpy
# ! pip install pandas

In [2]:
import numpy as np
import pandas as pd
import random
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
pd.set_option('display.max_columns',None)


Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history


In [3]:
dtype={
  'id': np.dtype(str),
  'click': np.dtype(int),
  'hour': np.dtype(str),
  'C1': np.dtype(str),
  'banner_pos': np.dtype(str),
  'site_id': np.dtype(str),
  'site_domain': np.dtype(str), 
  'site_category': np.dtype(str),
  'app_id': np.dtype(str),
  'app_domain': np.dtype(str),
  'app_category': np.dtype(str),
  'device_id': np.dtype(str),
  'device_ip': np.dtype(str),
  'device_model': np.dtype(str),
  'device_type': np.dtype(str),
  'device_conn_type': np.dtype(str),
  'C14': np.dtype(str),
  'C15': np.dtype(str),
  'C16': np.dtype(str),
  'C17': np.dtype(str),
  'C18': np.dtype(str),
  'C19': np.dtype(str),
  'C20': np.dtype(str),
  'C21':np.dtype(str)
}
test_dtype={
  'id': np.dtype(str),
  'hour': np.dtype(str),
  'C1': np.dtype(str),
  'banner_pos': np.dtype(str),
  'site_id': np.dtype(str),
  'site_domain': np.dtype(str), 
  'site_category': np.dtype(str),
  'app_id': np.dtype(str),
  'app_domain': np.dtype(str),
  'app_category': np.dtype(str),
  'device_id': np.dtype(str),
  'device_ip': np.dtype(str),
  'device_model': np.dtype(str),
  'device_type': np.dtype(str),
  'device_conn_type': np.dtype(str),
  'C14': np.dtype(str),
  'C15': np.dtype(str),
  'C16': np.dtype(str),
  'C17': np.dtype(str),
  'C18': np.dtype(str),
  'C19': np.dtype(str),
  'C20': np.dtype(str),
  'C21':np.dtype(str)
}

In [5]:
# Due to RAM constraints, I ramdomly sample the training data
num_records = 40428967
sample_size = 10000000
skip_values = sorted(random.sample(range(1,num_records), num_records - sample_size))

train = pd.read_csv("input/train.gz", dtype=dtype, skiprows=skip_values)
test = pd.read_csv("input/test.gz", dtype=test_dtype)
submission = pd.read_csv("input/sampleSubmission.gz")
print('Train dataset:',train.shape)
print('Test dataset:',test.shape)
print('Submission:',submission.shape)

Train dataset: (10000000, 24)
Test dataset: (4577464, 23)
Submission: (4577464, 2)


In [6]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,b3cf8def,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79
1,10002518649031436658,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,98fed791,d9b5648e,0f2161f8,a99f214a,6dec2796,aad45b01,1,0,20984,320,50,2371,0,551,-1,46
2,10005334911727438633,0,14102100,1010,1,85f751fd,c4e18dd6,50e219e0,ffc6ffd0,7801e8d9,0f2161f8,fb23c543,69890c7f,9fef9da8,4,0,21665,320,50,2493,3,35,-1,117
3,10006192453619779489,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,685d1c4c,2347f47a,8ded1f7a,6a943594,8a014cbb,81b42528,1,3,15708,320,50,1722,0,35,-1,79
4,10006777279679619273,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,d2bb6502,2347f47a,8ded1f7a,4b2309e9,22c2dcf4,d6e0e6ff,1,3,18987,320,50,2158,3,291,100193,61


In [7]:
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000174058809263569,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,69f45779,0eb711ec,1,0,8330,320,50,761,3,175,100075,23
1,10000182526920855428,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8d44657,ecb851b2,1,0,22676,320,50,2616,0,35,100083,51
2,10000554139829213984,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,10fb085b,1f0bc64f,1,0,22676,320,50,2616,0,35,100083,51
3,10001094637809798845,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,422d257a,542422a7,1,0,18648,320,50,1092,3,809,100156,61
4,10001377041558670745,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,078c6b38,1f0bc64f,1,0,23160,320,50,2667,0,47,-1,221


In [8]:
train.isna().sum()

id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

In [9]:
data = train
dense_features = train.columns.tolist()[16:]
sparse_features = train.columns.tolist()[3:16]
sparse_features.append("id")
target = ['click']

In [10]:
test_dense_features = test.columns.tolist()[15:]
test_sparse_features = test.columns.tolist()[2:15] 
test_sparse_features.append("id")

In [11]:
# Fit MinMaxScaler to dense features
scaler = MinMaxScaler(feature_range=(0,1))
data[dense_features] = scaler.fit_transform(data[dense_features])
test[test_dense_features] = scaler.transform(test[test_dense_features])

In [12]:
# Fit LabelEncoder to sparse features 
le = LabelEncoder()
all_labels = pd.concat([data[sparse_features], test[test_sparse_features]]).reset_index(drop=True)
for feature in sparse_features:
    le.fit(all_labels[feature])
    data[feature] = le.transform(data[feature])
    test[feature] = le.transform(test[feature])

# for feature in sparse_features:
#     data[feature] = le.fit_transform(data[feature])

In [13]:
# hyperparameters definitions: https://blog.csdn.net/qq_42363032/article/details/121672623
sparse_embed = [SparseFeat(feat, all_labels[feat].nunique()) for feat in sparse_features]
dense_embed = [DenseFeat(feat,1,) for feat in dense_features]

In [15]:
embed_features_columns = sparse_embed + dense_embed
dnn_feature_columns = embed_features_columns
linear_feature_columns = embed_features_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'id',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']

In [16]:
# build model input dict, key->feature name: value->feature value list 
train_model_input = {feature: data[feature] for feature in feature_names}
test_model_input = {feature: test[feature] for feature in feature_names}

In [17]:
# use GPU to train model
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print('Current device:', device)

2.1.1
Current device: cuda


In [18]:
# model introduction and optimization strategy: https://blog.csdn.net/qfikh/article/details/104037257
model = DeepFM(linear_feature_columns, 
               dnn_feature_columns,
               task='binary', 
               device=device,
               dnn_dropout=0.8,
               l2_reg_dnn=0.01,
               dnn_hidden_units=[128,64],
               dnn_use_bn=True 
               )
model

DeepFM(
  (embedding_dict): ModuleDict(
    (C1): Embedding(7, 4)
    (banner_pos): Embedding(7, 4)
    (site_id): Embedding(4144, 4)
    (site_domain): Embedding(5736, 4)
    (site_category): Embedding(26, 4)
    (app_id): Embedding(7193, 4)
    (app_domain): Embedding(433, 4)
    (app_category): Embedding(31, 4)
    (device_id): Embedding(1269931, 4)
    (device_ip): Embedding(3796136, 4)
    (device_model): Embedding(7273, 4)
    (device_type): Embedding(5, 4)
    (device_conn_type): Embedding(4, 4)
    (id): Embedding(14577463, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(7, 1)
      (banner_pos): Embedding(7, 1)
      (site_id): Embedding(4144, 1)
      (site_domain): Embedding(5736, 1)
      (site_category): Embedding(26, 1)
      (app_id): Embedding(7193, 1)
      (app_domain): Embedding(433, 1)
      (app_category): Embedding(31, 1)
      (device_id): Embedding(1269931, 1)
      (device_ip): Embedding(3796136, 1)
      (device_model):

In [19]:
from torch.optim import Adam
model.compile(Adam(model.parameters(),0.0001),
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [20]:
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint
es = EarlyStopping(
        monitor='val_binary_crossentropy',
        min_delta=0.01,
        verbose=1,
        patience=3,
        mode='min'
    )
mdckpt = ModelCheckpoint(
        filepath='model.ckpt',
        monitor='val_binary_crossentropy', 
        verbose=1, 
        save_best_only=True, 
        mode='min'
    )

In [21]:
# 調參數方向 -> overfitting <- model over complicated
history = model.fit(train_model_input, 
          train[target].values, 
          batch_size=256,
          epochs=10,
          verbose=2,
          validation_split=0.2,
          callbacks=[es,mdckpt])
print(history)

cuda
Train on 8000000 samples, validate on 2000000 samples, 31250 steps per epoch
Epoch 1/10
1649s - loss:  0.4083 - binary_crossentropy:  0.4081 - auc:  0.7334 - val_binary_crossentropy:  0.3954 - val_auc:  0.7360
Epoch 00001: val_binary_crossentropy improved from inf to 0.39544, saving model to model.ckpt
Epoch 2/10
1644s - loss:  0.3505 - binary_crossentropy:  0.3483 - auc:  0.8135 - val_binary_crossentropy:  0.3953 - val_auc:  0.7371
Epoch 00002: val_binary_crossentropy improved from 0.39544 to 0.39529, saving model to model.ckpt
Epoch 3/10
1641s - loss:  0.2295 - binary_crossentropy:  0.2259 - auc:  0.8992 - val_binary_crossentropy:  0.3970 - val_auc:  0.7326
Epoch 00003: val_binary_crossentropy did not improve from 0.39529
Epoch 4/10
1640s - loss:  0.2173 - binary_crossentropy:  0.2136 - auc:  0.9271 - val_binary_crossentropy:  0.3972 - val_auc:  0.7328
Epoch 00004: val_binary_crossentropy did not improve from 0.39529
Epoch 00004: early stopping
<tensorflow.python.keras.callbacks

In [22]:
pred_ans = model.predict(test_model_input, 1024)

In [23]:
submission['click'] = pred_ans

In [24]:
submission

Unnamed: 0,id,click
0,10000174058809263569,0.428320
1,10000182526920855428,0.141637
2,10000554139829213984,0.179547
3,10001094637809798845,0.106800
4,10001377041558670745,0.217634
...,...,...
4577459,9998166651591969718,0.348590
4577460,9998249812366450951,0.197733
4577461,99988023653614546,0.412337
4577462,9999086574712596585,0.016943


In [25]:
submission.to_csv('output/submission_1.csv',index=False)