In [1]:
# ! pip install deepctr_torch
# ! pip install numpy
# ! pip install pandas

In [2]:
import numpy as np
import pandas as pd
import random
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
pd.set_option('display.max_columns',None)




In [3]:
dtype={
  'id': np.dtype(str),
  'click': np.dtype(int),
  'hour': np.dtype(str),
  'C1': np.dtype(str),
  'banner_pos': np.dtype(str),
  'site_id': np.dtype(str),
  'site_domain': np.dtype(str), 
  'site_category': np.dtype(str),
  'app_id': np.dtype(str),
  'app_domain': np.dtype(str),
  'app_category': np.dtype(str),
  'device_id': np.dtype(str),
  'device_ip': np.dtype(str),
  'device_model': np.dtype(str),
  'device_type': np.dtype(str),
  'device_conn_type': np.dtype(str),
  'C14': np.dtype(str),
  'C15': np.dtype(str),
  'C16': np.dtype(str),
  'C17': np.dtype(str),
  'C18': np.dtype(str),
  'C19': np.dtype(str),
  'C20': np.dtype(str),
  'C21':np.dtype(str)
}
test_dtype={
  'id': np.dtype(str),
  'hour': np.dtype(str),
  'C1': np.dtype(str),
  'banner_pos': np.dtype(str),
  'site_id': np.dtype(str),
  'site_domain': np.dtype(str), 
  'site_category': np.dtype(str),
  'app_id': np.dtype(str),
  'app_domain': np.dtype(str),
  'app_category': np.dtype(str),
  'device_id': np.dtype(str),
  'device_ip': np.dtype(str),
  'device_model': np.dtype(str),
  'device_type': np.dtype(str),
  'device_conn_type': np.dtype(str),
  'C14': np.dtype(str),
  'C15': np.dtype(str),
  'C16': np.dtype(str),
  'C17': np.dtype(str),
  'C18': np.dtype(str),
  'C19': np.dtype(str),
  'C20': np.dtype(str),
  'C21':np.dtype(str)
}

In [4]:
num_records = 40428967
sample_size = 10000000
skip_values = sorted(random.sample(range(1,num_records), num_records - sample_size))

In [5]:
train = pd.read_csv("input/train.gz", dtype=dtype, skiprows=skip_values)
test = pd.read_csv("input/test.gz", dtype=test_dtype)
submission = pd.read_csv("input/sampleSubmission.gz")
print('Train dataset:',train.shape)
print('Test dataset:',test.shape)
print('Submission:',submission.shape)

Train dataset: (10000000, 24)
Test dataset: (4577464, 23)
Submission: (4577464, 2)


In [6]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10002028568167339219,0,14102100,1005,0,9e8cf15d,0d3cb7be,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,58811cdf,8326c04b,1,2,20596,320,50,2161,0,35,100148,157
1,10002518649031436658,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,98fed791,d9b5648e,0f2161f8,a99f214a,6dec2796,aad45b01,1,0,20984,320,50,2371,0,551,-1,46
2,10004482643316086592,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,66a5f0f3,d9b5648e,cef3e649,a99f214a,fa60af6b,b4b19c97,1,0,21234,320,50,2434,3,163,100088,61
3,10006415976094813740,0,14102100,1005,0,f84e52b6,d7e2f29b,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,a8649089,e9b8d8d7,1,0,16838,320,50,1882,3,35,-1,13
4,10007164336863914220,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,b2b14786,36d749e5,1,0,15706,320,50,1722,0,35,-1,79


In [7]:
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000174058809263569,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,69f45779,0eb711ec,1,0,8330,320,50,761,3,175,100075,23
1,10000182526920855428,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8d44657,ecb851b2,1,0,22676,320,50,2616,0,35,100083,51
2,10000554139829213984,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,10fb085b,1f0bc64f,1,0,22676,320,50,2616,0,35,100083,51
3,10001094637809798845,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,422d257a,542422a7,1,0,18648,320,50,1092,3,809,100156,61
4,10001377041558670745,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,078c6b38,1f0bc64f,1,0,23160,320,50,2667,0,47,-1,221


In [8]:
train.isna().sum()

id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

In [9]:
data = train
dense_features = train.columns.tolist()[16:]
sparse_features = train.columns.tolist()[3:16]
sparse_features.append("id")
target = ['click']

In [11]:
test_dense_features = test.columns.tolist()[15:]
test_sparse_features = test.columns.tolist()[2:15] 
test_sparse_features.append("id")

In [13]:
scaler = MinMaxScaler(feature_range=(0,1))
data[dense_features] = scaler.fit_transform(data[dense_features])
test[test_dense_features] = scaler.transform(test[test_dense_features])

In [14]:
# 創建 LabelEncoder 實例
le = LabelEncoder()
# 合併訓練集和測試集的標籤
all_labels = pd.concat([data[sparse_features], test[test_sparse_features]]).reset_index(drop=True)
# 對所有標籤進行 fit 操作
for feature in sparse_features:
    le.fit(all_labels[feature])
    data[feature] = le.transform(data[feature])
    test[feature] = le.transform(test[feature])

# for feature in sparse_features:
#     data[feature] = le.fit_transform(data[feature])

In [15]:
# hyperparameters definitions: https://blog.csdn.net/qq_42363032/article/details/121672623
sparse_embed = [SparseFeat(feat, all_labels[feat].nunique()) for feat in sparse_features]
dense_embed = [DenseFeat(feat,1,) for feat in dense_features]

In [16]:
embed_features_columns = sparse_embed + dense_embed
dnn_feature_columns = embed_features_columns
linear_feature_columns = embed_features_columns

In [17]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'id',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']

In [18]:
# build model input dict, key->feature name: value->feature value list 
train_model_input = {feature: data[feature] for feature in feature_names}
test_model_input = {feature: test[feature] for feature in feature_names}

In [19]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print('Current device:', device)

2.1.1
Current device: cuda


In [28]:
# 調參數方向 -> 
# model introduction and optimization strategy: https://blog.csdn.net/qfikh/article/details/104037257
model = DeepFM(linear_feature_columns, 
               dnn_feature_columns,
               task='binary', 
               device=device,
               dnn_dropout=0.8,
               l2_reg_dnn=0.1,

               dnn_use_bn=True
               )
model

DeepFM(
  (embedding_dict): ModuleDict(
    (C1): Embedding(7, 4)
    (banner_pos): Embedding(7, 4)
    (site_id): Embedding(4126, 4)
    (site_domain): Embedding(5756, 4)
    (site_category): Embedding(25, 4)
    (app_id): Embedding(7181, 4)
    (app_domain): Embedding(421, 4)
    (app_category): Embedding(33, 4)
    (device_id): Embedding(1270228, 4)
    (device_ip): Embedding(3796437, 4)
    (device_model): Embedding(7285, 4)
    (device_type): Embedding(5, 4)
    (device_conn_type): Embedding(4, 4)
    (id): Embedding(14577463, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(7, 1)
      (banner_pos): Embedding(7, 1)
      (site_id): Embedding(4126, 1)
      (site_domain): Embedding(5756, 1)
      (site_category): Embedding(25, 1)
      (app_id): Embedding(7181, 1)
      (app_domain): Embedding(421, 1)
      (app_category): Embedding(33, 1)
      (device_id): Embedding(1270228, 1)
      (device_ip): Embedding(3796437, 1)
      (device_model):

In [21]:
from torch.optim import Adam
model.compile(Adam(model.parameters(),0.0001),
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [22]:
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint
es = EarlyStopping(
        monitor='val_binary_crossentropy',
        min_delta=0.01,
        verbose=1,
        patience=2,
        mode='min'
    )
mdckpt = ModelCheckpoint(
        filepath='model.ckpt',
        monitor='val_binary_crossentropy', 
        verbose=1, 
        save_best_only=True, 
        mode='min'
    )

In [23]:
# 調參數方向 -> overfitting <- model over complicated
history = model.fit(train_model_input, 
          train[target].values, 
          batch_size=512,
          epochs=10,
          verbose=2,
          validation_split=0.2,
          callbacks=[es,mdckpt])
print(history)

cuda
Train on 8000000 samples, validate on 2000000 samples, 15625 steps per epoch
Epoch 1/10
946s - loss:  0.4080 - binary_crossentropy:  0.4073 - auc:  0.7351 - val_binary_crossentropy:  0.3960 - val_auc:  0.7363
Epoch 00001: val_binary_crossentropy improved from inf to 0.39598, saving model to model.ckpt
Epoch 2/10
903s - loss:  0.2443 - binary_crossentropy:  0.2366 - auc:  0.9135 - val_binary_crossentropy:  0.4069 - val_auc:  0.7156
Epoch 00002: val_binary_crossentropy did not improve from 0.39598
Epoch 3/10
902s - loss:  0.0575 - binary_crossentropy:  0.0529 - auc:  0.9944 - val_binary_crossentropy:  0.4110 - val_auc:  0.7231
Epoch 00003: val_binary_crossentropy did not improve from 0.39598
Epoch 00003: early stopping
<tensorflow.python.keras.callbacks.History object at 0x000001E17EF5CC70>


In [24]:
pred_ans = model.predict(test_model_input, 1024)

In [25]:
submission['click'] = pred_ans

In [26]:
submission

Unnamed: 0,id,click
0,10000174058809263569,0.383361
1,10000182526920855428,0.130719
2,10000554139829213984,0.337963
3,10001094637809798845,0.078516
4,10001377041558670745,0.185523
...,...,...
4577459,9998166651591969718,0.297320
4577460,9998249812366450951,0.114679
4577461,99988023653614546,0.393004
4577462,9999086574712596585,0.008871


In [27]:
submission.to_csv('output/submission_1.csv',index=False)