In [None]:
!pip install -U deepctr-torch
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

from google.colab import drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive; the training CSV read in the next
# code cell is located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the filtered training CSV from the mounted Drive folder and shape it in
# one chain:
#   * `hr`  - characters 6-7 of the stringified `hour` value, cast to int
#             (presumably the hour-of-day of a YYMMDDHH stamp - confirm);
#   * the first column is dropped by position (presumably a saved index
#     column - confirm against the CSV);
#   * the `id` column is dropped by name.
data = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CTR/filtered_train.csv')
    .assign(hr=lambda frame: frame['hour'].astype(str).str[6:8].astype(int))
    .iloc[:, 1:]
    .drop(columns='id')
)
print(data.columns)

Index(['click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hr'],
      dtype='object')


In [None]:
# (rows, columns) of the prepared frame.
data.shape

(404290, 24)

In [None]:
# Per-column dtypes; the encoding cell further down uses object vs. int64 to
# decide which columns are sparse and which are dense.
data.dtypes

click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
hr                   int64
dtype: object

In [None]:
# Number of distinct values in each column.
data.nunique()

click                    2
hour                   240
C1                       7
banner_pos               7
site_id               2225
site_domain           2188
site_category           22
app_id                2241
app_domain             143
app_category            27
device_id            64742
device_ip           261706
device_model          4380
device_type              4
device_conn_type         4
C14                   2088
C15                      8
C16                      9
C17                    411
C18                      4
C19                     65
C20                    161
C21                     60
hr                      24
dtype: int64

In [None]:
# Class balance of the `click` label.
data['click'].value_counts() 

0    335635
1     68655
Name: click, dtype: int64

1. Label-encode the sparse (categorical) features and min-max scale the dense (numeric) features to [0, 1].

In [None]:
# Column roles are inferred from dtype: object columns become sparse
# (categorical) inputs, int64 columns become dense (numeric) inputs.
# NOTE(review): this cell rewrites `data` in place, so re-running it without
# reloading the CSV changes which columns the dtype detection below picks up.
sparse_features = data.select_dtypes(include=['object']).columns.tolist()
dense_features = data.select_dtypes(include=['int64']).columns.tolist()
target = ['click']

# The label must not be used as an input. The two device identifier columns
# are excluded from the sparse inputs too (presumably because of their very
# high cardinality - confirm).
dense_features.remove('click')
for excluded in ('device_id', 'device_ip'):
    sparse_features.remove(excluded)

# Missing values: the string sentinel '-1' for categoricals, 0 for numerics.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

# Replace every categorical column by integer codes, one fresh encoder each.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Rescale all dense columns into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split done in a later cell, so test rows influence the min/max used for
# scaling - consider fitting on the training rows only.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [None]:
# Show which columns ended up in the dense (numeric) input list.
print(dense_features)

['hour', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hr']


2. Count the number of unique values in each sparse feature field and record the dense feature field names.


In [None]:
# One SparseFeat per categorical column, sized by the number of distinct
# values in that column, and one 1-dimensional DenseFeat per numeric column.
sparse_columns = [SparseFeat(name, data[name].nunique()) for name in sparse_features]
dense_columns = [DenseFeat(name, 1) for name in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear part and the DNN part are fed the very same list of columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names of all input features, used below to build the model input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

3. Split the data into training and test sets.

In [None]:
def _to_model_input(frame):
    """Return a dict mapping every name in `feature_names` to that column of `frame`."""
    return {name: frame[name] for name in feature_names}


# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=2022)

train_model_input = _to_model_input(train)
test_model_input = _to_model_input(test)

4. Select the compute device (GPU if available, otherwise CPU).

In [None]:
# Run on the first CUDA device when CUDA use is requested and a GPU is
# actually available; otherwise stay on the CPU.
use_cuda = True
cuda_ready = use_cuda and torch.cuda.is_available()
device = 'cuda:0' if cuda_ready else 'cpu'
if cuda_ready:
    print('cuda ready...')

cuda ready...


5. Define, train, and evaluate the model - DeepFM.

In [None]:
# Build a binary-classification DeepFM over the shared feature columns, with
# a small L2 penalty on the embeddings, on the selected device.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# Train for 10 epochs, keeping 20% of the training rows aside for validation.
# NOTE(review): in the logged run val_binary_crossentropy rises after epoch 2
# while the training loss keeps falling - consider fewer epochs or early
# stopping.
y_train = train[target].values
history = model.fit(
    train_model_input,
    y_train,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Predict on the held-out test rows (256 is passed positionally - presumably
# the prediction batch size; confirm) and report log loss and AUC.
y_test = test[target].values
pred_ans = model.predict(test_model_input, 256)
print("")
test_logloss = round(log_loss(y_test, pred_ans), 4)
print("test LogLoss", test_logloss)
test_auc = round(roc_auc_score(y_test, pred_ans), 4)
print("test AUC", test_auc)

cuda:0
Train on 258745 samples, validate on 64687 samples, 1011 steps per epoch
Epoch 1/10
11s - loss:  0.4105 - binary_crossentropy:  0.4105 - auc:  0.7211 - val_binary_crossentropy:  0.4056 - val_auc:  0.7281
Epoch 2/10
11s - loss:  0.4004 - binary_crossentropy:  0.4005 - auc:  0.7429 - val_binary_crossentropy:  0.4045 - val_auc:  0.7300
Epoch 3/10
11s - loss:  0.3972 - binary_crossentropy:  0.3972 - auc:  0.7496 - val_binary_crossentropy:  0.4047 - val_auc:  0.7294
Epoch 4/10
11s - loss:  0.3952 - binary_crossentropy:  0.3952 - auc:  0.7530 - val_binary_crossentropy:  0.4054 - val_auc:  0.7288
Epoch 5/10
11s - loss:  0.3936 - binary_crossentropy:  0.3936 - auc:  0.7557 - val_binary_crossentropy:  0.4058 - val_auc:  0.7284
Epoch 6/10
11s - loss:  0.3925 - binary_crossentropy:  0.3925 - auc:  0.7573 - val_binary_crossentropy:  0.4064 - val_auc:  0.7275
Epoch 7/10
11s - loss:  0.3915 - binary_crossentropy:  0.3915 - auc:  0.7592 - val_binary_crossentropy:  0.4069 - val_auc:  0.7269
Epo

6. Define, train, and evaluate the model - xDeepFM.

In [None]:
# Build a binary-classification xDeepFM over the shared feature columns, with
# a small L2 penalty on the embeddings, on the selected device.
model = xDeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# Train for 10 epochs, keeping 20% of the training rows aside for validation.
# NOTE(review): in the logged run val_binary_crossentropy rises after epoch 2
# while the training loss keeps falling - consider fewer epochs or early
# stopping.
y_train = train[target].values
history = model.fit(
    train_model_input,
    y_train,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Predict on the held-out test rows (256 is passed positionally - presumably
# the prediction batch size; confirm) and report log loss and AUC.
y_test = test[target].values
pred_ans = model.predict(test_model_input, 256)
print("")
test_logloss = round(log_loss(y_test, pred_ans), 4)
print("test LogLoss", test_logloss)
test_auc = round(roc_auc_score(y_test, pred_ans), 4)
print("test AUC", test_auc)

cuda:0
Train on 258745 samples, validate on 64687 samples, 1011 steps per epoch
Epoch 1/10
12s - loss:  0.4098 - binary_crossentropy:  0.4098 - auc:  0.7226 - val_binary_crossentropy:  0.4052 - val_auc:  0.7280
Epoch 2/10
12s - loss:  0.4001 - binary_crossentropy:  0.4001 - auc:  0.7433 - val_binary_crossentropy:  0.4045 - val_auc:  0.7296
Epoch 3/10
13s - loss:  0.3969 - binary_crossentropy:  0.3969 - auc:  0.7497 - val_binary_crossentropy:  0.4052 - val_auc:  0.7294
Epoch 4/10
13s - loss:  0.3946 - binary_crossentropy:  0.3946 - auc:  0.7536 - val_binary_crossentropy:  0.4064 - val_auc:  0.7283
Epoch 5/10
12s - loss:  0.3929 - binary_crossentropy:  0.3929 - auc:  0.7569 - val_binary_crossentropy:  0.4066 - val_auc:  0.7276
Epoch 6/10
13s - loss:  0.3915 - binary_crossentropy:  0.3915 - auc:  0.7585 - val_binary_crossentropy:  0.4074 - val_auc:  0.7266
Epoch 7/10
13s - loss:  0.3904 - binary_crossentropy:  0.3904 - auc:  0.7605 - val_binary_crossentropy:  0.4079 - val_auc:  0.7256
Epo

7. Define, train, and evaluate the model - Wide and Deep.

In [None]:
# Build a binary-classification Wide & Deep (WDL) model over the shared
# feature columns, with a small L2 penalty on the embeddings, on the selected
# device.
model = WDL(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# Train for 10 epochs, keeping 20% of the training rows aside for validation.
# NOTE(review): in the logged run val_binary_crossentropy rises after epoch 2
# while the training loss keeps falling - consider fewer epochs or early
# stopping.
y_train = train[target].values
history = model.fit(
    train_model_input,
    y_train,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Predict on the held-out test rows (256 is passed positionally - presumably
# the prediction batch size; confirm) and report log loss and AUC.
y_test = test[target].values
pred_ans = model.predict(test_model_input, 256)
print("")
test_logloss = round(log_loss(y_test, pred_ans), 4)
print("test LogLoss", test_logloss)
test_auc = round(roc_auc_score(y_test, pred_ans), 4)
print("test AUC", test_auc)

cuda:0
Train on 258745 samples, validate on 64687 samples, 1011 steps per epoch
Epoch 1/10
10s - loss:  0.4107 - binary_crossentropy:  0.4107 - auc:  0.7208 - val_binary_crossentropy:  0.4057 - val_auc:  0.7279
Epoch 2/10
10s - loss:  0.4007 - binary_crossentropy:  0.4007 - auc:  0.7424 - val_binary_crossentropy:  0.4046 - val_auc:  0.7299
Epoch 3/10
10s - loss:  0.3975 - binary_crossentropy:  0.3975 - auc:  0.7490 - val_binary_crossentropy:  0.4047 - val_auc:  0.7293
Epoch 4/10
10s - loss:  0.3955 - binary_crossentropy:  0.3954 - auc:  0.7524 - val_binary_crossentropy:  0.4054 - val_auc:  0.7287
Epoch 5/10
10s - loss:  0.3939 - binary_crossentropy:  0.3939 - auc:  0.7552 - val_binary_crossentropy:  0.4058 - val_auc:  0.7283
Epoch 6/10
10s - loss:  0.3928 - binary_crossentropy:  0.3928 - auc:  0.7568 - val_binary_crossentropy:  0.4064 - val_auc:  0.7274
Epoch 7/10
10s - loss:  0.3918 - binary_crossentropy:  0.3918 - auc:  0.7586 - val_binary_crossentropy:  0.4069 - val_auc:  0.7269
Epo