In [None]:
# Core dependencies, one import per line.
import time
import numpy as np
import pandas as pd
import gc

# Wall-clock reference used by the progress messages printed throughout the notebook.
start_time = time.time()

# Seed NumPy's global random generator.
np.random.seed(32)
from sklearn.model_selection import train_test_split

# Compact unsigned integer dtype per CSV column, handed to pd.read_csv.
dtypes = dict(
    ip = "uint32",
    app = "uint16",
    device = "uint16",
    os = "uint16",
    channel = "uint16",
    is_attributed = "uint8",
    click_id = "uint32",
)

In [None]:
# Folder holding the competition CSV files.
path = "../input/talkingdata-adtracking-fraud-detection/"

# Training clicks: the header row (row 0) is kept while data rows 1..67,999,999 are skipped,
# so only the tail of the file is loaded. Add nrows=... to cap the row count further.
train = pd.read_csv(
    path + "train.csv",
    usecols = ["ip", "app", "device", "os", "channel", "click_time", "is_attributed"],
    dtype = dtypes,
    skiprows = range(1, 68000000),
    engine = "c",
    low_memory = True,
)

# Test clicks: same feature columns, with click_id in place of the is_attributed label.
test = pd.read_csv(
    path + "test.csv",
    usecols = ["ip", "app", "device", "os", "channel", "click_time", "click_id"],
    dtype = dtypes,
    engine = "c",
    low_memory = True,
)
print("Load data [{}] completed!".format(time.time() - start_time))

In [None]:
def process_time(data):
    """Replace the ``click_time`` column with ``hour`` and ``day`` columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``hour`` and ``day`` added as uint8 and
        ``click_time`` removed.
    """
    # Parse the timestamps once and reuse the result for both derived columns.
    click_time = pd.to_datetime(data["click_time"])
    data["hour"] = click_time.dt.hour.astype("uint8")
    data["day"] = click_time.dt.day.astype("uint8")
    data.drop(["click_time"], axis = 1, inplace = True)
    return data

# Keep the submission ids aside, then strip click_id from the test features.
keras_submission = pd.DataFrame({"click_id": test["click_id"].astype("uint32")})
test.drop(columns = ["click_id"], inplace = True)
X_test = test

train = process_time(train)

# Training rows: day 8, hours 4..15 (inclusive).
X_train = train.loc[(train["day"] == 8) & train["hour"].isin(set(range(4, 16)))]
Y_train = X_train["is_attributed"]

# Validation rows: day 9, hour 4 onwards.
X_valid = train.loc[(train["day"] == 9) & (train["hour"] >= 4)]
Y_valid = X_valid["is_attributed"]

X_test = process_time(X_test)

for split_label, split_frame in (("Train", X_train), ("Valid", X_valid), ("Test", X_test)):
    print((split_label + " set shape {}").format(split_frame.shape))
del split_label, split_frame
print("Split train set [{}] completed!".format(time.time() - start_time))

# Drop the names of the raw frames (X_test still refers to the test frame object).
del train, test
gc.collect()

In [None]:
from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    On normal exit of the ``with`` body it prints ``[name] done in Ns`` with
    the elapsed wall-clock seconds rounded to a whole number. If the body
    raises, the exception propagates and nothing is printed.
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    print(f"[{name}] done in {elapsed:.0f}s")

In [None]:
# Hour groups consumed by feature_eng to derive the "in_test_hour" code:
# hours in the first list map to 1, hours in the second list map to 2, any other hour maps to 3.
# Presumably picked from the hour distribution of the test set (not verifiable from this notebook).
most_freq_hours_in_test_data = [4, 5, 9, 10, 13, 14]
least_freq_hours_in_test_data = [6, 11, 15]

def add_count_feat(data, cols):
    """Add a column holding how often each row's ``cols`` combination occurs.

    The new column is named ``"<col1>_<col2>_..._count"``. ``data`` is modified
    in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``cols``; the values must be
        non-negative integers (they are used as multi-dimensional indices).
    cols : list of str
        Columns whose joint value defines a group.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the count column added.
    """
    feat_name = "_".join(cols) + "_count"

    # Nothing to count: max() over zero rows would raise, so add an empty column.
    if len(data) == 0:
        data[feat_name] = np.zeros(0, dtype=np.intp)
        return data

    keys = data[cols].values
    # Widen the per-column maxima before adding 1: in a narrow unsigned dtype
    # (e.g. uint8 holding 255) "max + 1" would wrap around to 0.
    dims = keys.max(axis=0).astype(np.int64) + 1
    # Collapse each row's key tuple into one integer so np.unique can group on it.
    flat_keys = np.ravel_multi_index(keys.T, dims)
    _, inverse, counts = np.unique(flat_keys, return_inverse=True, return_counts=True)
    data[feat_name] = counts[inverse]
    return data
    
def add_unique_feat(data, cols):
    """Attach the number of distinct values of the last column per group.

    Groups are defined by every column in ``cols`` except the last one; the
    last column is the one whose distinct values are counted. The result is
    joined back as ``"<col1>_..._<colN>_unique"`` with a left merge, so a new
    frame is returned and ``data`` itself is left untouched.
    """
    *group_cols, target_col = cols
    feat_name = "_".join(cols) + "_unique"

    per_group = data[cols].groupby(by = group_cols)[target_col].nunique()
    lookup = per_group.reset_index()
    lookup = lookup.rename(index = str, columns = {target_col: feat_name})

    return data.merge(lookup, on = group_cols, how = "left")

def feature_eng(data):
    """Add the count / unique features and drop the helper columns.

    Steps, in order:
      1. derive ``in_test_hour`` from ``hour`` (1 = hour listed in
         most_freq_hours_in_test_data, 2 = hour listed in
         least_freq_hours_in_test_data, 3 = any other hour);
      2. add five group-count columns via add_count_feat, each cast to a
         compact unsigned dtype;
      3. add ``ip_channel_unique`` via add_unique_feat;
      4. drop ``ip``, ``day`` and ``in_test_hour``.

    Returns the engineered frame. The frame passed in receives the columns
    from steps 1-2 in place, because add_count_feat mutates its argument.
    """
    hour = data["hour"]
    data["in_test_hour"] = (3 - 2*hour.isin(most_freq_hours_in_test_data)
                            - 1*hour.isin(least_freq_hours_in_test_data)).astype("uint8")

    # (grouping columns, resulting column name, storage dtype), in creation order.
    # NOTE(review): ip_app_os_hour_count is the only count narrowed to uint16;
    # a count above 65535 would wrap silently -- confirm that cannot happen.
    count_specs = (
        (["ip", "in_test_hour"], "ip_in_test_hour_count", "uint32"),
        (["ip", "hour"], "ip_hour_count", "uint32"),
        (["ip", "app", "os", "hour"], "ip_app_os_hour_count", "uint16"),
        (["app", "channel"], "app_channel_count", "uint32"),
        (["ip", "channel"], "ip_channel_count", "uint32"),
    )
    for group_cols, feat_name, feat_dtype in count_specs:
        data = add_count_feat(data, group_cols)
        data[feat_name] = data[feat_name].astype(feat_dtype)

    data = add_unique_feat(data, ["ip", "channel"])
    data["ip_channel_unique"] = data["ip_channel_unique"].astype("uint32")

    # These columns were only needed to build the features above.
    data.drop(["ip", "day", "in_test_hour"], axis = 1, inplace = True)
    gc.collect()

    return data

# Engineer features for each split separately (feature_eng is called once per frame,
# so every count / unique feature is computed within that split only).
# Each name is rebound to the frame returned by feature_eng.
with timer("Feature engineering"):
    X_train = feature_eng(X_train)
    print("Train data completed!")
    X_valid = feature_eng(X_valid)
    print("Valid data completed!")
    X_test = feature_eng(X_test)
    print("Test data completed!")

In [None]:
# Folder holding the "next_clicks" feature files (rebinds `path`).
path = "../input/next-click-data-set/"

with timer("Add clicks"):
    # Each file is attached positionally (by .values), so its rows are assumed to line up
    # one-to-one with the rows of the matching frame -- a length mismatch raises on assignment.
    for frame, file_name in ((X_train, "train_next_clicks.csv"),
                             (X_valid, "valid_next_clicks.csv"),
                             (X_test, "test_next_clicks.csv")):
        next_clicks = pd.read_csv(path + file_name, low_memory = True, engine = "c",
                                  dtype = {"next_clicks": "uint32"})
        frame["next_clicks"] = next_clicks.values
        del next_clicks
        gc.collect()
    # Do not keep extra references to the frames alive after the loop.
    del frame, file_name

In [None]:
def _max_index_plus_one(column):
    """Return (largest value of ``column`` across train, valid and test) + 1.

    The result is used as the table size of the Embedding layer for that
    feature, so it has to exceed every index that appears in any of the three
    splits.
    """
    return np.max([X_train[column].max(), X_valid[column].max(), X_test[column].max()]) + 1

# Embedding sizes for the categorical features.
max_app = _max_index_plus_one("app")
# Must read the test split's "device" column as well: a test device id above this
# bound would index past the end of the device embedding table.
max_dev = _max_index_plus_one("device")
max_os = _max_index_plus_one("os")
max_ch = _max_index_plus_one("channel")
max_h = _max_index_plus_one("hour")

# Embedding sizes for the engineered count / unique features.
max_c1 = _max_index_plus_one("ip_in_test_hour_count")
max_c2 = _max_index_plus_one("ip_hour_count")
max_c3 = _max_index_plus_one("ip_app_os_hour_count")
max_c4 = _max_index_plus_one("app_channel_count")
max_c5 = _max_index_plus_one("ip_channel_count")
max_c6 = _max_index_plus_one("ip_channel_unique")
max_c7 = _max_index_plus_one("next_clicks")

In [None]:
def get_keras_data(data):
    """Convert a feature frame into the ``{input name: array}`` dict fed to Keras.

    Only the columns listed below are exported, each as its own NumPy array.
    ``ip_in_test_hour_count`` and ``ip_channel_count`` are deliberately left
    out of the export.
    """
    model_inputs = (
        "app",
        "device",
        "os",
        "channel",
        "hour",
        "ip_hour_count",
        "ip_app_os_hour_count",
        "app_channel_count",
        "ip_channel_unique",
        "next_clicks",
    )
    return {input_name: np.array(data[input_name]) for input_name in model_inputs}

# Rebind each split from a DataFrame to the dict of arrays returned by get_keras_data.
# From here on X_train / X_valid / X_test are dicts keyed by model input name;
# build_model reads the module-level X_train in this form.
X_train = get_keras_data(X_train)
X_valid = get_keras_data(X_valid)
X_test = get_keras_data(X_test)
print("Get keras data [{}] completed!".format(time.time() - start_time))

In [None]:
from keras.layers import Input, Embedding, Dropout, Dense, concatenate, Conv1D, GRU
from keras.layers import SpatialDropout1D, BatchNormalization, Flatten, LSTM, Bidirectional
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from keras.models import Model, load_model

def build_model(emb_n = 50, dr = 0.3, dense_n = 1024, lr_i = 1e-3, lr_f = 1e-3, batch_size = 2**13,
                epochs = 2):
    """Build and compile the embedding + bidirectional-LSTM classifier.

    Every input feature is a single integer that goes through its own
    Embedding layer; the embeddings are concatenated and passed through
    SpatialDropout1D, a bidirectional LSTM, two Dense/Dropout stages and a
    sigmoid output unit.

    Reads the module-level names ``max_app``, ``max_dev``, ``max_ch``,
    ``max_os``, ``max_h``, ``max_c2``, ``max_c3``, ``max_c4``, ``max_c6``,
    ``max_c7`` (embedding table sizes) and ``X_train`` (for the sample count).

    Parameters
    ----------
    emb_n : int
        Output dimension of every Embedding layer.
    dr : float
        Rate used by the SpatialDropout1D and Dropout layers.
    dense_n : int
        Units of the first Dense layer.
    lr_i, lr_f : float
        Initial and target learning rate; they determine the ``decay`` value
        handed to Adam.
    batch_size : int
        Batch size used to derive the number of optimizer steps.
    epochs : int
        Number of training epochs used to derive the number of optimizer
        steps (defaults to 2, the value previously hard-coded here).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy and the accuracy metric.
    """
    # (input name, embedding table size), in the order the model expects its inputs.
    # "ip_in_test_hour_count" (max_c1) and "ip_channel_count" (max_c5) are not used as inputs.
    feature_sizes = [("app", max_app), ("device", max_dev), ("channel", max_ch),
                     ("os", max_os), ("hour", max_h),
                     ("ip_hour_count", max_c2), ("ip_app_os_hour_count", max_c3),
                     ("app_channel_count", max_c4), ("ip_channel_unique", max_c6),
                     ("next_clicks", max_c7)]
    inputs, embeddings = [], []
    for feat_name, table_size in feature_sizes:
        feat_in = Input(shape = [1], name = feat_name)
        inputs.append(feat_in)
        embeddings.append(Embedding(table_size, emb_n)(feat_in))

    main = concatenate(embeddings)
    main = SpatialDropout1D(dr)(main)
    main = Bidirectional(LSTM(32, dropout = 0.2, recurrent_dropout = 0.2))(main)
    main = Dense(dense_n, activation = "relu")(main)
    main = Dropout(dr)(main)
    main = Dense(32, activation = "relu")(main)
    main = Dropout(dr)(main)
    outp = Dense(1, activation = "sigmoid")(main)
    model = Model(inputs = inputs, outputs = outp)

    # Number of training samples. X_train is a dict of equally long arrays here, so take the
    # length of one array -- not len(list(X_train)[0]), which is the length of the first KEY.
    if isinstance(X_train, dict):
        n_samples = len(next(iter(X_train.values())))
    else:
        n_samples = len(X_train)
    steps = int(n_samples / batch_size) * epochs

    # Decay derived from the initial/final learning rates; with fewer than two steps the
    # exponent 1/(steps-1) is undefined or negative, so fall back to no decay.
    if steps > 1:
        lr_decay = (lr_i / lr_f) ** (1 / (steps - 1)) - 1
    else:
        lr_decay = 0.0

    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr_i, decay = lr_decay),
                  metrics = ["accuracy"])
    return model

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Checkpoint callback configured to keep only the weights with the lowest validation loss.
# NOTE(review): check_point is never handed to model.fit below (there is no callbacks=
# argument), so this cell does not write best_model.hdf5 -- confirm whether that is intended.
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
batch_size = 2**16
print("Trainning model [{}]".format(time.time() - start_time))
model = build_model(emb_n = 25, dr = 0.2, dense_n = 64, lr_i = 1e-3, lr_f = 1e-4, batch_size = batch_size)
# Two epochs with class weights 0.01 (class 0) and 0.99 (class 1).
model.fit(X_train, Y_train, batch_size = batch_size, epochs = 2, verbose = 10, 
          validation_data = (X_valid, Y_valid), class_weight = {0:0.01, 1:0.99},)
# Reloading from the checkpoint file is disabled (no checkpoint is written, see note above):
# model = load_model(file_path)
# Score the validation split with ROC AUC; [:, 0] takes the single output column.
oof = model.predict(X_valid, batch_size = batch_size, verbose = 10)[:, 0]
print("cv is {}".format(roc_auc_score(Y_valid, oof)))
del oof; gc.collect()
# Predictions for the test split, written to the submission file in a later cell.
keras_prediction = model.predict(X_test, batch_size = batch_size, verbose = 10)[:, 0]
# Elapsed time is reported in hours here (seconds / 3600).
print("Train and predict [{}] completed!".format((time.time() - start_time)/3600))

In [None]:
# Attach the test predictions to the click ids saved earlier and write the submission CSV
# (two columns: click_id, is_attributed; the index is not written).
keras_submission["is_attributed"] = keras_prediction
keras_submission.to_csv("keras_submission.csv", index = False)
print("Completed writing keras submission file [{}]".format(time.time() - start_time))

In [None]:
# Show the first rows of the submission frame as a quick sanity check.
keras_submission.head()