In [None]:
import time, numpy as np, pandas as pd, gc
from scipy.sparse import csr_matrix, hstack

# Wall-clock reference used by the progress messages printed throughout the notebook.
start_time = time.time()
# Seed NumPy's legacy global RNG.
np.random.seed(32)
from sklearn.model_selection import train_test_split

# Compact unsigned dtypes passed to every pd.read_csv call on the competition
# files, so the integer id columns are not parsed with pandas' default dtype.
dtypes = {"ip": "uint32",
         "app": "uint16",
         "device": "uint16",
         "os": "uint16",
         "channel": "uint16",
         "is_attributed": "uint8",
         "click_id": "uint32"}

In [None]:
# Directory holding the competition CSV files (train.csv, test.csv, test_supplement.csv).
path = "../input/talkingdata-adtracking-fraud-detection/"
# skiprows = range(1, 68000000) keeps the header (file line 0) and drops file
# lines 1 .. 67,999,999, so only the tail of train.csv is parsed.
# NOTE(review): presumably chosen so that the rows of days 8 and 9 used below
# are retained - confirm against the raw file.
train = pd.read_csv(path + "train.csv", low_memory = True, skiprows = range(1, 68000000), 
                    engine = "c", dtype = dtypes,
#                     nrows = 90000000,
                    usecols = ["ip", "app", "device", "os", "channel", "click_time", "is_attributed"])
print("Load train data [{}] completed!".format(time.time() - start_time))

In [None]:
from contextlib import contextmanager


@contextmanager
def timer(name):
    """Context manager that reports how long the wrapped block took.

    On normal exit of the ``with`` body it prints ``[<name>] done in <N>s``
    with the elapsed wall-clock time rounded to whole seconds. Nothing is
    printed when the body raises, because the exception propagates through
    the ``yield`` before the print is reached.

    Args:
        name: label shown in the printed message.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f"[{name}] done in {elapsed:.0f}s")

In [None]:
def process_time(data):
    """Derive ``day`` and ``hour`` columns from the ``click_time`` column.

    The columns are written into ``data`` in place as uint8; ``click_time``
    itself is left untouched. The same frame is returned for convenience.

    Args:
        data: DataFrame with a ``click_time`` column parseable by ``pd.to_datetime``.

    Returns:
        The input frame, now carrying ``day`` (day of month) and ``hour``.
    """
    stamps = pd.to_datetime(data["click_time"])
    for part in ("day", "hour"):
        data[part] = getattr(stamps.dt, part).astype("uint8")
    # Release the parsed timestamps right away; they are only needed above.
    del stamps
    gc.collect()
    return data

def _select_day(frame, day, min_hour = 4):
    """Return ``(features, target)`` for the rows of ``frame`` on one day.

    Args:
        frame: DataFrame with ``day``, ``hour`` and ``is_attributed`` columns.
        day: day-of-month value to keep.
        min_hour: rows with ``hour`` below this value are discarded.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the selected rows without the
        ``is_attributed`` column and ``y`` is that column for the same rows.
    """
    # One combined mask instead of ``frame[a].loc[b]``: no intermediate copy,
    # and no reliance on aligning a mask built from the parent frame against
    # an already filtered frame.
    keep = (frame["day"] == day) & (frame["hour"] >= min_hour)
    rows = frame.loc[keep]
    # Non-inplace drop, so the filtered selection is never mutated in place.
    return rows.drop(columns = ["is_attributed"]), rows["is_attributed"]


with timer("Split training set"):
    train = process_time(train)
    # Day 8 (from 04:00) is the training split, day 9 (from 04:00) the validation split.
    X_train, Y_train = _select_day(train, day = 8)
    X_valid, Y_valid = _select_day(train, day = 9)

    print("Train set shape {}".format(X_train.shape))
    print("Valid set shape {}".format(X_valid.shape))
    # The full frame is no longer needed once both splits are materialised.
    del train; gc.collect()

In [None]:
# Hour-of-day buckets consumed by feature_eng() to build the temporary
# "in_test_hour" grouping column.
# NOTE(review): the values are presumably taken from the hour distribution of
# the test set - confirm where they were derived.
most_freq_hours_in_test_data = [4, 5, 9, 10, 13, 14]
least_freq_hours_in_test_data = [6, 11, 15]

def add_count_feat(data, cols):
    """Add a column with the number of rows sharing each row's ``cols`` values.

    The column is named ``"_".join(cols) + "_count"`` and is written into
    ``data`` in place; the same frame is returned so calls can be chained.

    The group key is built by flattening the ``cols`` values into a single
    integer with ``np.ravel_multi_index``, so the columns must hold
    non-negative integers.

    Args:
        data: DataFrame containing the columns listed in ``cols``.
        cols: list of column names that together identify a group.

    Returns:
        ``data`` with the count column stored in the smallest of
        uint8 / uint16 / uint32 that can hold the largest count.
    """
    feat_name = "_".join(cols) + "_count"
    arr_slice = data[cols].values

    if arr_slice.shape[0] == 0:
        # max() over zero rows raises; an empty frame just gets an empty column.
        data[feat_name] = np.zeros(0, dtype = "uint32")
        return data

    # Widen to int64 *before* adding 1. In the columns' own unsigned dtype the
    # "+ 1" wraps to 0 when a column reaches its dtype maximum (255 for uint8,
    # 65535 for uint16), which yields an invalid dimension.
    dims = arr_slice.max(0).astype(np.int64) + 1
    group_ids = np.ravel_multi_index(arr_slice.T, dims)
    _, unqtags, counts = np.unique(group_ids, return_inverse = True, return_counts = True)
    # unqtags maps each row to its group, so this broadcasts group sizes back to rows.
    data[feat_name] = counts[unqtags]
    del arr_slice, dims, group_ids, unqtags, counts; gc.collect()

    # Downcast to the narrowest unsigned integer type that fits.
    dtype = "uint32"
    max_value = data[feat_name].max()
    if max_value <= 255:
        dtype = "uint8"
    elif max_value <= 65535:
        dtype = "uint16"
    data[feat_name] = data[feat_name].astype(dtype, copy = False)
    return data
    
def add_unique_feat(data, cols):
    """Add a column counting distinct values of the last column per group.

    Rows are grouped by every column in ``cols`` except the last; the number
    of distinct values of the last column within each group is joined back
    onto every row as ``"_".join(cols) + "_unique"``.

    Args:
        data: DataFrame containing the columns listed in ``cols``.
        cols: list of column names; ``cols[:-1]`` are the grouping keys and
            ``cols[-1]`` is the column whose distinct values are counted.

    Returns:
        A new DataFrame (the result of a left merge) with the extra column
        stored in the smallest of uint8 / uint16 / uint32 that fits.
    """
    feat_name = "_".join(cols) + "_unique"
    group_cols, counted_col = cols[:-1], cols[-1]

    distinct_per_group = data[cols].groupby(by = group_cols)[counted_col].nunique()
    lookup = distinct_per_group.reset_index().rename(columns = {counted_col: feat_name})
    data = data.merge(lookup, on = group_cols, how = "left", copy = False)
    del distinct_per_group, lookup
    gc.collect()

    # Pick the narrowest unsigned integer type able to hold the largest value.
    largest = data[feat_name].max()
    if largest <= 255:
        narrow_dtype = "uint8"
    elif largest <= 65535:
        narrow_dtype = "uint16"
    else:
        narrow_dtype = "uint32"
    data[feat_name] = data[feat_name].astype(narrow_dtype, copy = False)
    return data

def feature_eng(data):
    """Attach the group-count and distinct-count features to ``data``.

    A temporary ``in_test_hour`` bucket is derived from ``hour``: 1 for hours
    in ``most_freq_hours_in_test_data``, 2 for hours in
    ``least_freq_hours_in_test_data`` and 3 for every other hour. It is used
    as a grouping key and dropped again, together with ``day``, before
    returning.

    The input frame is modified along the way (columns are added to it), but
    the returned frame is a new object because the distinct-count features
    are attached through a merge.

    Args:
        data: DataFrame with ``ip``, ``app``, ``device``, ``os``, ``channel``,
            ``hour`` and ``day`` columns.

    Returns:
        DataFrame carrying the ``*_count`` and ``*_unique`` feature columns.
    """
    hours = data["hour"]
    in_frequent_hour = hours.isin(most_freq_hours_in_test_data)
    in_rare_hour = hours.isin(least_freq_hours_in_test_data)
    data["in_test_hour"] = (3 - 2*in_frequent_hour - 1*in_rare_hour).astype("uint8")

    # Disabled experiments: counts over ["ip", "app"], ["ip", "app", "os", "hour"]
    # and ["ip", "channel"]; distinct counts over ["app", "channel"].
    count_groups = (
        ["ip", "in_test_hour"],
        ["ip", "hour"],
        ["ip", "device"],
        ["ip", "app", "os"],
        ["app", "channel"],
    )
    unique_groups = (
        ["ip", "channel"],
        ["ip", "app"],
        ["ip", "device", "os", "app"],
    )
    for group in count_groups:
        data = add_count_feat(data, group)
    for group in unique_groups:
        data = add_unique_feat(data, group)

    # Helper columns are not model inputs.
    data.drop(["day", "in_test_hour"], axis = 1, inplace = True)
    gc.collect()

    return data

# Build the engineered features for both splits. feature_eng() is called on
# each frame separately, so the group statistics of one split are computed
# only from that split's own rows.
with timer("Feature engineering"):
    X_train = feature_eng(X_train)
    print("Train data completed!")
    X_valid = feature_eng(X_valid)
    print("Valid data completed!")

In [None]:
# Directory with the precomputed next-click files (also used by the predict cell).
click_path = "../input/fork-of-next-click-data-set-day-8/"

with timer("Add clicks"):
    # The files are attached positionally (via .values), so each one must list
    # its rows in the same order as the frame it is assigned to.
    for frame, file_name in ((X_train, "train_next_clicks.csv.gz"),
                             (X_valid, "valid_next_clicks.csv.gz")):
        next_clicks = pd.read_csv(click_path + file_name, low_memory = True, engine = "c",
                                  dtype = {"next_clicks": "uint32"})
        frame["next_clicks"] = next_clicks.values
        del next_clicks
        gc.collect()
    # Drop the loop alias so it does not keep a feature frame alive later on.
    del frame, file_name

In [None]:
with timer("Add time delta"):
    time_path = "../input/bidirectional-talkingdata-train-time-deltas/"
    # Only "forward_time_delta" is used, so parse just that column and read
    # straight into a DataFrame (read_csv already returns one; the extra
    # pd.DataFrame(...) wrapper was redundant). Values are attached
    # positionally, so the file must list rows in the frame's row order.
    time_delta = pd.read_csv(time_path + "train_time_delta.csv.gz", low_memory = True, engine = "c",
                             usecols = ["forward_time_delta"])
    X_train["forward_time_delta"] = time_delta["forward_time_delta"].values.astype("uint32")
    del time_delta; gc.collect()

    time_delta = pd.read_csv(time_path + "valid_time_delta.csv.gz", low_memory = True, engine = "c",
                             usecols = ["forward_time_delta"])
    X_valid["forward_time_delta"] = time_delta["forward_time_delta"].values.astype("uint32")
    del time_delta; gc.collect()

In [None]:
import lightgbm as lgbm

# Columns fed to the model, in the order LightGBM will see them. Commented-out
# entries are features that are currently disabled; note that
# "forward_time_delta" is disabled here, so it is not used by the model.
predictors = ["app", "device", "os", "channel", "hour",
              "ip_in_test_hour_count", 
              "ip_hour_count", 
#               "ip_app_count",
              "ip_device_count",
              "ip_app_os_count",
#               "ip_app_os_hour_count", 
              "app_channel_count",
#               "ip_channel_count",
              "ip_channel_unique",
              "ip_app_unique",
              "ip_device_os_app_unique",
#               "app_channel_unique",
              "next_clicks",
#               "forward_time_delta"
#               "app_mean_target", "os_mean_target", "device_mean_target", "channel_mean_target"
             ]
# Subset of predictors that LightGBM is told to treat as categorical.
categorical = ["app", "device", "os", "channel", "hour"]

# LightGBM training parameters (binary objective, AUC as evaluation metric).
params = {"boosting_type": "gbdt",
          "objective": "binary",
          "metric": "auc",
          "learning_rate": 0.1,
          "num_leaves": 26,
          "max_depth": 4, 
          "min_child_samples": 20,
          "max_bin": 100,
#           "feature_fraction": 1,
          "bagging_fraction": 0.9,
          "bagging_freq": 5, 
#           "subsample": 0.8,
#           "subsample_freq": 1,
          "colsample_bytree": 0.9,
          "min_child_weight": 0,
          "min_split_gain": 0,
#           "reg_alpha": 0,
#           "reg_lambda": 0,
          "nthread": 8,
          "verbose": 0,
          "scale_pos_weight": 200}

# Upper bound on boosting rounds; early stopping below may end training sooner.
max_rounds = 1000
# Filled by lgbm.train with the per-iteration metric values of each eval set.
evals_results = {}

with timer("Train"):
    # Replace the frames by dense float32 matrices restricted to the predictors.
    X_train = X_train[predictors].values.astype(np.float32)
    X_valid = X_valid[predictors].values.astype(np.float32)
    dtrain = lgbm.Dataset(X_train, label = Y_train,
                          feature_name = predictors, categorical_feature = categorical)
    # Drop each raw matrix and label as soon as its Dataset has been created.
    del X_train, Y_train; gc.collect()
    dvalid = lgbm.Dataset(X_valid, label = Y_valid,
                          feature_name = predictors, categorical_feature = categorical)
    del X_valid, Y_valid; gc.collect()
    
    print("Start building lgbm model [{}]".format(time.time() - start_time))
    # NOTE(review): evals_result / early_stopping_rounds / verbose_eval are
    # keyword arguments of lgbm.train that newer LightGBM releases replace
    # with callbacks - confirm against the installed LightGBM version.
    model = lgbm.train(params, dtrain, valid_sets = [dtrain, dvalid], valid_names = ["train", "valid"],
                       evals_result = evals_results, num_boost_round = max_rounds, early_stopping_rounds = 30, 
                       verbose_eval = 10, feval = None)
    del dtrain, dvalid; gc.collect()
    
    # Iteration reported by the model as best; reused by the predict cell.
    n_estimator = model.best_iteration
    print("\nModel Report")
    print("n_estimators is {}".format(n_estimator))
    # The recorded metric list is 0-based, hence the "- 1".
    print("cv auc is {}".format(evals_results["valid"]["auc"][n_estimator-1]))
    print("Training [{}] completed!".format(time.time() - start_time))

In [None]:
with timer("Predict"):
    # Load the supplementary test file and keep only the rows whose day is 10.
    X_test = pd.read_csv(path + "test_supplement.csv", low_memory = True, engine = "c", dtype = dtypes,
                         usecols = ["ip", "app", "device", "os", "channel", "click_time", "click_id"])
    X_test = process_time(X_test)
    X_test = X_test[X_test["day"] == 10]
    print("Test set shape {}".format(X_test.shape))
    
    # Same feature pipeline as for the train / valid splits.
    X_test = feature_eng(X_test)
    print("Test data completed!")

    # Attached positionally (via .values): the file must list its rows in the
    # same order as X_test at this point.
    test_next_clicks = pd.read_csv(click_path + "test_next_clicks.csv.gz", low_memory = True, engine = "c",
                               dtype = {"next_clicks": "uint32"})
    X_test["next_clicks"] = test_next_clicks.values
    del test_next_clicks; gc.collect()

    # Disabled: forward_time_delta is not loaded for the test rows.
#     time_path = "../input/bidirectional-talkingdata-test-time-deltas/"
#     time_delta = pd.DataFrame(pd.read_csv(time_path + "test_time_delta.csv.gz", low_memory = True, engine = "c"))
#     X_test["forward_time_delta"] = time_delta["forward_time_delta"].values.astype("uint32")
#     del time_delta; gc.collect()
    
    # Score with the iteration count stored by the training cell; the
    # predictions are kept in an "is_attributed" column for the later merge.
    X_test["is_attributed"] = model.predict(X_test[predictors].values, num_iteration = n_estimator)
    
# Total elapsed time since the first cell, in hours.
print("Train and predict [{}] completed!".format((time.time() - start_time)/3600))

In [None]:
# Predictions were made on test_supplement.csv; they are mapped onto the rows
# of test.csv by matching the full click description.
join_cols = ["ip", "app", "device", "os", "channel", "click_time"]
all_cols = join_cols + ["is_attributed"]
test = pd.read_csv(path + "test.csv", low_memory = True, engine = "c", dtype = dtypes,
                   usecols = ["ip", "app", "device", "os", "channel", "click_time", "click_id"])

# Keep one prediction per distinct click description *before* merging.
# Merging against duplicated keys multiplies rows (many-to-many) only for the
# extras to be thrown away by the click_id de-duplication below; keeping the
# first occurrence selects the same row that de-duplication would have kept.
predictions = X_test[all_cols].drop_duplicates(subset = join_cols, keep = "first")
del X_test; gc.collect()
test = test.merge(predictions, how = "left", on = join_cols)
del predictions; gc.collect()

# Safety net: guarantees one row per click_id in the submission.
test = test.drop_duplicates(subset = ["click_id"])

# A left merge leaves NaN where a test click has no match; surface that
# instead of silently writing empty predictions.
n_missing = int(test["is_attributed"].isna().sum())
if n_missing:
    print("WARNING: {} test rows have no matching prediction".format(n_missing))

print("Writing the submission data into a csv file...")
test[["click_id", "is_attributed"]].to_csv("submission.csv", index = False)
print("Completed writing lgbm submission file [{}]".format(time.time() - start_time))

In [None]:
# Preview the first rows of the merged test frame that was written to submission.csv.
test.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the trained model's feature importances.
plt.figure(figsize = (8, 8))
# x / y must be passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x = model.feature_importance(), y = model.feature_name())
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("LightGBM feature importance")
plt.show()