In [None]:
import time, numpy as np, pandas as pd, gc
from scipy.sparse import csr_matrix, hstack

# Wall-clock reference used by the progress messages printed in later cells.
start_time = time.time()
np.random.seed(32)
from sklearn.model_selection import train_test_split

# Compact unsigned dtypes for the integer columns. "click_time" is not listed
# here, so the parser picks its dtype.
dtypes = dict(
    ip="uint32",
    app="uint16",
    device="uint16",
    os="uint16",
    channel="uint16",
    is_attributed="uint8",
    click_id="uint32",
)

train_columns = ["ip", "app", "device", "os", "channel", "click_time", "is_attributed"]

# skiprows is 0-indexed over file lines: line 0 (the header) is kept and
# lines 1 .. 67,999,999 are skipped, so only the tail of the file is loaded.
train = pd.read_csv(
    "../input/train.csv",
    engine="c",
    skiprows=range(1, 68000000),
    dtype=dtypes,
    usecols=train_columns,
)

load_elapsed = time.time() - start_time
print("Load train data [{}] completed!".format(load_elapsed))

In [None]:
def process_time(data):
    """Parse ``click_time`` and derive ``day`` / ``hour`` columns in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is mutated, not copied), with
        ``click_time`` converted to datetimes and two new ``uint8`` columns:
        ``day`` (day of month) and ``hour`` (hour of day).
    """
    data["click_time"] = pd.to_datetime(data["click_time"])

    # "click_time" is deliberately kept on the frame: cal_next_click reads it
    # later to compute epoch seconds.
    data["day"] = data["click_time"].dt.day.astype("uint8")
    data["hour"] = data["click_time"].dt.hour.astype("uint8")

    gc.collect()
    return data

train = process_time(train)

# Day 8 becomes the training split and day 9 the validation split; in both
# cases only rows with hour >= 4 are kept.
from_hour_4 = train["hour"] >= 4
X_train = train[(train["day"] == 8) & from_hour_4]
X_valid = train[(train["day"] == 9) & from_hour_4]

# Release the full frame (and the helper mask) before the next, memory-hungry step.
del train, from_hour_4; gc.collect()

for split_message, split_frame in (("Train set shape {}", X_train),
                                   ("Valid set shape {}", X_valid)):
    print(split_message.format(split_frame.shape))
del split_message, split_frame

split_elapsed = time.time() - start_time
print("Split train set [{}] completed!".format(split_elapsed))

In [None]:
def cal_next_click(data):
    """Add ``next_clicks``: seconds until the next click with the same key.

    The key is the exact combination ``(ip, app, device, os)``. For each row,
    the "next click" is the next row *in the frame's current order* that has
    the same key; the function does not sort, so rows are assumed to already
    be in chronological order (TODO confirm for every caller).

    Compared with hashing the key through the built-in ``hash`` into a fixed
    number of buckets, grouping on the real key columns:

    * is reproducible -- ``hash`` on ``str`` is salted per interpreter
      process, so bucket collisions (and therefore results) changed between
      runs;
    * is exact -- distinct keys can no longer collide and corrupt each
      other's gaps;
    * is vectorised -- no per-row Python loop.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
        datetime ``click_time`` column. Key columns are expected to contain
        no missing values.

    Returns
    -------
    pandas.DataFrame
        The same object, mutated in place: ``ip``, ``app``, ``device``,
        ``os`` and ``channel`` are dropped and three ``int64`` columns are
        appended, in this order:

        * ``category``    -- id of the row's key group (numbered in order of
          first appearance);
        * ``epochtime``   -- ``click_time`` as whole seconds since
          1970-01-01;
        * ``next_clicks`` -- ``epochtime`` of the next click in the same
          group minus this row's ``epochtime``; for the last click of a
          group, ``3000000000 - epochtime`` (the sentinel "no later click").
    """
    key_columns = ["ip", "app", "device", "os"]
    # Stand-in "next click" timestamp for rows that have no later click.
    no_next_click_epoch = 3000000000

    # Exact, deterministic group id per key combination.
    data["category"] = data.groupby(key_columns, sort=False).ngroup()
    data.drop(columns=key_columns + ["channel"], inplace=True); gc.collect()

    # Whole seconds since the Unix epoch. Dividing a timedelta by one second
    # is independent of the datetime column's storage resolution, unlike
    # viewing the raw integers and dividing by 10**9 (which assumes ns).
    data["epochtime"] = (
        (data["click_time"] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
    ).astype(np.int64)

    # Timestamp of the following row within each group; NaN on a group's last row.
    next_epoch = data.groupby("category", sort=False)["epochtime"].shift(-1)
    data["next_clicks"] = (
        next_epoch.fillna(no_next_click_epoch) - data["epochtime"]
    ).astype(np.int64)
    return data

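# NOTE: disabled experiment, kept for reference only. It calls `cal_prev_click`,
# which is not defined in the cells shown here, so it cannot be re-enabled as-is.
# def add_prev_click(data, hours):
#     prev_clicks = []
#     for h in hours:
#         sub_data = data[data["hour"] == h]
#         nc = cal_prev_click(sub_data)
#         prev_clicks.append(nc)
#     prev_clicks = np.concatenate(prev_clicks, axis = 0)
#     data["prev_clicks"] = list(prev_clicks)
#     return data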

# Compute the next-click feature for each split. cal_next_click mutates and
# returns the frame it is given, so the names are simply rebound.
X_train = cal_next_click(X_train)
print("Train data completed!")

X_valid = cal_next_click(X_valid)
print("Valid data completed!")

next_click_elapsed = time.time() - start_time
print("Add next click [{}] completed!".format(next_click_elapsed))

In [None]:
# Keep only the engineered feature and persist each split as a gzipped CSV
# (one "next_clicks" column, no index column).
X_train = X_train.reindex(columns=["next_clicks"])
X_train.to_csv("train_next_clicks.csv.gz", compression="gzip", index=False)

X_valid = X_valid.reindex(columns=["next_clicks"])
X_valid.to_csv("valid_next_clicks.csv.gz", compression="gzip", index=False)

In [None]:
# Same pipeline as for train/valid, applied to the supplementary test file.
test_columns = ["ip", "app", "device", "os", "channel", "click_time", "click_id"]
test = pd.read_csv(
    "../input/test_supplement.csv",
    low_memory=True,
    engine="c",
    dtype=dtypes,
    usecols=test_columns,
)

# process_time returns the very frame it was given, so dropping the name
# "test" only removes an alias; X_test still refers to the data.
X_test = process_time(test)
del test; gc.collect()

X_test = cal_next_click(X_test)
print("Test data completed!")

# Only the engineered feature is written out, in the file's row order.
X_test = X_test.reindex(columns=["next_clicks"])
X_test.to_csv("test_next_clicks.csv.gz", compression="gzip", index=False)