In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import config

In [2]:
# Load the three raw tables by name through the project data helper.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
df_ad = du.load_raw_data("ad")

# Remember the row counts: the stacked frame built below places the train rows
# first, so train_size is the boundary between train and test rows.
train_size, test_size = df_train.shape[0], df_test.shape[0]
print(
    "Training data size: {}".format(train_size),
    "Testing data size: {}".format(test_size),
    sep="\n",
)

# Stack train on top of test with a fresh 0..n-1 index.
df_all = pd.concat(objs=[df_train, df_test], ignore_index=True)

Training data size: 8798814
Testing data size: 2265879


In [3]:
# Feature-name lists are taken from the shared project config module.
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES
user_feat_names = config.USER_FEAT_NAMES

# Build a filtered copy rather than calling .remove() on config.AD_FEAT_NAMES.
# The in-place removal mutated the list owned by the config module (visible to
# every other importer) and raised ValueError when this cell was executed a
# second time in the same kernel, because "creativeId" was already gone.
# NOTE(review): if any helper relied on config.AD_FEAT_NAMES having been
# mutated by this cell, pass it `ad_feat_names` explicitly instead.
ad_feat_names = [name for name in config.AD_FEAT_NAMES if name != "creativeId"]

In [6]:
# Step 1: build one feature matrix covering every row of df_all (train + test).
with pu.profiler("loading and joining user and ad data"):
    X_all, feat_names = du.quick_join(
        ad_user=df_all,
        user_feat_names=user_feat_names,
        ad_feat_names=ad_feat_names,
    )
    # The join must yield exactly one matrix row per input row.
    assert X_all.shape[0] == df_all.shape[0]
    gc.collect()
print("Combined Matrix Shape: {}".format(X_all.shape))

# Step 2: cut the stacked matrix back into its train and test parts.
with pu.profiler("splitting train and test data"):
    # Convert to CSR before row slicing (the original note reads "COO to CSR").
    X_all = sparse.csr_matrix(X_all)
    # Train rows come first because df_all was built as [train, test].
    X_tv, X_test = X_all[:train_size], X_all[train_size:]
    assert (X_tv.shape[0], X_test.shape[0]) == (df_train.shape[0], df_test.shape[0])
    # Drop the combined matrix so only the two slices stay in memory.
    del X_all
    gc.collect()
print("Train Matrix Shape: {}".format(X_tv.shape))
print("Test Matrix Shape: {}".format(X_test.shape))

Combined Matrix Shape: (11064693, 419702)
[04:33:23] Finish splitting train and test data. △M: -3.19GB. △T: 35.6 seconds.
Train Matrix Shape: (8798814, 419702)
Test Matrix Shape: (2265879, 419702)


In [7]:
# Persist both matrices into the configured input directory. Each pickle holds
# a (feat_names, matrix) tuple so the column names travel with the data.
save_folder = config.INPUT_DIR
os.makedirs(save_folder, exist_ok=True)

for save_file, matrix in (
    ("train.raw.binary.pkl", X_tv),
    ("test2.raw.binary.pkl", X_test),
):
    save_path = os.path.join(save_folder, save_file)
    du.save_pickle((feat_names, matrix), save_path)
# Remove the loop-only alias so the notebook namespace matches the unrolled form.
del matrix