In [1]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from collections import Counter
from fastFM import sgd
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
# import data_jointer as dj
import config

In [2]:
# Load the "train", "test" and "ad" raw data through the project's
# data_utils loader, in that order.
df_train, df_test, df_ad = (
    du.load_raw_data(split) for split in ("train", "test", "ad")
)

In [3]:
# Row counts of the two splits. train_size is reused later as the cut point
# when the stacked feature matrix is divided back into train and test rows.
train_size, test_size = df_train.shape[0], df_test.shape[0]
print(
    "Training data size: {}".format(train_size),
    "Testing data size: {}".format(test_size),
    sep="\n",
)

Training data size: 8798814
Testing data size: 2265989


In [4]:
# Stack the train rows first and the test rows second under a fresh 0..n-1
# index, so the first train_size rows of df_all are the training rows.
df_all = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=True)

In [5]:
# Feature-name collections come straight from the project config module.
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES
user_feat_names = config.USER_FEAT_NAMES

# Work on a copy instead of aliasing config.AD_FEAT_NAMES: calling .remove()
# on the alias mutated the shared module-level object, so re-running this cell
# in the same kernel raised ValueError (the name was already gone) and any
# other reader of the config silently saw the shortened list.
# NOTE(review): presumably config.AD_FEAT_NAMES is a list; confirm nothing
# downstream relied on the config object itself losing "creativeId".
ad_feat_names = list(config.AD_FEAT_NAMES)
ad_feat_names.remove("creativeId")  # still raises if the name is truly absent

In [6]:
# Build the combined user + ad feature matrix for every row of df_all.
# The profiler context logs the elapsed time and memory delta on exit.
with pu.profiler("loading and joining user and ad data"):
    X_all, feat_names = du.quick_join(
        ad_user=df_all,
        user_feat_names=user_feat_names,
        ad_feat_names=ad_feat_names,
    )
    # The joined matrix must stay row-aligned with df_all, because the
    # train/test split later relies on row positions.
    assert df_all.shape[0] == X_all.shape[0]
    gc.collect()

print("Combined Matrix Shape: {}".format(X_all.shape))
print("Feature Names Count: {}".format(len(feat_names)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

loading user matrices: 100%|██████████| 23/23 [12:26<00:00, 32.45s/it]
loading ad matrices: 100%|██████████| 7/7 [02:00<00:00, 17.28s/it]

[07:44:25] Finish loading and joining user and ad data. △M: +7.28GB. △T: 14.5 minutes.
Combined Matrix Shape: (11064803, 419701)
Feature Names Count: 419701
Memory usage at this moment :7.88GB





In [7]:
# Cut the stacked matrix back into its training and testing rows.
with pu.profiler("splitting train and test data"):
    # Convert to CSR before slicing by row (per the original note, the
    # joined matrix arrives in COO format).
    X_all = sparse.csr_matrix(X_all)

    # Rows were stacked train-first, so train_size is the boundary.
    X_tv, X_test = X_all[:train_size, :], X_all[train_size:, :]
    assert df_train.shape[0] == X_tv.shape[0]
    assert df_test.shape[0] == X_test.shape[0]

    # Drop the full matrix so only the two slices remain referenced.
    del X_all
    gc.collect()
    print("Memory usage at this moment :{}".format(pu.get_memory_str()))

Memory usage at this moment :4.69GB
[07:44:54] Finish splitting train and test data. △M: -3.19GB. △T: 29.0 seconds.


In [8]:
# Training labels taken from the 'label' column; there must be exactly one
# label per row of the training feature matrix.
y = df_train.loc[:, 'label'].values
assert y.shape[0] == X_tv.shape[0]

In [10]:
# Output folder for the pickled feature matrices; create it if missing.
save_folder = "../../../data/input"
os.makedirs(save_folder, exist_ok=True)

# Full path of the training dump inside that folder.
save_file = "train.raw.binary.pkl"
save_path = os.path.join(save_folder, save_file)

# Persist the feature names and the training matrix together as one tuple.
du.save_pickle((feat_names, X_tv), save_path)

In [11]:
# Test-set dump: reuses save_folder defined in the previous cell, so that
# cell must have been run first; only the file name changes.
save_file = "test1.raw.binary.pkl"
save_path = os.path.join(save_folder, save_file)

# Same (feature names, matrix) tuple layout as the training dump.
du.save_pickle((feat_names, X_test), save_path)