In [1]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from collections import Counter
from fastFM import sgd
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import joblib
import config

In [2]:
# Load the three raw tables through the project's data_utils loader,
# one call per dataset key, in the order train / test / ad.
df_train, df_test, df_ad = (
    du.load_raw_data(dataset_key) for dataset_key in ("train", "test", "ad")
)

In [3]:
# Row counts of the two splits; train_size is reused later as the boundary
# when the stacked feature matrix is cut back into train and test parts.
train_size, test_size = len(df_train), len(df_test)
for size_report in (
    "Training data size: {}".format(train_size),
    "Testing data size: {}".format(test_size),
):
    print(size_report)

Training data size: 8798814
Testing data size: 2265989


In [4]:
# Stack train on top of test with a fresh 0..n-1 index. The order matters:
# the feature matrix built from df_all is later split at row `train_size`.
df_all = pd.concat(objs=(df_train, df_test), ignore_index=True)

In [5]:
# Feature-name lists come from the shared project config module.
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES
user_feat_names = config.USER_FEAT_NAMES

# Exclude "creativeId" from the ad features used in this notebook.
# Build a filtered copy instead of calling `.remove()` on the config list:
# the old in-place removal mutated `config.AD_FEAT_NAMES` for every other
# module in the kernel and raised ValueError when this cell was re-run.
ad_feat_names = [name for name in config.AD_FEAT_NAMES if name != "creativeId"]

In [6]:
# One jointer per key column: user matrices are keyed by "uid", ad matrices
# by "aid". Each join() call is expected to return a matrix with one row per
# row of df_all (the combined shape printed below has train+test rows).
user_jointer = dj.PandasMatrixJointer("uid")
ad_jointer = dj.PandasMatrixJointer("aid")

# Running horizontally-stacked feature matrix and its column labels.
X_all = None
col_names = []

with pu.profiler("loading and joining user and ad data"):
    # ---- user features: TF-IDF matrices ----
    for user_feat_name in tqdm.tqdm(user_feat_names, desc="processing user features..."):
        row_uids, (word_to_index, word_idf, tfidf_matrix) = du.load_user_tfidf(user_feat_name)
        matrix_all = user_jointer.join(df_all, tfidf_matrix, row_uids)
        # the un-joined matrix is no longer needed; release it right away
        del tfidf_matrix
        gc.collect()

        # Stack incrementally so only the running matrix and the newest block
        # are alive at once (the first block simply becomes the running matrix).
        if X_all is not None:
            X_all = sparse.hstack((X_all, matrix_all))
            del matrix_all
            gc.collect()
        else:
            X_all = matrix_all

        # Column labels follow the matrix column order, i.e. words sorted by
        # their index in word_to_index.
        col_names.extend(
            "{}_{}".format(user_feat_name, word)
            for word in sorted(word_to_index, key=word_to_index.get)
        )

    # ---- ad features: count matrices ----
    for ad_feat_name in tqdm.tqdm(ad_feat_names, desc="processing ad features..."):
        row_aids, (words, cnt_matrix) = du.load_ad_cnt(ad_feat_name)
        matrix_all = ad_jointer.join(df_all, cnt_matrix, row_aids)
        del cnt_matrix
        gc.collect()

        if X_all is not None:
            X_all = sparse.hstack((X_all, matrix_all))
            del matrix_all
            gc.collect()
        else:
            X_all = matrix_all

        # `words` is used as-is for the column labels of this block.
        col_names.extend("{}_{}".format(ad_feat_name, word) for word in words)

# Sanity report: the column count must match the number of feature names.
print("Combined Matrix Shape: {}".format(X_all.shape))
print("Feature Names Count: {}".format(len(col_names)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

processing user features...: 100%|██████████| 23/23 [14:49<00:00, 38.66s/it]
processing ad features...: 100%|██████████| 7/7 [02:47<00:00, 23.91s/it]

[03:24:10] Finish loading and joining user and ad data. △M: +12.5GB. △T: 17.6 minutes.
Combined Matrix Shape: (11064803, 419701)
Feature Names Count: 419701
Memory usage at this moment :13.1GB





In [7]:
with pu.profiler("splitting train and test data"):
    # COO to CSR (as float32): CSR supports the row slicing done below.
    X_all = sparse.csr_matrix(X_all, dtype=np.float32)

    # df_all was built as train rows followed by test rows, so the first
    # `train_size` rows are the train/validation part and the rest is test.
    X_tv, X_test = X_all[:train_size], X_all[train_size:]
    assert X_tv.shape[0] == len(df_train)
    assert X_test.shape[0] == len(df_test)

    # The combined matrix is no longer needed once both slices exist.
    del X_all
    gc.collect()

    # Labels for the train/validation rows, in the same row order as X_tv.
    y = df_train["label"].values
    assert y.shape[0] == X_tv.shape[0]

print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[03:27:12] Finish splitting train and test data. △M: -3.28GB. △T: 1.2 minutes.
Memory usage at this moment :9.82GB


In [41]:
# Destination for the serialized train/validation feature matrix.
save_folder = "../data/input"
save_file = "train.raw.tfidf.pkl"
save_path = os.path.join(save_folder, save_file)
os.makedirs(save_folder, exist_ok=True)

# Ensure float32 without duplicating the matrix: the split cell already builds
# X_all with dtype=np.float32, so with copy=False this returns X_tv itself
# instead of allocating a second multi-GB copy just before serialization.
X_tv = X_tv.astype(np.float32, copy=False)
gc.collect()

# Persist (column names, matrix) together; joblib.dump returns the list of
# written file paths, which is shown as the cell output.
joblib.dump((col_names, X_tv), save_path)

['../data/input/train.raw.tfidf.pkl']

In [33]:
# Destination for the serialized test feature matrix; `save_folder` is
# defined (and created) by the train-matrix save cell above.
save_file = "test1.raw.tfidf.pkl"
save_path = os.path.join(save_folder, save_file)

# Ensure float32 without duplicating the matrix: the split cell already builds
# X_all with dtype=np.float32, so with copy=False this returns X_test itself
# instead of allocating a redundant copy just before serialization.
X_test = X_test.astype(np.float32, copy=False)
gc.collect()

# Persist (column names, matrix) together; joblib.dump returns the list of
# written file paths, which is shown as the cell output.
joblib.dump((col_names, X_test), save_path)

['../data/input/test1.raw.tfidf.pkl']

In [34]:
# Report the memory usage string from perf_utils after both matrices are saved.
memory_report = "Memory usage at this moment :{}".format(pu.get_memory_str())
print(memory_report)

Memory usage at this moment :10.13GB
