In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
from functools import partial
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import copy
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import ffm_utils as fu
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import data_jointer as dj
import data_pipeline as dp
import config

In [2]:
def feature_field(feat_name):
    """Return the part of ``feat_name`` that precedes its first underscore.

    If the name contains no underscore, the whole name is returned.
    """
    field, _, _ = feat_name.partition("_")
    return field

In [3]:
# Create the project's DataManager on the configured input directory, then
# request the loader for the "raw" / "binary" dataset variant from it.
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")

In [4]:
# Load the training feature matrix together with its column names and make
# sure it is held as a SciPy CSR matrix (needed for the row slicing below).
with pu.profiler("loading training data"):
    cols_train, train_matrix = bin_loader.load("train")
    X_tv = sparse.csr_matrix(train_matrix)
    # Drop the temporary name so only X_tv refers to the data before collecting.
    del train_matrix
    gc.collect()

[16:24:05] Finish loading training data. △M: +3.27GB. △T: 24.4 seconds.


In [5]:
# Field of every training column (the text before the first "_" in its name).
fields_train = [feature_field(col) for col in cols_train]
fields_unique = set(fields_train)

# Enumerate the fields in sorted order. Iterating the set directly would give
# an order that depends on per-process string hash randomization, so the field
# ids written to the FFM files could change between kernel restarts and stop
# matching a model trained from files produced in an earlier session.
field_to_index = {field: i for i, field in enumerate(sorted(fields_unique))}

# Column index -> "<field id>:<column index>:1", the token emitted for a
# column in the FFM text format (every value is written as 1).
col_index_to_string = {
    col_index: "{}:{}:{}".format(field_to_index[field], col_index, 1)
    for col_index, field in enumerate(fields_train)
}

formatter = fu.BinaryFFMFormatter(col_index_to_string)

In [6]:
# Raw train / test tables (train first, then test).
df_train, df_test = (du.load_raw_data(split) for split in ("train", "test"))

# Shift-and-halve the label column, then cast to int.
# Presumably this maps labels encoded as -1/+1 onto 0/1 -- confirm against the raw data.
y = ((df_train["label"].values.copy() + 1) / 2).astype(int)

In [7]:
# Stratified 80/20 shuffle splits; the fixed seed keeps them reproducible.
n_splits = 5
sss = StratifiedShuffleSplit(
    n_splits=n_splits,
    test_size=0.2,
    random_state=20180505,
)
# Materialise every (train_index, valid_index) pair up front.
split_indices = list(sss.split(df_train, y))

In [8]:
# Ad ids, aligned row-for-row with the label vector.
aids = df_train['aid'].values

with pu.profiler("splitting train/valid set"):
    # Only the first of the prepared splits is used.
    train_index, valid_index = split_indices[0]

    X_train = X_tv[train_index, :]
    X_valid = X_tv[valid_index, :]

    y_train = y[train_index]
    y_valid = y[valid_index]

    aids_train = aids[train_index]
    aids_valid = aids[valid_index]

    # Every row must land in exactly one of the two parts.
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]

    # The combined matrix is no longer needed; release it.
    del X_tv
    gc.collect()

[16:27:07] Finish splitting train/valid set. △M: +151.34MB. △T: 1.3 minutes.


In [10]:
# Output directory for the FFM text files.
ffm_folder = os.path.join(config.DATA_DIR, "ffm", config.PRELIMINARY_CONTEST_DATA_SUBDIR[1:])
os.makedirs(ffm_folder, exist_ok=True)
buffer_size = 1000


def _dump_ffm(file_name, X_part, y_part):
    """Write one (features, labels) pair to ``ffm_folder/file_name``; return the path."""
    target = os.path.join(ffm_folder, file_name)
    formatter.to_ffm(X_part, y_part, target, buffer_size)
    return target


# Training part first, then validation part.
ffm_file = "train.raw.binary.ffm_input"
ffm_path = _dump_ffm(ffm_file, X_train, y_train)

ffm_file = "valid.raw.binary.ffm_input"
ffm_path = _dump_ffm(ffm_file, X_valid, y_valid)

7039051it [59:35, 1968.65it/s]
1759763it [13:39, 2147.06it/s]


In [11]:
# Free the train/valid matrices before loading the test matrix.
with pu.profiler("releasing memory"):
    del X_train  # don't delete y
    del X_valid
    gc.collect()

with pu.profiler("loading testing data"):
    cols_test, X_test = bin_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

# The formatter's column -> "field:index:1" map was built from cols_train, so
# the test matrix must expose exactly the same columns in the same order;
# otherwise the written FFM file would silently describe the wrong features.
assert list(cols_test) == list(cols_train), \
    "test columns differ from the training columns used to build the formatter"

# The predictions are later attached to df_test row by row, so the matrix and
# the raw test table must have the same number of rows.
assert X_test.shape[0] == df_test.shape[0], \
    "test matrix has {} rows but df_test has {}".format(X_test.shape[0], df_test.shape[0])

# Placeholder labels: the test set has no ground truth, one zero is passed per row.
y_test = np.zeros(df_test.shape[0])

ffm_file = "test.raw.binary.ffm_input"
ffm_path = os.path.join(ffm_folder, ffm_file)
formatter.to_ffm(X_test, y_test, ffm_path, buffer_size)

[23:45:46] Finish releasing memory. △M: -3.25GB. △T: 0.6 seconds.


0it [00:00, ?it/s]

[23:45:49] Finish loading testing data. △M: +883.05MB. △T: 2.7 seconds.


2265989it [06:45, 5592.93it/s]


In [14]:
# Folder for FFM model files (relative to the notebook's working directory);
# created if it does not exist yet.
model_folder = "../../../model/ffm"
os.makedirs(model_folder, exist_ok=True)

In [18]:
# Folder holding the outputs of run "0511_v1"; created on demand.
subm_folder = os.path.join("../../../subm/ffm", "0511_v1")
os.makedirs(subm_folder, exist_ok=True)

In [19]:
# Training and prediction are run from the command line, not from this notebook.
# Prerequisite: compile ffm-train and ffm-predict and place them in xxx/TencentAlgo2018/bin/.
# Training command (run it from xxx/TencentAlgo2018/data/ffm/preliminary_contest_data/):
# `./../../../bin/ffm-train -l 0.00002 -k 8 -r 0.05 -s 4 -t 25 -p ./valid.raw.binary.ffm_input ./train.raw.binary.ffm_input ./../../../model/ffm/0511_v1.ffm_model`
# Prediction command for the test set:
# `./../../../bin/ffm-predict ./test.raw.binary.ffm_input ./../../../model/ffm/0511_v1.ffm_model ./../../../subm/ffm/0511_v1/submission.ffm_output`
# Note: the validation predictions read further below (valid.ffm_output in the same subm folder) are not covered by
# the commands above; presumably they come from running ffm-predict on ./valid.raw.binary.ffm_input in the same way.

In [20]:
# Test-set predictions written by ffm-predict: one float per line.
pred_file = "submission.ffm_output"
pred_path = os.path.join(subm_folder, pred_file)

with open(pred_path, "r") as f:
    preds = [float(row.strip()) for row in f]

print("Prediction Length: {}".format(len(preds)))

Prediction Length: 2265989


In [22]:
# Reload the raw test table and attach the predictions as a "score" column.
df_test = du.load_raw_data("test")

subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)

# assign() works on a copy, so df_test itself is left untouched.
subm = df_test.assign(score=preds)
subm.to_csv(subm_path, index=False)

In [23]:
# Validation-set predictions: one float per line, collected into a NumPy array.
pred_file = "valid.ffm_output"  # prediction of validation
pred_path = os.path.join(subm_folder, pred_file)

with open(pred_path, "r") as f:
    valid_proba = np.array([float(row.strip()) for row in f])

print("Validation Prediction Length: {}".format(len(valid_proba)))

Validation Prediction Length: 1759763


In [26]:
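# Disabled: y_train and y_valid are already defined by the train/valid split cell; re-enable only to rebuild them from y.
# y_train, y_valid = y[train_index], y[valid_index]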

In [27]:
# Score the validation predictions with the project's online_auc helper,
# passing the validation ad ids, labels and predicted probabilities.
# With ret_verbose=True the result displayed below is a table with one AUC per aid.
df_score = eu.online_auc(aids_valid, y_valid, valid_proba, ret_verbose=True)
df_score

Unnamed: 0,aid,auc
0,6,0.676434
1,7,0.803015
2,12,0.851404
3,18,0.577215
4,70,0.858886
5,74,0.692595
6,86,0.586467
7,98,0.776898
8,113,0.541335
9,117,0.604999


In [28]:
# "Online" AUC = mean of the per-aid AUCs; "overall" AUC = one AUC over all validation rows.
online_auc = df_score['auc'].mean()
overall_auc = metrics.roc_auc_score(y_valid, valid_proba)

for report_line, auc_value in (("Online AUC: {:.6f}", online_auc),
                               ("Overall AUC: {:.6f}", overall_auc)):
    print(report_line.format(auc_value))

Online AUC: 0.726485
Overall AUC: 0.730530


In [29]:
# Persist the per-aid AUC table of run v1, best aid first.
log_folder = os.path.join(config.LOG_DIR, 'ffm/pipeline/0511/')
log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

df_score = (
    df_score
    .rename(columns={'selector': 'aid'})      # normalise the id column name if needed
    [['aid', 'auc']]                          # fix the column order
    .sort_values("auc", ascending=False)
)
df_score.to_csv(log_path, index=False)

In [31]:
# Re-save the LightGBM per-aid AUC log (run 0507 v2) sorted by AUC, best first.
# Both paths are built from config.LOG_DIR instead of a user-specific absolute
# path, so the cell works on other machines and matches the other log cells.
# NOTE(review): assumes config.LOG_DIR is the project's log/ directory (the
# parent of lgbm/pipeline/0507/) -- confirm before relying on this.
lgbm_log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0507/')
path = os.path.join(lgbm_log_folder, 'v2.online_auc.csv')

df_tmp = pd.read_csv(path)
df_tmp = df_tmp.sort_values("auc", ascending=False)
df_tmp.to_csv(os.path.join(lgbm_log_folder, 'v2.online_auc.sorted.csv'),
              index=False)

In [32]:
# Switch to the folder holding the outputs of run "0511_v2"; created on demand.
subm_folder = os.path.join("../../../subm/ffm", "0511_v2")
os.makedirs(subm_folder, exist_ok=True)

In [33]:
# Validation-set predictions of the current run: one float per line.
pred_file = "valid.ffm_output"  # prediction of validation
pred_path = os.path.join(subm_folder, pred_file)

with open(pred_path, "r") as f:
    valid_proba = np.array(list(map(float, map(str.strip, f))))

print("Validation Prediction Length: {}".format(len(valid_proba)))

# Per-aid AUC table for these predictions, displayed as the cell output.
df_score = eu.online_auc(aids_valid, y_valid, valid_proba, ret_verbose=True)
df_score

Validation Prediction Length: 1759763


Unnamed: 0,aid,auc
0,6,0.674348
1,7,0.804496
2,12,0.851126
3,18,0.577563
4,70,0.859005
5,74,0.694430
6,86,0.585513
7,98,0.777271
8,113,0.542094
9,117,0.605536


In [34]:
# Overall AUC over all validation rows, and the mean of the per-aid AUCs.
overall_auc = metrics.roc_auc_score(y_valid, valid_proba)
online_auc = df_score['auc'].mean()

print(
    "Online AUC: {:.6f}".format(online_auc),
    "Overall AUC: {:.6f}".format(overall_auc),
    sep="\n",
)

Online AUC: 0.726967
Overall AUC: 0.730872


In [35]:
# Persist the per-aid AUC table of run v2, best aid first.
log_folder = os.path.join(config.LOG_DIR, 'ffm/pipeline/0511/')
os.makedirs(log_folder, exist_ok=True)
log_file = 'v2.online_auc.csv'
log_path = os.path.join(log_folder, log_file)

# Normalise the id column name if needed, then keep only aid / auc in that order.
score_table = df_score.rename(columns={'selector': 'aid'}).loc[:, ['aid', 'auc']]
df_score = score_table.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

In [37]:
# Write the submission for the run that subm_folder currently points to.
#
# The test predictions are re-read from this run's own ffm-predict output
# first: `preds` was last loaded from the 0511_v1 folder, so reusing it here
# would save the v1 scores under the 0511_v2 submission.
test_pred_path = os.path.join(subm_folder, "submission.ffm_output")
with open(test_pred_path, "r") as f:
    preds = [float(row.strip()) for row in f]

# One prediction per test row is required before attaching the column.
assert len(preds) == df_test.shape[0], \
    "expected {} predictions, found {} in {}".format(df_test.shape[0], len(preds), test_pred_path)

subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)

subm = df_test.copy()
subm["score"] = preds
subm.to_csv(subm_path, index=False)

In [36]:
# Display the submission folder currently in use.
subm_folder

'../../../subm/ffm/0511_v2'