In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append("../../../code/utils")
import data_utils as du

In [2]:
one_feat_names = ['age', 'gender', 'education', 'consumptionAbility', 'LBS',
                  'carrier', 'house']  # one user has only one value
multi_feat_names = ['marriageStatus', 'interest1', 'interest2', 'interest3',
                   'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1',
                   'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os']  # one user can have more than one value
feat_names = one_feat_names + multi_feat_names

In [3]:
df_train = du.load_raw_data("train")
y = df_train["label"]
y = (y + 1) / 2  # 1/-1 to 1/0

In [4]:
n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [5]:
df_log = pd.DataFrame(columns=["model", "feature", "regularization", "C", "auc_mean", "auc_std", 
                               "featureDiscarded_mean", "featureDiscarded_std", "time_mean", "time_std"])
for feat_name in feat_names:
    print("training model on '{}'".format(feat_name))
    uid_index, (val_to_index, cnt_feat) = du.load_user_cnt(feat_name)
    uid_to_index = dict(zip(uid_index, list(range(len(uid_index)))))

    train = df_train.copy()
    train["uid_index"] = train["uid"].map(uid_to_index)
    X_feat = cnt_feat[train["uid_index"].values, :]

    # =========================
    # LR with L2 regularization
    # =========================
    for c in [1e-6, 1e-3, 1, 1e3, 1e6]:
        aucs = np.zeros(n_splits)
        times = np.zeros(n_splits)

        for i, (train_index, valid_index) in enumerate(split_indices):
            # split train/valid sets
            X_train, y_train = X_feat[train_index], y[train_index]
            X_valid, y_valid = X_feat[valid_index], y[valid_index]

            # train and predict
            t0 = time.time()
            lr = LogisticRegression(penalty='l2', C=c)
            lr.fit(X_train, y_train)
            proba_valid = lr.predict_proba(X_valid)

            # get info
            times[i] = time.time() - t0
            aucs[i] = metrics.roc_auc_score(y_valid, proba_valid[:, :1])

        auc_mean = aucs.mean()
        auc_std = aucs.std()
        time_mean = times.mean()
        time_std = times.std()
        print("LR with c={:.4g} (L2) AUC: {:.6f} (+/-{:.6f})".format(c, auc_mean, auc_std))
        df_log.loc[df_log.shape[0]] = {"model": "LR",
                                       "feature": feat_name,
                                       "regularization": "L2", 
                                       "C": c,
                                       "auc_mean": auc_mean, 
                                       "auc_std": auc_std,
                                       "featureDiscarded_mean": np.nan,
                                       "featureDiscarded_std": np.nan,
                                       "time_mean": time_mean, 
                                       "time_std": time_std}
        
    # =========================
    # LR with L1 regularization
    # =========================
    for c in [1e-6, 1e-3, 1, 1e3, 1e6]:
        aucs = np.zeros(n_splits)
        zero_ceof_counts = np.zeros(n_splits)
        times = np.zeros(n_splits)

        for i, (train_index, valid_index) in enumerate(split_indices):
            # split train/valid sets
            X_train, y_train = X_feat[train_index], y[train_index]
            X_valid, y_valid = X_feat[valid_index], y[valid_index]

            # train and predict
            t0 = time.time()
            lr = LogisticRegression(penalty='l1', C=c)
            lr.fit(X_train, y_train)
            proba_valid = lr.predict_proba(X_valid)

            # get info
            times[i] = time.time() - t0
            aucs[i] = metrics.roc_auc_score(y_valid, proba_valid[:, :1])
            zero_ceof_counts[i] = (lr.coef_==0).sum()
        
        zero_coef_count_mean = zero_ceof_counts.mean()
        zero_coef_count_std = zero_ceof_counts.std()
        auc_mean = aucs.mean()
        auc_std = aucs.std()
        time_mean = times.mean()
        time_std = times.std()
        msg = "LR with c={:.4g} (L1) AUC: {:.6f} (+/-{:.6f})".format(c, auc_mean, auc_std)
        msg +=", {:.6f} (+/-{:.6f}) features discarded, {} in total.".format(zero_coef_count_mean,
                                                                            zero_coef_count_std,
                                                                            X_feat.shape[1])
        print(msg)
        df_log.loc[df_log.shape[0]] = {"model": "LR",
                                       "feature": feat_name,
                                       "regularization": "L1", 
                                       "C": c,
                                       "auc_mean": auc_mean, 
                                       "auc_std": auc_std,
                                       "featureDiscarded_mean": zero_coef_count_mean,
                                       "featureDiscarded_std": zero_coef_count_std,
                                       "time_mean": time_mean, 
                                       "time_std": time_std}
        
    del uid_index
    del val_to_index 
    del cnt_feat
    del uid_to_index
    del train
    del X_feat
    del X_train
    del X_valid
    gc.collect()

training model on 'age'
LR with c=1e-06 (L2) AUC: 0.479691 (+/-0.000712)
LR with c=0.001 (L2) AUC: 0.436254 (+/-0.001059)
LR with c=1 (L2) AUC: 0.436220 (+/-0.001049)
LR with c=1000 (L2) AUC: 0.436220 (+/-0.001049)
LR with c=1e+06 (L2) AUC: 0.436220 (+/-0.001049)
LR with c=1e-06 (L1) AUC: 0.500000 (+/-0.000000), 6.000000 (+/-0.000000) features discarded, 6 in total.
LR with c=0.001 (L1) AUC: 0.436223 (+/-0.001047), 2.000000 (+/-0.000000) features discarded, 6 in total.
LR with c=1 (L1) AUC: 0.436220 (+/-0.001049), 0.000000 (+/-0.000000) features discarded, 6 in total.
LR with c=1000 (L1) AUC: 0.436220 (+/-0.001049), 0.000000 (+/-0.000000) features discarded, 6 in total.
LR with c=1e+06 (L1) AUC: 0.436220 (+/-0.001049), 0.000000 (+/-0.000000) features discarded, 6 in total.
training model on 'gender'
LR with c=1e-06 (L2) AUC: 0.511313 (+/-0.000158)
LR with c=0.001 (L2) AUC: 0.488547 (+/-0.000106)
LR with c=1 (L2) AUC: 0.488547 (+/-0.000106)
LR with c=1000 (L2) AUC: 0.488547 (+/-0.000106

LR with c=1e-06 (L1) AUC: 0.500000 (+/-0.000000), 11.000000 (+/-0.000000) features discarded, 11 in total.
LR with c=0.001 (L1) AUC: 0.497798 (+/-0.000194), 10.000000 (+/-0.000000) features discarded, 11 in total.
LR with c=1 (L1) AUC: 0.497796 (+/-0.000194), 0.000000 (+/-0.000000) features discarded, 11 in total.
LR with c=1000 (L1) AUC: 0.497796 (+/-0.000194), 0.000000 (+/-0.000000) features discarded, 11 in total.
LR with c=1e+06 (L1) AUC: 0.497796 (+/-0.000194), 0.000000 (+/-0.000000) features discarded, 11 in total.
training model on 'interest4'
LR with c=1e-06 (L2) AUC: 0.498476 (+/-0.000203)
LR with c=0.001 (L2) AUC: 0.498564 (+/-0.000296)
LR with c=1 (L2) AUC: 0.498463 (+/-0.000191)
LR with c=1000 (L2) AUC: 0.498466 (+/-0.000187)
LR with c=1e+06 (L2) AUC: 0.498466 (+/-0.000187)
LR with c=1e-06 (L1) AUC: 0.500000 (+/-0.000000), 11.000000 (+/-0.000000) features discarded, 11 in total.
LR with c=0.001 (L1) AUC: 0.498480 (+/-0.000200), 10.000000 (+/-0.000000) features discarded, 11

  np.exp(prob, prob)


LR with c=1000 (L1) AUC: 0.499691 (+/-0.000107), 35560.333333 (+/-1078.163356) features discarded, 64856 in total.
LR with c=1e+06 (L1) AUC: 0.499707 (+/-0.000160), 28471.666667 (+/-267.783910) features discarded, 64856 in total.
training model on 'appIdAction'
LR with c=1e-06 (L2) AUC: 0.498455 (+/-0.000135)
LR with c=0.001 (L2) AUC: 0.498475 (+/-0.000145)
LR with c=1 (L2) AUC: 0.498766 (+/-0.000080)
LR with c=1000 (L2) AUC: 0.499088 (+/-0.000045)
LR with c=1e+06 (L2) AUC: 0.499038 (+/-0.000070)
LR with c=1e-06 (L1) AUC: 0.500000 (+/-0.000000), 6215.000000 (+/-0.000000) features discarded, 6215 in total.
LR with c=0.001 (L1) AUC: 0.498456 (+/-0.000133), 6214.000000 (+/-0.000000) features discarded, 6215 in total.
LR with c=1 (L1) AUC: 0.498664 (+/-0.000120), 4481.333333 (+/-21.296844) features discarded, 6215 in total.
LR with c=1000 (L1) AUC: 0.499102 (+/-0.000064), 360.666667 (+/-27.740864) features discarded, 6215 in total.
LR with c=1e+06 (L1) AUC: 0.499101 (+/-0.000065), 300.3333

In [8]:
df_log['solver'] = 'liblinear'
df_log['regularization'] = df_log['regularization'].map({"L1": "l1", "L2":"l2"})  # to consist with sklearn parameters

In [10]:
log_folder = '../../../log/lr/starter/'
os.makedirs(log_folder, exist_ok=True)

In [11]:
log_file = '0427.csv'
log_path = os.path.join(log_folder, log_file)
df_log.to_csv(log_path, index=False)