In [None]:
# LightGBM install: use conda: https://anaconda.org/conda-forge/lightgbm
# StratifiedKFold: This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.
# KFold: Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default).

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np 
import pandas as pd
import os
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
sns.set(font_scale=1)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
os.chdir('/Users/hanbosun/Documents/GitHub/TrasactionPrediction/')

In [4]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('input/train.csv')
df_test = pd.read_csv('input/test.csv')

In [6]:
df_train = df_train.iloc[:2000,:]

In [7]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [8]:
# https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment#500381
# thanks to @vishalsrinirao
def shuffle_col_vals(x1):
    rand_x = np.array([np.random.choice(x1.shape[0], size=x1.shape[0], replace=False) for i in range(x1.shape[1])]).T
    grid = np.indices(x1.shape)
    rand_y = grid[1]
    return x1[(rand_x, rand_y)]

def augment_fast1(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        x1 = shuffle_col_vals(x1)
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        x1 = shuffle_col_vals(x1)
        xn.append(x1)

    xs = np.vstack(xs); xn = np.vstack(xn)
    ys = np.ones(xs.shape[0]);yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn]); y = np.concatenate([y,ys,yn])
    return x,y

In [9]:
# https://stackoverflow.com/questions/50554272/randomly-shuffle-items-in-each-row-of-numpy-array
def disarrange(a, axis=-1):
    """
    Shuffle `a` in-place along the given axis.

    Apply numpy.random.shuffle to the given axis of `a`.
    Each one-dimensional slice is shuffled independently.
    """
    b = a.swapaxes(axis, -1)
    # Shuffle `b` in-place along the last axis.  `b` is a view of `a`,
    # so `a` is shuffled in place, too.
    shp = b.shape[:-1]
    for ndx in np.ndindex(shp):
        np.random.shuffle(b[ndx])
    return

def augment_fast2(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        disarrange(x1,axis=0)
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        disarrange(x1,axis=0)
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [10]:
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}


In [11]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
oof = df_train[['ID_code', 'target']]
oof['predict'] = 0
predictions = df_test[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
X_test = df_test[features].values

In [19]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    N = 5
    p_valid,yp = 0,0
    for i in range(N):
        X_t, y_t = augment(X_train.values, y_train.values)
#        X_t, y_t = augment_fast1(X_train.values, y_train.values)
        X_t, y_t = augment_fast2(X_train.values, y_train.values)
    
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = lgb_clf.feature_importance()
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp/N

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.999904	valid_1's auc: 0.707668
[2000]	training's auc: 1	valid_1's auc: 0.697277
[3000]	training's auc: 1	valid_1's auc: 0.694272
Early stopping, best iteration is:
[263]	training's auc: 0.995855	valid_1's auc: 0.787105
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.999909	valid_1's auc: 0.734147
[2000]	training's auc: 1	valid_1's auc: 0.731017
[3000]	training's auc: 1	valid_1's auc: 0.725884
Early stopping, best iteration is:
[284]	training's auc: 0.997234	valid_1's auc: 0.762817
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.999824	valid_1's auc: 0.740908
[2000]	training's auc: 1	valid_1's auc: 0.736213
[3000]	training's auc: 1	valid_1's auc: 0.721315
Early stopping, best iteration is:
[215]	training's auc: 0.99499	valid_1's auc: 0.782222
Training until validation scores don't improve for 3000 rounds.
[1000]	traini

ValueError: [  24   31   32   35   49   55   57   68   69   75   77   79   80   81
   83   86   88  106  115  118  124  128  131  135  144  145  156  162
  184  190  192  198  199  201  210  211  216  225  232  240  241  243
  251  265  269  271  272  276  279  282  284  288  293  295  305  306
  310  329  339  342  344  347  361  365  368  370  379  380  387  392
  394  402  403  404  406  417  418  419  424  427  434  435  436  450
  454  464  467  469  470  471  473  479  482  485  487  489  490  493
  501  502  515  518  529  531  538  544  547  548  550  551  553  556
  568  580  585  590  594  602  603  608  609  611  613  614  620  624
  625  629  633  634  635  638  640  651  656  661  662  667  670  672
  675  677  678  684  687  690  698  699  702  707  709  721  723  729
  736  755  764  766  767  776  778  783  793  794  808  819  826  833
  842  849  852  853  861  863  866  869  874  875  882  887  892  893
  910  911  922  925  930  933  935  939  954  957  961  963  964  968
  980  983  985  986  990  995 1004 1017 1035 1042 1047 1048 1054 1058
 1059 1065 1071 1075 1081 1083 1118 1120 1122 1124 1136 1142 1143 1147
 1149 1150 1152 1153 1157 1163 1172 1175 1196 1197 1202 1215 1216 1219
 1228 1236 1245 1249 1250 1252 1254 1255 1257 1259 1262 1268 1276 1284
 1285 1286 1289 1292 1297 1298 1301 1311 1312 1313 1320 1321 1326 1330
 1333 1335 1339 1341 1344 1359 1363 1364 1374 1381 1382 1391 1392 1397
 1399 1401 1412 1414 1415 1417 1421 1426 1435 1438 1442 1443 1444 1446
 1450 1458 1460 1464 1468 1487 1489 1491 1505 1506 1516 1517 1522 1524
 1526 1529 1532 1535 1540 1557 1560 1562 1571 1589 1592 1607 1609 1619
 1621 1624 1628 1630 1636 1645 1653 1663 1665 1666 1669 1681 1688 1689
 1690 1691 1695 1696 1700 1727 1732 1742 1755 1758 1764 1768 1774 1775
 1779 1789 1795 1804 1806 1820 1823 1824 1825 1827 1831 1837 1847 1849
 1852 1857 1870 1871 1872 1878 1880 1891 1897 1907 1913 1917 1920 1921
 1928 1930 1936 1937 1938 1941 1946 1958 1961 1972 1974 1975 1983 1984
 1986 1991 1993] not contained in the index

In [None]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,26))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('LightGBM Features (averaged over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

# submission
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('lgb_all_predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']
sub_df.to_csv("lgb_submission.csv", index=False)
oof.to_csv('lgb_oof.csv', index=False)
