In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import re
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# scikit-learn utilities: preprocessing, model selection, calibration, scoring and SVMs
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import make_scorer
from sklearn import svm

from numba import jit
import time
import gc

In [2]:
# Compute gini
@jit
def eval_gini(y_true, y_prob):
    """Normalised Gini coefficient of a ranking.

    y_true : binary labels (0/1), array-like
    y_prob : scores used to rank the samples; only their order matters

    Returns 1 for a perfect ranking (all positives scored above all
    negatives) and -1 for the exact reverse. The denominator is
    ``positives * negatives``, so the result is undefined when ``y_true``
    holds a single class.
    """
    labels = np.asarray(y_true)
    # Order the labels by ascending score
    ranked = labels[np.argsort(y_prob)]
    total = len(ranked)
    positives = 0
    negatives_seen = 0
    discordant = 0
    # Walk from the highest score down: every positive met after some
    # negatives is out of order once per negative already seen.
    idx = total - 1
    while idx >= 0:
        label = ranked[idx]
        positives += label
        discordant += label * negatives_seen
        negatives_seen += 1 - label
        idx -= 1
    return 1 - 2 * discordant / (positives * (total - positives))

def gini_xgb(preds, dtrain):
    """Evaluation callback returning the negated gini of ``preds``.

    preds  : predicted scores
    dtrain : object exposing ``get_label()`` (presumably an xgboost DMatrix,
             judging by the function name -- confirm against the caller)

    Returns a one-element list ``[('gini', -gini)]``; the sign is flipped so
    that a lower value means a better ranking.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return [('gini', -score)]

In [3]:
def add_noise(series, noise_level):
    """Scale each value by ``1 + noise_level * N(0, 1)``.

    One standard-normal draw is taken per element, so ``noise_level=0``
    leaves the values unchanged.
    """
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)

In [4]:
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series (any name; aligned with trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : relative noise forwarded to add_noise for both outputs

    Returns a (train, test) pair of pd.Series named ``<trn_series.name>_mean``.
    Each keeps the index of its input series. Categories that are missing or
    unseen in training receive the global target mean (the prior).
    """
    assert len(trn_series) == len(target)
    # Per-category target mean and sample count. Grouping the target directly
    # avoids depending on the target Series being literally named "target".
    stats = target.groupby(trn_series).agg(["mean", "count"])
    # Sigmoid weight in (0, 1): close to 1 for well-populated categories.
    # Kept in its own name so the `smoothing` parameter is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    encoding = prior * (1 - weight) + stats["mean"] * weight

    out_name = trn_series.name + '_mean'
    # Series.map looks each category up in `encoding` and keeps the caller's
    # index, so no merge / index restoration is needed and the test series
    # may carry a different name than the training series.
    ft_trn_series = trn_series.map(encoding).fillna(prior).rename(out_name)
    ft_tst_series = tst_series.map(encoding).fillna(prior).rename(out_name)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [5]:
# Columns fed to the model. The figures after each name are carried over from
# the original notebook (presumably feature importance vs. shadow-feature
# importance from an earlier selection run -- TODO confirm).
train_features = [
    "ps_car_13",      # 1571.65 / shadow  609.23
    "ps_reg_03",      # 1408.42 / shadow  511.15
    "ps_ind_05_cat",  # 1387.87 / shadow   84.72
    "ps_ind_03",      # 1219.47 / shadow  230.55
    "ps_ind_15",      #  922.18 / shadow  242.00
    "ps_reg_02",      #  920.65 / shadow  267.50
    "ps_car_14",      #  798.48 / shadow  549.58
    "ps_car_12",      #  731.93 / shadow  293.62
    "ps_car_01_cat",  #  698.07 / shadow  178.72
    "ps_car_07_cat",  #  694.53 / shadow   36.35
    "ps_ind_17_bin",  #  620.77 / shadow   23.15
    "ps_car_03_cat",  #  611.73 / shadow   50.67
    "ps_reg_01",      #  598.60 / shadow  178.57
    "ps_car_15",      #  593.35 / shadow  226.43
    "ps_ind_01",      #  547.32 / shadow  154.58
    "ps_ind_16_bin",  #  475.37 / shadow   34.17
    "ps_ind_07_bin",  #  435.28 / shadow   28.92
    "ps_car_06_cat",  #  398.02 / shadow  212.43
    "ps_car_04_cat",  #  376.87 / shadow   76.98
    "ps_ind_06_bin",  #  370.97 / shadow   36.13
    "ps_car_09_cat",  #  214.12 / shadow   81.38
    "ps_car_02_cat",  #  203.03 / shadow   26.67
    "ps_ind_02_cat",  #  189.47 / shadow   65.68
    "ps_car_11",      #  173.28 / shadow   76.45
    "ps_car_05_cat",  #  172.75 / shadow   62.92
    "ps_calc_09",     #  169.13 / shadow  129.72
    "ps_calc_05",     #  148.83 / shadow  120.68
    "ps_ind_08_bin",  #  140.73 / shadow   27.63
    "ps_car_08_cat",  #  120.87 / shadow   28.82
    "ps_ind_09_bin",  #  113.92 / shadow   27.05
    "ps_ind_04_cat",  #  107.27 / shadow   37.43
    "ps_ind_18_bin",  #   77.42 / shadow   25.97
    "ps_ind_12_bin",  #   39.67 / shadow   15.52
    "ps_ind_14",      #   37.37 / shadow   16.65
    # disabled: 'ps_reg_F', 'ps_reg_M'
]

# Column pairs combined into extra interaction features
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]


In [6]:
# Read data; the literal "-1" is parsed as a missing value (NaN) in both files.
# (Append .iloc[0:200, :] to the train read for a quick smoke test.)
train_df = pd.read_csv('../data/train.csv', na_values="-1")
test_df = pd.read_csv('../data/test.csv', na_values="-1")
print('imported data')

# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

print('begining feature engineering')
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('\n\tCurrent feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    # Interaction feature: join the string forms of both columns
    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    # Label Encode on train + test together so every combination seen in
    # either file receives a code
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    # Guarded append: re-running this cell must not register the column twice,
    # which would select duplicate columns below.
    if name1 not in train_features:
        train_features.append(name1)

# Explicit copies: new "_avg" columns are assigned below, and writing into a
# column slice of another frame is chained assignment.
X = train_df[train_features].copy()
test_df = test_df[train_features].copy()

print('\nDone adding features, smoothing categorical featuers now.. ')
# Target-encode every column whose name contains "_cat"
# NOTE(review): the encoding is fitted on all training rows before the later
# train/test split, so held-out rows have seen their own target -- the
# hold-out metrics are likely optimistic.
f_cats = [f for f in X.columns if "_cat" in f]

for f in f_cats:
    X[f + "_avg"], test_df[f + "_avg"] = target_encode(trn_series=X[f],
                                         tst_series=test_df[f],
                                         target=train_df['target'],
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)
print('Done ... I can start learning now.. ')

# Missing values go back to the -1 sentinel for the model
X = X.fillna(-1)
test_df = test_df.fillna(-1)
y_valid_pred = 0*y
y_test_pred = 0

imported data
begining feature engineering

	Current feature                                 ps_reg_01_plus_ps_car_02_cat    1 in   0.0
	Current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1
Done adding features, smoothing categorical featuers now.. 
Done ... I can start learning now.. 


In [7]:
# Plain NumPy arrays of the engineered frames and the raw target
x_train, x_test = X.values, test_df.values
target = y.values

In [8]:
# Initialise the label column on the frame. `y` is the target Series, so
# `y['label'] = 0` would append a stray element keyed 'label' to it instead
# of creating a column.
train_df['label'] = 0

In [9]:
# One-class convention: rows with target == 0 become +1,
# rows with target == 1 become -1.
train_df.loc[train_df['target'].eq(0), "label"] = 1
train_df.loc[train_df['target'].eq(1), "label"] = -1

In [10]:
# From here on `target` is the +1 / -1 label column, not the raw 0/1 target
target = train_df.loc[:, 'label']

In [11]:
# Rows labelled -1
outliers = target.loc[target == -1]

In [12]:
# Number of -1 rows and their share of all rows
print("outliers.shape", outliers.shape)
print("outlier fraction", len(outliers) / len(target))

outliers.shape (21694,)
outlier fraction 0.036447517859182946


In [13]:
# 80/20 hold-out split. A fixed seed makes the split (and every metric below)
# reproducible on re-run; stratifying keeps the share of -1 labels the same
# in both parts.
train_data, test_data, train_target, test_target = train_test_split(
    x_train, target, train_size=0.8, random_state=42, stratify=target)

In [14]:
# nu for the one-class SVM: the observed share of -1 labels, scaled by 0.95
# and shifted up by 0.05
nu = 0.05 + 0.95 * (len(outliers) / len(target))

In [15]:
# RBF one-class SVM; nu comes from the label share computed above
model = svm.OneClassSVM(
    kernel='rbf',
    gamma=5e-5,
    nu=nu,
)

In [None]:
# Fit the one-class SVM on the training split; no labels are passed to fit().
model.fit(train_data)

## Tune


In [None]:
from sklearn import metrics  
preds = model.predict(train_data)  
targs = train_target

print("accuracy: ", metrics.accuracy_score(targs, preds))  
print("precision: ", metrics.precision_score(targs, preds))  
print("recall: ", metrics.recall_score(targs, preds))  
print("f1: ", metrics.f1_score(targs, preds))  
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))  

In [None]:
preds = model.predict(test_data)  
targs = test_target

print("accuracy: ", metrics.accuracy_score(targs, preds))  
print("precision: ", metrics.precision_score(targs, preds))  
print("recall: ", metrics.recall_score(targs, preds))  
print("f1: ", metrics.f1_score(targs, preds))  
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))  

In [18]:
# (leftover `range??` IPython source inspection removed: it is not valid
# Python and has no effect on the analysis)

In [29]:

# Multi-metric grid search over the RBF gamma.
# Setting refit='gini' refits an estimator on the whole dataset with the
# parameter setting that has the best cross-validated gini score.
# That estimator is made available at ``gs.best_estimator_`` along with
# parameters like ``gs.best_score_``, ``gs.best_params_`` and
# ``gs.best_index_``


def gini_scorer(estimator, X, y_true):
    """Gini of the estimator's decision scores against +1 / -1 labels.

    eval_gini sums the labels as counts, so the +1 / -1 labels are first
    mapped to 1 / 0. It is given the continuous ``decision_function`` output
    rather than hard predictions, which carry no ranking information.
    """
    is_inlier = (np.asarray(y_true) == 1).astype(np.float64)
    # ravel() guards against a column-shaped (n, 1) decision array
    scores = np.ravel(estimator.decision_function(X))
    return eval_gini(is_inlier, scores)


scoring = {'AUC': 'roc_auc', 'gini': gini_scorer}

# `nu` already carries the 0.95 * fraction + 0.05 adjustment from the cell
# that defines it, so it is passed through unchanged here.
gs = GridSearchCV(svm.OneClassSVM(nu=nu, kernel='rbf'),
                  param_grid={'gamma': np.linspace(0.0005, 0.1, 24)},
                  scoring=scoring, cv=5, refit='gini', n_jobs=12, verbose=10)
gs.fit(train_data, train_target)
results = gs.cv_results_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] gamma=0.0005 ....................................................
[CV] gamma=0.0005 ....................................................
[CV] gamma=0.0005 ....................................................
[CV] gamma=0.0005 ....................................................
[CV] gamma=0.0005 ....................................................
[CV] gamma=0.00482608695652 ..........................................
[CV] gamma=0.00482608695652 ..........................................
[CV] gamma=0.00482608695652 ..........................................
[CV] gamma=0.00482608695652 ..........................................
[CV] gamma=0.00482608695652 ..........................................
[CV] gamma=0.00915217391304 ..........................................
[CV] gamma=0.00915217391304 ..........................................


KeyboardInterrupt: 