In [3]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

In [4]:
# Load the training set, parsing -1 as NaN, then replace every NaN with the
# median of its own column.
train = pd.read_csv('data/train.csv', na_values=-1).pipe(
    lambda frame: frame.fillna(frame.median())
)

In [5]:
# Remove every column whose name starts with 'ps_calc', then a fixed list of
# further columns that are excluded from the model.
train = (
    train
    .drop([name for name in train.columns if name.startswith('ps_calc')], axis=1)
    .drop(
        [
            'ps_ind_10_bin',
            'ps_ind_11_bin',
            'ps_ind_12_bin',
            'ps_ind_13_bin',
            'ps_ind_18_bin',
            'ps_car_03_cat',
            'ps_car_10_cat',
        ],
        axis=1,
    )
)

In [15]:
# Load the test set (-1 parsed as NaN) and apply the same steps used for the
# training frame: median imputation followed by the two column drops.
test = pd.read_csv('data/test.csv', na_values=-1)
test = (
    test
    .fillna(test.median())
    .drop([name for name in test.columns if name.startswith('ps_calc')], axis=1)
    .drop(
        [
            'ps_ind_10_bin',
            'ps_ind_11_bin',
            'ps_ind_12_bin',
            'ps_ind_13_bin',
            'ps_ind_18_bin',
            'ps_car_03_cat',
            'ps_car_10_cat',
        ],
        axis=1,
    )
)
# Features only: 'id' stays in `test` and is removed just for the model input.
X_test = test.drop(['id'], axis=1)
xg_test = xgb.DMatrix(X_test.values)

In [7]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the unnormalized Gini coefficient of ``pred`` against ``actual``.

    Rows are ordered by ``pred`` in descending order (ties keep their original
    relative order) and the cumulative share of ``actual`` captured along that
    ordering is compared with the share expected from a random ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels).
    pred : sequence of numbers
        Scores used to rank the rows; must have the same length as ``actual``.
    cmpcol, sortcol : int
        Accepted for backward compatibility; not used by the computation.

    Returns
    -------
    float
        The Gini value. It is undefined (division by zero) when ``actual`` is
        empty or sums to zero.
    """
    assert( len(actual) == len(pred) )
    n_rows = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin ``float`` is used because the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_rows) ], dtype=float)
    # lexsort treats the LAST key as primary: prediction descending, then
    # original position ascending.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses

    giniSum -= (n_rows + 1) / 2.
    return giniSum / n_rows
 
def gini_normalized(a, p):
    """Gini of predictions ``p`` divided by the Gini of ranking ``a`` by itself."""
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    """Evaluation callback for ``xgb.train``: normalized Gini of ``preds``.

    ``dtrain`` must expose ``get_label()``; the result is returned as a
    one-element list holding a ``('gini', value)`` pair.
    """
    return [('gini', gini_normalized(dtrain.get_label(), preds))]

In [8]:
# Split the training frame into the label column and the feature matrix
# (everything except the label and the row identifier).
y = train['target']
X = train.drop(['target', 'id'], axis=1)

In [13]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified split without shuffling. `random_state` is intentionally
# omitted: it has no effect when shuffle=False, and recent scikit-learn
# releases raise a ValueError if it is supplied in that case.
skf = StratifiedKFold(n_splits=5, shuffle=False)
# Positional fold indexing below needs plain arrays. np.asarray (rather than
# `.values`) keeps this cell safe to re-run after X / y are already arrays.
X = np.asarray(X)
y = np.asarray(y)

In [14]:
# Booster settings handed to xgb.train.
param = dict(
    objective='binary:logistic',
    eta=0.01,
    max_depth=5,
    silent=True,
    nthread=4,
    tree_method='gpu_hist',
    gamma=0.3,
    min_child_weight=7,
    subsample=0.7,
    colsample_bytree=0.6,
    max_delta_step=5,
)

In [17]:
# joblib is only needed for the optional model dump at the end of the loop.
# `sklearn.externals.joblib` was removed in scikit-learn 0.23, so prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
model_num = 1
score = []
# Float accumulator for the summed fold predictions (one row per test id).
pred = pd.DataFrame({'id': test.id, 'target': np.zeros(len(test), dtype=float)})
for train_index, val_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", val_index)
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    # Wrap this fold's data for XGBoost.
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_val = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(xg_train, 'train'), (xg_val, 'test')]
    # Up to 10000 rounds; stop once the validation Gini has not improved for
    # 100 rounds (maximize=True because a higher Gini is better).
    xgb_skf = xgb.train(param, xg_train, 10000, watchlist, early_stopping_rounds=100,
                        feval=gini_xgb, maximize=True, verbose_eval=100)
    print(xgb_skf.best_score)
    score.append(xgb_skf.best_score)
    # NOTE(review): depending on the installed XGBoost version, predict() may
    # use every trained round rather than only those up to the best iteration
    # reported above - confirm against the XGBoost prediction docs.
    pred_lap = xgb_skf.predict(xg_test)
    pred['target'] += pred_lap
    # Optional: persist each fold's model.
    # joblib.dump(xgb_skf, 'model/xgb' + str(model_num) + '.pkl')
    # model_num += 1
# Average over the folds actually configured on the splitter instead of a
# hard-coded 5, so changing n_splits cannot silently mis-scale the result.
pred['target'] = pred['target'] / skf.get_n_splits()

TRAIN: [117955 117960 118013 ..., 595209 595210 595211] TEST: [     0      1      2 ..., 119082 119083 119084]
[0]	train-error:0.036447	test-error:0.036449	train-gini:0.193286	test-gini:0.1803
Multiple eval metrics have been passed: 'test-gini' will be used for early stopping.

Will train until test-gini hasn't improved in 100 rounds.
[100]	train-error:0.036447	test-error:0.036449	train-gini:0.266438	test-gini:0.253769
[200]	train-error:0.036447	test-error:0.036449	train-gini:0.272358	test-gini:0.257264
[300]	train-error:0.036447	test-error:0.036449	train-gini:0.280633	test-gini:0.261605
[400]	train-error:0.036447	test-error:0.036449	train-gini:0.292691	test-gini:0.267616
[500]	train-error:0.036447	test-error:0.036449	train-gini:0.304787	test-gini:0.272761
[600]	train-error:0.036447	test-error:0.036449	train-gini:0.315008	test-gini:0.276405
[700]	train-error:0.036447	test-error:0.036449	train-gini:0.324612	test-gini:0.279155
[800]	train-error:0.036447	test-error:0.036449	train-gini:0.3

[1100]	train-error:0.036449	test-error:0.036441	train-gini:0.353562	test-gini:0.274742
[1200]	train-error:0.036447	test-error:0.036441	train-gini:0.359261	test-gini:0.275255
[1300]	train-error:0.036447	test-error:0.036441	train-gini:0.364932	test-gini:0.275452
Stopping. Best iteration:
[1289]	train-error:0.036447	test-error:0.036441	train-gini:0.364168	test-gini:0.275508

0.275508


In [19]:
# Write the averaged predictions as the submission file. The output directory
# is created first so a missing 'sub/' folder cannot fail the write after the
# long training run.
os.makedirs('sub', exist_ok=True)
pred.to_csv('sub/sub15.csv', index=False)

In [18]:
# Mean of the per-fold best validation scores.
np.asarray(score).mean()

0.28459440000000003

In [24]:
# One-hot encode every column whose name ends in 'cat'.
train = pd.get_dummies(
    train,
    columns=[name for name in train.columns if name.endswith('cat')],
)

In [25]:
# Display the (rows, columns) shape of `train`.
train.shape

(595212, 190)

In [1]:
import lightgbm as lgb

In [12]:
# custom evaluation metrics based on the normalized Gini coefficient (a ranking measure closely related to AUC)

def gini(y, pred):
    """Return the unnormalized Gini coefficient of ``pred`` against ``y``.

    Rows are ordered by ``pred`` in descending order (ties keep their original
    relative order); the cumulative share of ``y`` captured along that ordering
    is compared with the share expected from a random ordering.

    Parameters
    ----------
    y : sequence of numbers
        Observed values (e.g. 0/1 labels).
    pred : sequence of numbers
        Scores used to rank the rows; same length as ``y``.

    Returns
    -------
    float
        The Gini value. It is undefined (division by zero) when ``y`` is empty
        or sums to zero.
    """
    n_rows = len(y)
    # Columns: 0 = label, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin ``float`` is used because the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    g = np.asarray(np.c_[y, pred, np.arange(n_rows)], dtype=float)
    # lexsort treats the LAST key as primary: prediction descending, then
    # original position ascending.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n_rows + 1) / 2.
    return gs / n_rows

def gini_xgb(pred, y):
    """Evaluation callback for XGBoost: ``('gini', normalized Gini of pred)``.

    ``y`` is the data container passed in by the trainer; its labels are read
    through ``get_label()``.
    """
    labels = y.get_label()
    normalized = gini(labels, pred) / gini(labels, labels)
    return 'gini', normalized

def gini_lgb(preds, dtrain):
    """Evaluation callback for LightGBM based on the normalized Gini.

    Returns ``('gini', value, True)``; the trailing ``True`` follows LightGBM's
    custom-metric convention of flagging that higher values are better.
    """
    labels = list(dtrain.get_label())
    normalized = gini(labels, preds) / gini(labels, labels)
    return 'gini', normalized, True

In [9]:
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [11]:
# Wrap the train / validation splits as LightGBM datasets; the validation set
# references the training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

In [16]:
# LightGBM settings handed to lgb.train.
params = dict(
    device='gpu',
    num_threads=4,
    metric='auc',
    learning_rate=0.01,
    max_depth=5,
    objective='binary',
    feature_fraction=0.6,
    bagging_fraction=0.7,
    bagging_freq=5,
)

In [None]:
# Train for up to 10000 rounds, evaluating on the held-out set with the custom
# Gini metric, logging every 100 rounds and stopping early after 100 rounds
# without improvement.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=lgb_eval,
    feval=gini_lgb,
    early_stopping_rounds=100,
    verbose_eval=100,
)