RGF (via the `rgf_python` package) is another unsung [hero](https://github.com/fukatani/rgf_python) that you can use in your "stack-tastic" models.

In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from rgf.sklearn import RGFClassifier
from sklearn.metrics import roc_auc_score

In [2]:
def ProjectOnMean(data1, data2, columnName):
    """Encode ``data2[columnName]`` with per-category target means from ``data1``.

    The mean of ``data1['target']`` is computed for every distinct value of
    ``data1[columnName]``; each row of ``data2`` then receives the mean that
    belongs to its own ``columnName`` value.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Frame containing ``columnName`` and ``'target'``; source of the means.
    data2 : pandas.DataFrame
        Frame containing ``columnName``; the rows to encode.
    columnName : str
        Name of the column to group by and look up.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data2``, in ``data2``'s row order. Rows whose
        category never occurs in ``data1`` get NaN.
    """
    grpOutcomes = data1.groupby(columnName)['target'].mean()
    # A direct lookup replaces the former pd.merge(on=..., left_index=True):
    # that argument combination is rejected by current pandas (MergeError),
    # and a lookup keeps data2's row order without relying on its index.
    return data2[columnName].map(grpOutcomes).values

def GiniScore(y_actual, y_pred):
    """Return the Gini score of the predictions, computed as ``2 * ROC AUC - 1``.

    ``y_actual`` and ``y_pred`` are forwarded unchanged to ``roc_auc_score``.
    """
    auc = roc_auc_score(y_actual, y_pred)
    return 2 * auc - 1

In [3]:
# Read the train and test CSV files.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Remove every column whose name starts with 'ps_calc_' from both frames.
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(unwanted, axis=1)
test = test.drop(unwanted, axis=1)

# Add an all-NaN 'target' column at position 1 of the test frame
# (presumably so that test mirrors train's column layout -- confirm).
test.insert(1, 'target', np.nan)

In [4]:
# Columns to be mean-encoded: every column (excluding the first and the last
# one) whose name contains 'bin' or 'cat'. Despite the variable name, no
# cardinality check is performed here.
highcardinality = [
    name for name in train.columns[1:-1]
    if 'bin' in name or 'cat' in name
]

highcardinality

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [5]:
# Out-of-fold mean encoding of the columns listed in `highcardinality`.
# For each fold, the held-out ("blind") rows are encoded with target means
# computed on the remaining ("visible") rows only.
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
blindparts = []
for i, (train_index, test_index) in enumerate(kf.split(range(train.shape[0]), train.target)):
    print('Fold:', i)
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be correct while train carries a default RangeIndex.
    blindtrain = train.iloc[test_index].copy()
    vistrain = train.iloc[train_index]

    for c in highcardinality:
        blindtrain['loo' + c] = ProjectOnMean(vistrain, blindtrain, c)
    blindparts.append(blindtrain)

# Concatenate once instead of re-copying the accumulated frame on every fold.
blindloodata = pd.concat(blindparts).drop(highcardinality, axis=1)

# The test rows are encoded with means from the full, still unencoded train
# frame; this must run before `train` is rebound below.
for c in highcardinality:
    test['loo' + c] = ProjectOnMean(train, test, c)
test = test.drop(highcardinality, axis=1)

# Fill the remaining NaNs in both frames with the column means of train.
train = blindloodata
train = train.fillna(train.mean())
test = test.fillna(train.mean())

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


In [6]:
# Fit the RGF classifier on every column after the first two
# (presumably the id and target columns -- confirm against train.columns).
feature_cols = train.columns[2:]
rgf_params = dict(
    max_leaf=1000,  # Try increasing this as a starter
    algorithm="RGF_Sib",
    test_interval=250,
    loss="Log",
    verbose=True,
)
rgf = RGFClassifier(**rgf_params)
rgf.fit(train[feature_cols], train.target)

# NOTE: the score below is computed on the same rows the model was fitted on,
# so it is an in-sample figure rather than a validation score.
x = rgf.predict_proba(train[feature_cols])
print(GiniScore(train.target, x[:, 1]))

"train": 
   algorithm=RGF_Sib
   train_x_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.train.data.x
   train_y_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.train.data.y
   train_w_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.train.data.weight
   Log:ON
   model_fn_prefix=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.model
--------------------
Wed Nov  8 10:25:28 2017: Reading training data ... 
Wed Nov  8 10:25:36 2017: Start ... #train=595212
--------------------
Changing test interval: 250->300
Forest-level: 
   loss=Log
   max_leaf_forest=1000
   max_tree=500
   opt_interval=100
   test_interval=300
   num_tree_search=1
   Verbose:ON
   memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 37x595212, nonzero_ratio=0.9557; managed as dense data.
-------------
Optimization: 
   loss=Log
   num_iteration_opt=5
   reg_L2=0.1
   opt_stepsize=0.5
   max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sum of data poin

In [7]:
# Predict on the test frame and write the submission file.
sub = pd.read_csv('../input/sample_submission.csv')
x = rgf.predict_proba(test[test.columns[2:]])
# Item assignment always writes a DataFrame column; attribute assignment
# (sub.target = ...) would only set a plain Python attribute if the sample
# file lacked a 'target' column, leaving the CSV without predictions.
# Values are assigned by position -- assumes the sample submission rows are
# in the same order as test.csv (TODO confirm).
sub['target'] = x[:, 1]
sub.to_csv('rgfsubmission.csv', index=False)

"predict": 
   model_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.model-04
   test_x_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.test.data.x
   prediction_fn=/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.predictions.txt
   Log:ON
--------------------
Wed Nov  8 10:36:50 2017: Reading test data ... 
Wed Nov  8 10:37:00 2017: Predicting ... 
elapsed: 8.86028
/tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.predictions.txt: /tmp/rgf/caa2cd72-8dab-4da6-8f40-a2ea860545f21.model-04,#leaf=1000,#tree=255
Wed Nov  8 10:37:15 2017: Done ... 

None
