In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 24, 8
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.utils import resample

In [2]:
def gini(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    return normalized_gini

In [3]:
def prepare_data():
    non_imp = ['ps_ind_16_bin','ps_car_08_cat','ps_car_11_cat','ps_ind_06_bin','ps_car_02_cat','ps_ind_07_bin','ps_ind_09_bin','ps_ind_08_bin','ps_ind_14','ps_ind_18_bin','ps_car_10_cat','ps_ind_11_bin','ps_ind_10_bin']
    
    train = (pd.read_csv('../data/train.csv', na_values=-1)
              .fillna(value=-1))
    unwanted = list((set(train.columns[train.columns.str.startswith('ps_calc_')])|set(non_imp)))
    train.drop(unwanted, axis=1, inplace=True)
            
    
#     minority_class = train.target.sum()
#     majority_class = train.target.shape[0] - minority_class

#     df_majority = train[train['target']==0]
#     df_minority = train[train['target']==1]

#     df_majority_downsampled = resample(df_majority, 
#                                      replace=False,     # sample with replacement
#                                      n_samples=minority_class,    # to match majority class
#                                      random_state=np.random.randint(1999)) # reproducible results

#     train = pd.concat([df_majority_downsampled, df_minority])

    
    test  = (pd.read_csv('../data/test.csv', na_values=-1)
              .fillna(value=-1)
              .drop(unwanted, axis=1)) 
    
    y = train.target.values        
    train = train.drop(['id', 'target'], axis=1)
    
    test_id = test.id.values
    test = test.drop('id', axis=1)
    
    fl = train.columns.tolist()
    
    return train.values, y, test.values, test_id, fl

In [4]:
X, y, X_test, test_id, _ = prepare_data()

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, shuffle=True,  random_state=np.random.randint(1000), stratify=y)

In [None]:
gbc = GradientBoostingClassifier(max_depth=2, warm_start=True, subsample=0.25)
val_error = np.zeros((500))
for i, n_estimators in enumerate(range(1, 500)):
    gbc.n_estimators = n_estimators
    gbc.fit(X_train, y_train)
    y_pred = gbc.predict(X_val)
    val_error[i] = gini_normalized(y_val, y_pred)
max(val_error), np.argmax(val_error)
plt.plot(range(1,500), val_error[:-1])
plt.grid()
plt.show()

In [None]:
clf = GradientBoostingClassifier()

# A list of dictionaries to specify the parameters we want to tune.
param_dist = {"n_estimators":[200, 250,300],
              "max_depth": [1,2,3,9],
              "max_features": np.random.uniform(0.1,0.5,5),
              "min_samples_split": np.random.uniform(0.125,0.5,5),
              "min_samples_leaf": np.random.uniform(0.125,0.5,5),
              "subsample":  np.random.uniform(0.8,.9,3)}


# Initialize GridSearchSV object to train and tune clf
n_iter_search = 10
gini_scorer = make_scorer(normalized_gini, greater_is_better = True)
random_search = RandomizedSearchCV(clf, 
                                   param_distributions=param_dist,
                                   scoring = gini_scorer,
                                   n_iter=n_iter_search,
                                   n_jobs=-1,
                                   iid = false,
                                   cv= StratifiedKFold(n_splits=3, shuffle=True),
                                   verbose=2)
random_search.fit(X, y)

In [None]:
distribution="bernoulli",
                                      ntrees=100,
                                      max_depth=27,
                                      min_rows=2048,
                                      learn_rate=0.2,
                                      sample_rate=0.37,
                                      col_sample_rate=0.63,
                                      col_sample_rate_per_tree=0.85,
                                      col_sample_rate_change_per_level=1.0,
                                      nbins=128,
                                      nbins_cats=4096,
                                      min_split_improvement=0.0,
                                      histogram_type='RoundRobin',
                                      nfolds=nfolds,
                                      fold_assignment="Modulo",
                                      keep_cross_validation_predictions=True,
                                      seed=1

In [None]:
random_search.best_params_

In [None]:
class_weight = {1 : (len(y_train) - sum(y_train))/y_train.shape[0], 0: sum(y_train)/y_train.shape[0]}

In [6]:
gini_scorer = make_scorer(normalized_gini, greater_is_better = True)

In [7]:
RF = GradientBoostingClassifier(n_estimators = 81,
                                max_features = 0.5,
                                max_depth = 27,
                                min_samples_leaf = 50,
                                min_samples_split = 500,
                                subsample = 0.8,
                                learning_rate = 0.55)

In [8]:
scores = cross_val_score(estimator=RF, X=X, y=y, cv=5, n_jobs=-1, scoring=gini_scorer)
print('CV accuracy scores: %s' % scores); print()
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [ 0.01637837  0.01610228  0.00327906  0.00118265 -0.00038704]

CV accuracy: 0.007 +/- 0.007


In [9]:
scores

array([ 0.01637837,  0.01610228,  0.00327906,  0.00118265, -0.00038704])