In [None]:
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
from itertools import combinations
from scipy.stats import uniform
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 24, 9

In [None]:
import xgboost as xgb

In [None]:
def gini(solution, submission):
    """Return the un-normalized Gini score of ``submission`` against ``solution``.

    Rows are ranked by ``submission`` in descending order; rows with equal
    scores keep their original relative order (lower index first). The result
    is the sum, over ranked positions ``i``, of the cumulative share of
    ``solution`` reached at ``i`` minus the diagonal value ``(i + 1) / n``.

    Parameters
    ----------
    solution : sequence of numbers
        Ground-truth values (must support ``len``).
    submission : sequence of numbers
        Predicted scores, aligned with ``solution``.

    Raises
    ------
    IndexError
        If the inputs are empty.
    ZeroDivisionError
        If the values in ``solution`` sum to zero.
    """
    ranked = sorted(
        zip(solution, submission, range(len(solution))),
        # Negated index + reverse=True => ties on score resolve to lower index first.
        key=lambda row: (row[1], -row[2]),
        reverse=True,
    )
    n_rows = len(ranked)
    total_pos = float(sum([row[0] for row in ranked]))

    # Cumulative sum of the true values, in ranked order.
    running = ranked[0][0]
    cum_found = [running]
    for row in ranked[1:]:
        running = running + row[0]
        cum_found.append(running)

    # Gap between the Lorenz curve and the diagonal at each position, summed.
    return sum(
        float(found) / total_pos - float(pos + 1) / float(n_rows)
        for pos, found in enumerate(cum_found)
    )

def normalized_gini(solution, submission):
    """Return ``gini(solution, submission)`` divided by the score of a perfect ranking.

    The denominator is ``gini(solution, solution)``, i.e. the score obtained
    when the predictions are the ground truth itself.
    """
    achieved = gini(solution, submission)
    best_possible = gini(solution, solution)
    return achieved / best_possible

In [None]:
def make_dicts(df):
    """Group the feature column names of ``df`` by label, by data type, and by both.

    The first two columns of ``df`` are skipped. Every remaining column name is
    split on ``'_'``: the second token is its label (``ind``/``reg``/``car``/
    ``calc``) and the fourth token is its data type (``cat``/``bin``). A name
    with exactly three tokens has no type suffix and is treated as ``con``.

    Returns
    -------
    dict
        ``{'combo': {...}, 'type': {...}, 'label': {...}}`` where
        ``combo`` maps ``'<label>_<type>'`` to column names, ``type`` maps each
        data type to column names, and ``label`` maps each label to column
        names. Lists preserve the column order of ``df``.

    Raises
    ------
    IndexError
        If a feature column name has fewer than three ``'_'``-separated tokens.
    """
    labels = ['ind', 'reg', 'car', 'calc']
    kinds = ['cat', 'bin', 'con']

    # Parse each feature name once into (name, label, kind).
    parsed = []
    for name in df.columns.tolist()[2:]:
        tokens = name.split('_')
        kind = 'con' if len(tokens) == 3 else tokens[3]
        parsed.append((name, tokens[1], kind))

    by_combo = {
        label + '_' + kind: [n for n, lab, knd in parsed if lab == label and knd == kind]
        for label in labels
        for kind in kinds
    }
    by_type = {kind: [n for n, _, knd in parsed if knd == kind] for kind in kinds}
    by_label = {label: [n for n, lab, _ in parsed if lab == label] for label in labels}

    return {'combo': by_combo, 'type': by_type, 'label': by_label}

In [None]:
def prepare_data(data_dir='../data'):
    """Load the train/test CSVs, drop unused columns and add pairwise interaction features.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains ``train.csv`` and ``test.csv``.
        Defaults to ``'../data'``.

    Returns
    -------
    train_values : numpy.ndarray
        Training features (``id``, ``target`` and the unwanted columns removed,
        interaction columns appended).
    y : numpy.ndarray
        Values of the ``target`` column of ``train.csv``.
    test_values : numpy.ndarray
        Test features (``id`` and the unwanted columns removed, interaction
        columns appended in the same order as for the training set).
    test_id : numpy.ndarray
        Values of the ``id`` column of ``test.csv``.
    fl : list of str
        Column names of the training feature matrix, in order.

    Raises
    ------
    KeyError
        If either file lacks one of the columns that are dropped or combined.
    """
    import os

    # Columns excluded from modelling.
    unwanted = ['ps_calc_19_bin',
                'ps_car_10_cat',
                'ps_ind_11_bin',
                'ps_calc_06',
                'ps_calc_08',
                'ps_calc_13',
                'ps_calc_11',
                'ps_calc_07',
                'ps_calc_18_bin',
                'ps_calc_04',
                'ps_calc_14',
                'ps_calc_01',
                'ps_calc_17_bin',
                'ps_calc_09',
                'ps_calc_02',
                'ps_ind_13_bin',
                'ps_ind_18_bin',
                'ps_calc_16_bin',
                'ps_ind_09_bin',
                'ps_calc_03',
                'ps_ind_10_bin',
                'ps_calc_05',
                'ps_ind_14',
                'ps_calc_15_bin',
                'ps_ind_12_bin',
                'ps_calc_12',
                'ps_calc_10',
                'ps_calc_20_bin']

    def _load(file_name):
        # 999 is read as NaN and every NaN is then written back as 999, so 999
        # is the missing-value sentinel in the returned arrays.
        # NOTE(review): confirm 999 is really how the raw files encode missing values.
        return (pd.read_csv(os.path.join(data_dir, file_name), na_values=999)
                  .fillna(value=999)
                  .drop(unwanted, axis=1))

    train = _load('train.csv')
    test = _load('test.csv')

    y = train.target.values
    train = train.drop(['id', 'target'], axis=1)

    test_id = test.id.values
    test = test.drop('id', axis=1)

    # Interactions: element-wise product of every unordered pair of these features.
    interaction_features = ['ps_car_13', 'ps_reg_03', 'ps_car_14', 'ps_reg_02', 'ps_ind_15', 'ps_ind_03']
    for first_feature, second_feature in combinations(interaction_features, 2):
        new_name = first_feature+'_'+second_feature+'comb'
        train[new_name] = train[first_feature].values*train[second_feature].values
        test[new_name] = test[first_feature].values*test[second_feature].values

    fl = train.columns.tolist()

    return train.values, y, test.values, test_id, fl

In [None]:
def test():
    """Grid-search XGBoost hyper-parameters with 5-fold stratified cross-validation.

    Only ``gamma`` is varied (0.0 to 0.9 in steps of 0.1); every other
    hyper-parameter is pinned to a single value. Candidates are scored with
    ROC AUC and the best one is refitted on the full training set
    (``refit=True``). The best score and the chosen parameters are printed.

    Returns
    -------
    best_model : XGBClassifier
        The refitted best estimator.
    model : GridSearchCV
        The fitted search object.
    """
    # prepare_data() returns five values; the test matrix, test ids and
    # feature names are not needed for the search.
    train, labels, _, _, _ = prepare_data()

    # Create a parameter grid to search for best parameters
    param_grid = {
                    'min_child_weight': [11], # due to high class imbalance
                    'objective': ['binary:logistic'],
                    'max_depth': [8],
                    'nthread': [12],
                    'max_delta_step': [1.8],
                    'colsample_bytree': [0.4],
                    'subsample': [0.88],
                    'learning_rate': [0.025],
                    'gamma': [i/10.0 for i in range(0,10)],
                    'n_estimators' : [260],
                    #'tree_method': ['gpu_hist'],
                    'silent': [1],
                    # Must match the sentinel prepare_data() writes for missing
                    # values (999, as test_rand() also uses), not -999.
                    'missing':[999],
                    'n_jobs': [12],
                    'scale_pos_weight' : [1] # due to high class imbalance
                  }

    # model_selection.StratifiedKFold takes only the split configuration;
    # the labels are supplied later through model.fit().
    # NOTE(review): `iid` was removed in scikit-learn 0.24 - drop it when upgrading.
    model = GridSearchCV(estimator  = xgb.XGBClassifier(),
                         param_grid = param_grid,
                         scoring    = 'roc_auc',
                         verbose    = 2,
                         n_jobs     = 12,
                         iid        = True,
                         refit      = True,
                         cv         = StratifiedKFold(n_splits=5, shuffle=True))
    # Fit Grid Search Model
    model.fit(train, labels)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Get best model (already refitted on the full training data)
    best_model = model.best_estimator_

    return best_model, model

In [None]:
# Display the parameter names a default XGBClassifier reports via get_params()
# (handy when choosing keys for the search grids).
xgb.XGBClassifier().get_params().keys()

In [None]:
def test_rand():
    """Randomized hyper-parameter search for XGBoost with 2-fold stratified CV.

    Draws 10 candidates from the search space below (``min_child_weight``,
    ``max_depth`` and ``scale_pos_weight`` each have three options; everything
    else is fixed), scores them with ROC AUC, refits the best one on all
    training data and prints the best score and its parameters.

    Returns
    -------
    best_model : XGBClassifier
        The refitted best estimator.
    model : RandomizedSearchCV
        The fitted search object.
    """
    features, target, _, _, _ = prepare_data()

    search_space = {
        'min_child_weight': [10, 11, 12],  # due to high class imbalance
        'objective': ['binary:logistic'],
        'max_depth': [7, 8, 9],
        'nthread': [12],
        'max_delta_step': [1.8],
        'colsample_bytree': [0.51880508157068184],
        'subsample': [0.69706874031839083],
        'learning_rate': [0.024],
        'gamma': [0.068326902334014492],
        'n_estimators': [350],
        'reg_alpha': [0.00975],
        'silent': [1],
        'missing': [999],
        'n_jobs': [12],
        'scale_pos_weight': [1, 2, 3],  # due to high class imbalance
    }

    folds = StratifiedKFold(n_splits=2, shuffle=True)
    search = RandomizedSearchCV(
        estimator=xgb.XGBClassifier(),
        param_distributions=search_space,
        scoring='roc_auc',
        n_iter=10,
        verbose=2,
        n_jobs=12,
        iid=True,
        refit=True,
        cv=folds,
    )
    search.fit(features, target)

    # Report the winning configuration, one searched key per line.
    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    winner = search.best_estimator_
    chosen = winner.get_params()
    for key in sorted(search_space):
        print("\t%s: %r" % (key, chosen[key]))

    return winner, search

In [None]:
# Run one randomized search and report its wall-clock duration.
t = time.time()
bstmdl, grd = test_rand()
print()
elapsed_seconds = time.time() - t
print('this took {}-seconds'.format(elapsed_seconds))