In [2]:
import pandas as pd 
import sys
print(sys.version)

import numpy as np 
import xgboost as xgb
from scipy.optimize import fmin_powell
from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):
    """Score predictions against labels with quadratic weighted kappa.

    The labels ``y`` are cast to an integer array.  The predictions ``yhat``
    are rounded, clipped to the [min(y), max(y)] range of the labels and cast
    to int before both arrays are handed to ``quadratic_weighted_kappa``.
    """
    labels = np.asarray(y).astype(int)
    lowest, highest = labels.min(), labels.max()
    rounded = np.round(np.asarray(yhat))
    # keep every rounded prediction inside the range spanned by the labels
    preds = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(preds, labels)
    
def get_params():
    """Return the XGBoost training parameters as a list of (name, value) pairs."""
    settings = {
        "objective": "count:poisson",
        "eta": 0.05,
        "min_child_weight": 240,
        "subsample": 0.9,
        "colsample_bytree": 0.67,
        "silent": 1,
        "max_depth": 6,
    }
    return list(settings.items())
    
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    """Shift the predictions of one bin by ``bin_offset`` and return the score.

    ``data`` is indexed along its first dimension as: 0 = raw predictions,
    1 = offset predictions, 2 = labels.  Entries whose raw prediction
    truncates (``astype(int)``) to ``sv`` have their row-1 value overwritten
    in place with ``raw + bin_offset``; the result of
    ``scorer(data[1], data[2])`` is returned.
    """
    in_bin = data[0].astype(int) == sv
    data[1, in_bin] = data[0, in_bin] + bin_offset
    return scorer(data[1], data[2])

# global variables
columns_to_drop = ['Id', 'Response', 'Medical_History_10','Medical_History_24']
xgb_num_rounds = 800
num_classes = 8
# per-round learning-rate schedule: 200 rounds at 0.05, 500 at 0.02, 100 at 0.01
eta_list = [0.05] * 200
eta_list = eta_list + [0.02] * 500
eta_list = eta_list + [0.01] * 100
# the schedule is meant to supply exactly one rate per boosting round
assert len(eta_list) == xgb_num_rounds, "eta_list must have one entry per boosting round"


print("Load the data using pandas")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# combine train and test
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent.  The columns are sorted explicitly so the
# feature order matches the alphabetical order seen in the recorded
# describe() output, regardless of the pandas version's concat sort default.
all_data = pd.concat([train, test]).sort_index(axis=1)

# Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
# create any new variables
# first and second character of Product_Info_2 become separate features
all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]

# factorize categorical variables
all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
all_data['Product_Info_2_char'] = pd.factorize(all_data['Product_Info_2_char'])[0]
all_data['Product_Info_2_num'] = pd.factorize(all_data['Product_Info_2_num'])[0]

all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

# row-wise sum over every Medical_Keyword_* column
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

print('Eliminate missing values')
# Replace every remaining missing value with -1.  This also fills the
# Response of the test rows, which the train/test split below relies on.
all_data.fillna(-1, inplace=True)

# fix the dtype on the label column
all_data['Response'] = all_data['Response'].astype(int)

# split train and test
# rows with a positive Response are training rows; rows whose Response was
# filled with -1 above are test rows
train = all_data[all_data['Response']>0].copy()
test = all_data[all_data['Response']<1].copy()

# convert data to xgb data structure
xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), train['Response'].values)
xgtest = xgb.DMatrix(test.drop(columns_to_drop, axis=1), label=test['Response'].values)

# get the parameters for xgboost
plst = get_params()
print(plst)

# train model
model = xgb.train(plst, xgtrain, xgb_num_rounds, learning_rates=eta_list)

# get preds
# NOTE(review): no early stopping is configured, so best_iteration is
# presumably the index of the last round; passing it as ntree_limit may drop
# the final tree -- confirm against the installed xgboost version.
train_preds = model.predict(xgtrain, ntree_limit=model.best_iteration)
print('Train score is:', eval_wrapper(train_preds, train['Response']))
test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)
train_preds = np.clip(train_preds, -0.99, 8.99)
test_preds = np.clip(test_preds, -0.99, 8.99)

# train offsets
# starting offsets, one per integer prediction bin 0..num_classes-1;
# predictions that truncate to 8 are not offset by the loops below
offsets = np.array([0.1, -1, -2, -1, -0.8, 0.02, 0.8, 1])
# rows: 0 = raw predictions, 1 = offset predictions, 2 = labels
data = np.vstack((train_preds, train_preds, train['Response'].values))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]
# tune one bin at a time; apply_offset mutates data in place, so each tuned
# bin keeps its last evaluated offset while the next one is optimised
for j in range(num_classes):
    # negate the score because fmin_powell minimises
    train_offset = lambda x: -apply_offset(data, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j])

# apply offsets to test
data = np.vstack((test_preds, test_preds, test['Response'].values))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]

# clip to the valid label range 1..8 and round to integer classes
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)

preds_out = pd.DataFrame({"Id": test['Id'].values, "Response": final_test_preds})
preds_out = preds_out.set_index('Id')
preds_out.to_csv('Submission2.csv')

3.6.3 (v3.6.3:2c5fed8, Oct  3 2017, 17:26:49) [MSC v.1900 32 bit (Intel)]
Load the data using pandas
Eliminate missing values
[('objective', 'count:poisson'), ('eta', 0.05), ('min_child_weight', 240), ('subsample', 0.9), ('colsample_bytree', 0.67), ('silent', 1), ('max_depth', 6)]
Train score is: 0.6365266181190505
Optimization terminated successfully.
         Current function value: -0.697078
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.697078
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.697078
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.697378
         Iterations: 2
         Function evaluations: 39
Optimization terminated successfully.
         Current function value: -0.697540
         Iterations: 2
         Function evaluatio

In [4]:
# Summary statistics of the training split (after feature engineering and the -1 fill)
train.describe()

Unnamed: 0,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,Family_Hist_1,Family_Hist_2,Family_Hist_3,...,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Response,Wt,Product_Info_2_char,Product_Info_2_num,BMI_Age,Med_Keywords_Count
count,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,...,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0,59381.0
mean,0.469462,0.077237,8.641821,1.300904,-0.108596,2.142958,0.112612,2.68623,-0.237036,-0.365906,...,0.328952,2.006955,2.673599,1.043583,5.636837,0.292587,0.497179,1.941345,0.193702,1.264765
std,0.122213,0.084559,4.227082,0.715034,0.321495,0.350033,0.61379,0.483159,0.745217,0.745627,...,0.282562,0.083107,0.739103,0.291949,2.456833,0.089037,0.821474,1.723506,0.1115,1.480236
min,0.0,-1.0,1.0,1.0,-1.0,2.0,-1.0,1.0,-1.0,-1.0,...,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.385517,0.035,9.0,1.0,0.0,2.0,0.0,2.0,-1.0,-1.0,...,0.076923,2.0,3.0,1.0,4.0,0.225941,0.0,0.0,0.104285,0.0
50%,0.451349,0.06,9.0,1.0,0.0,2.0,0.15,3.0,0.217391,-1.0,...,0.230769,2.0,3.0,1.0,6.0,0.288703,0.0,2.0,0.183607,1.0
75%,0.532858,0.1,9.0,1.0,0.0,2.0,0.5,3.0,0.463768,0.480392,...,0.487179,2.0,3.0,1.0,8.0,0.345188,1.0,3.0,0.267714,2.0
max,1.0,1.0,38.0,3.0,1.0,3.0,1.0,3.0,1.0,1.0,...,1.0,3.0,3.0,3.0,8.0,1.0,4.0,7.0,0.80597,16.0
