In [16]:

import numpy as np
import pandas as pd
from datetime import datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

from xgboost import XGBRegressor, XGBRFRegressor, XGBClassifier

In [17]:
# Read the training and test tables, using each file's 'id' column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [18]:
# Impute missing values with per-column means taken from the training frame.
train_data = train_data.fillna(value=train_data.mean())

# The test frame is filled with training-set means as well (computed on the
# already-imputed training frame), so no statistic is derived from test data.
test_data = test_data.fillna(value=train_data.mean())


In [19]:
# Split the training frame into the feature matrix and the 'claim' target.
# Both results are independent of `train_data` (drop returns a new frame,
# the target is copied explicitly).
X_train = train_data.drop(columns=['claim'])
Y_train = train_data['claim'].copy()

In [20]:
# Standardise every feature column. The scaler is fitted on the training
# features only and then applied, unchanged, to the test features.
scaler = StandardScaler()
scaler.fit(X_train[X_train.columns])

# Assign through the column list so the frames keep their index and column labels.
X_train[X_train.columns] = scaler.transform(X_train[X_train.columns])
test_data[test_data.columns] = scaler.transform(test_data[test_data.columns])


In [21]:
# 5-fold cross-validation of a gradient-boosted tree classifier, scored by ROC AUC.
# Folds are contiguous slices of the training frame (shuffle=False).
kfold = KFold(n_splits=5, shuffle=False)
model = XGBClassifier(n_estimators=4000,
                      reg_alpha=26,
                      reg_lambda=3,
                      subsample=0.6,
                      colsample_bytree=0.6,
                      max_depth=9,
                      min_child_weight=5,
                      gamma=13.05,
                      learning_rate=0.01,
                      n_jobs=8,
                      booster='gbtree',
                      verbosity=1)

# One ROC AUC value per fold, so the overall estimate can be reported at the end.
fold_auc_scores = []
for train_index, test_index in kfold.split(X_train, Y_train):
    X_fold_train, Y_fold_train = X_train.iloc[train_index], Y_train.iloc[train_index]
    X_fold_val, Y_fold_val = X_train.iloc[test_index], Y_train.iloc[test_index]

    # Each fit() call retrains the estimator, so after the loop `model` holds
    # the model trained on the last fold's training split only.
    # Early stopping monitors the last entry of eval_set, i.e. the held-out fold.
    model.fit(X_fold_train, Y_fold_train,
              early_stopping_rounds=5,
              eval_metric='auc',
              eval_set=[(X_fold_train, Y_fold_train), (X_fold_val, Y_fold_val)],
              verbose=True)

    # AUC is a ranking metric: score the positive-class probability, not the
    # hard labels returned by predict().
    # Assumes `claim` is a binary target, so column 1 is the positive class.
    val_predicts = model.predict_proba(X_fold_val)[:, 1]

    # roc_auc_score gives the scalar area under the curve; roc_curve would
    # return the (fpr, tpr, thresholds) arrays instead of a single score.
    roc_value = metrics.roc_auc_score(y_true=Y_fold_val, y_score=val_predicts)
    fold_auc_scores.append(roc_value)
    print("Roc Value: ", roc_value)

print("Mean Roc Value: ", np.mean(fold_auc_scores))





[0]	validation_0-auc:0.55069	validation_1-auc:0.54038
[1]	validation_0-auc:0.57025	validation_1-auc:0.55891
[2]	validation_0-auc:0.58683	validation_1-auc:0.57437
[3]	validation_0-auc:0.59538	validation_1-auc:0.58140
[4]	validation_0-auc:0.59897	validation_1-auc:0.58431
[5]	validation_0-auc:0.60501	validation_1-auc:0.59022
[6]	validation_0-auc:0.61511	validation_1-auc:0.59993
[7]	validation_0-auc:0.61880	validation_1-auc:0.60350
[8]	validation_0-auc:0.62016	validation_1-auc:0.60461
[9]	validation_0-auc:0.62755	validation_1-auc:0.61178
[10]	validation_0-auc:0.63214	validation_1-auc:0.61715
[11]	validation_0-auc:0.63558	validation_1-auc:0.62064
[12]	validation_0-auc:0.63681	validation_1-auc:0.62142
[13]	validation_0-auc:0.63798	validation_1-auc:0.62277
[14]	validation_0-auc:0.64021	validation_1-auc:0.62511
[15]	validation_0-auc:0.64704	validation_1-auc:0.63180
[16]	validation_0-auc:0.64814	validation_1-auc:0.63306
[17]	validation_0-auc:0.64817	validation_1-auc:0.63288
[18]	validation_0-au

In [22]:
# `model` is the estimator left over from the last cross-validation fold.
# It is monitored and early-stopped on AUC, a ranking metric, so the
# positive-class probability is predicted here; hard 0/1 labels from predict()
# would discard the ranking the model was tuned for.
# NOTE(review): assumes the submission is scored on AUC and that `claim` is a
# binary target (column 1 = positive class) — confirm against the task rules.
predicts = model.predict_proba(test_data)[:, 1]


In [23]:

# Assemble the submission table: predictions in 'claim', followed by the test
# row identifiers in 'id', then write it without the positional index.
output = pd.DataFrame({'claim': predicts}).assign(id=test_data.index)
output.to_csv('submission.csv', index=False)





