In [1]:
import pandas as pd
import numpy  as np
import lightgbm as lgb

from matplotlib import pyplot
%matplotlib inline

## Read Data

In [3]:
# [1] Read data
train = pd.read_csv("truthfull_set.csv")
probe = pd.read_csv('holdout_set2.csv')
train.head()

Unnamed: 0,retorno,var38,time_dif,month_x,month_y,numday_x,numday_y,var1,categoria
0,Nao,36,455,9,6,27,29,1,tipo2
1,Sim,76,686,11,1,23,6,1,tipo2
2,Sim,99,61,8,6,10,10,1,tipo4
3,Sim,84,70,10,7,5,27,1,tipo4
4,Sim,91,1,1,1,25,24,1,tipo5


## Data Preprocessing

In [4]:
# [2] One hot encode 'categoria'
train = pd.concat([train,pd.get_dummies(train['categoria'])],axis=1)
train.drop('categoria',axis=1,inplace=True)

probe = pd.concat([probe,pd.get_dummies(probe['categoria'])],axis=1)
probe.drop('categoria',axis=1,inplace=True)


In [5]:
# [3] Encode 'retorno' as binary, isolate from the main set
train_y = np.where(train['retorno'].values == 'Sim',1,0)
probe_y = np.where(probe['retorno'].values == 'Sim',1,0)


In [6]:
# [4] Drop unnecessary columns
train.drop('retorno',axis=1,inplace=True)
probe.drop('retorno',axis=1,inplace=True)
probe.drop('ID',axis=1,inplace=True)

## LightGBM

In [10]:
# [5] Create lgb datasets
dtrain = lgb.Dataset(train, train_y.tolist())
dvalid = lgb.Dataset(probe, probe_y.tolist(), reference=dtrain)

In [11]:
# [6] Set lightgbm parameters
params = {
    'num_leaves': 63,
    'boosting_type':'gbdt',
    'objective': 'regression',
    'learning_rate':0.005,
    'min_data_in_leaf': 30,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 3,
    'max_bin':256,
    "min_child_samples": 10,
    'metric': 'auc',
    'reg_lambda': 0,
    'min_split_gain': 0,
    #'device': 'gpu'
}

num_rounds = 501

In [12]:
# [7] Create cross validation model for hyp. tunning
lgb_cv = lgb.cv(params = params,
                train_set = dtrain,
                nfold = 4,
                num_boost_round = num_rounds,
                early_stopping_rounds = 100,
                stratified = True,
                shuffle = True,
                metrics = 'auc',
                verbose_eval = 10,
                seed = 451)

[10]	cv_agg's auc: 0.902041 + 0.00254329
[20]	cv_agg's auc: 0.901776 + 0.00256346
[30]	cv_agg's auc: 0.902051 + 0.00250556
[40]	cv_agg's auc: 0.901601 + 0.00257185
[50]	cv_agg's auc: 0.901142 + 0.00261741
[60]	cv_agg's auc: 0.901228 + 0.00259362
[70]	cv_agg's auc: 0.901277 + 0.00261505
[80]	cv_agg's auc: 0.901551 + 0.00259653
[90]	cv_agg's auc: 0.901804 + 0.00260093
[100]	cv_agg's auc: 0.901732 + 0.00259859
[110]	cv_agg's auc: 0.901727 + 0.0025564


In [31]:
# [8] Fit Lgb model and evaluate on the holdout set
lgb_model = lgb.train(params = params, 
                      train_set  = dtrain, 
                      valid_sets = [dtrain,dvalid], 
                      num_boost_round = 200, 
                      early_stopping_rounds = 100,
                      verbose_eval = 50)

Training until validation scores don't improve for 100 rounds.
[50]	training's auc: 0.904435	valid_1's auc: 0.887593
[100]	training's auc: 0.905552	valid_1's auc: 0.888237
Early stopping, best iteration is:
[16]	training's auc: 0.905453	valid_1's auc: 0.888545


## Keras/Tensorflow Neural Network

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.layers.advanced_activations import PReLU
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

In [15]:
# [9] Create Keras-TF bad boi
model = Sequential()

model.add(Dense(512, init='he_normal', input_shape=(train.shape[1],), ))
model.add(Dropout(.5))
model.add(PReLU())
model.add(BatchNormalization())

model.add(Dense(128, init='he_normal'))
model.add(Dropout(.5))
model.add(PReLU())
model.add(BatchNormalization())


model.add(Dense(1,activation='sigmoid'))
jensen = Adam(lr=1e-4, decay=0)
model.compile(loss='binary_crossentropy',optimizer=jensen, metrics=['accuracy'])


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [16]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 512)           7168        dense_input_1[0][0]              
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 512)           0           dense_1[0][0]                    
____________________________________________________________________________________________________
prelu_1 (PReLU)                  (None, 512)           512         dropout_1[0][0]                  
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (None, 512)           2048        prelu_1[0][0]                    
___________________________________________________________________________________________

In [22]:
# [10] Change data scale
scaler = MinMaxScaler()
train  = scaler.fit_transform(train)

scaler = MinMaxScaler()
probe  = scaler.fit_transform(probe)


In [25]:
# [11] Fit bad boi, evaluate on the holdout set
model.fit(x = train,
          y = train_y,
          batch_size = 64,
          nb_epoch = 100,
          verbose=2, 
          validation_data=(probe,probe_y) )

Train on 123837 samples, validate on 581903 samples
Epoch 1/100
17s - loss: 0.4957 - acc: 0.7752 - val_loss: 0.4008 - val_acc: 0.8278
Epoch 2/100
16s - loss: 0.4285 - acc: 0.8085 - val_loss: 0.3958 - val_acc: 0.8298
Epoch 3/100
15s - loss: 0.4105 - acc: 0.8173 - val_loss: 0.3939 - val_acc: 0.8309
Epoch 4/100
15s - loss: 0.4000 - acc: 0.8236 - val_loss: 0.3909 - val_acc: 0.8303
Epoch 5/100
15s - loss: 0.3941 - acc: 0.8257 - val_loss: 0.3899 - val_acc: 0.8291
Epoch 6/100
15s - loss: 0.3898 - acc: 0.8290 - val_loss: 0.3879 - val_acc: 0.8299
Epoch 7/100
15s - loss: 0.3881 - acc: 0.8280 - val_loss: 0.3873 - val_acc: 0.8291
Epoch 8/100
15s - loss: 0.3858 - acc: 0.8304 - val_loss: 0.3860 - val_acc: 0.8291
Epoch 9/100
15s - loss: 0.3832 - acc: 0.8318 - val_loss: 0.3852 - val_acc: 0.8300
Epoch 10/100
16s - loss: 0.3825 - acc: 0.8316 - val_loss: 0.3844 - val_acc: 0.8312
Epoch 11/100
15s - loss: 0.3804 - acc: 0.8319 - val_loss: 0.3837 - val_acc: 0.8310
Epoch 12/100
15s - loss: 0.3807 - acc: 0.831

15s - loss: 0.3573 - acc: 0.8422 - val_loss: 0.3727 - val_acc: 0.8347


<keras.callbacks.History at 0x7fd4f78eaf60>

## Model Evaluation & Ensembling

In [27]:
# [12] Evaluate NN in the holdout set
probe_pred = model.predict(probe)
roc_auc_score(probe_y, probe_pred)



0.8862473179273296

In [34]:
# [13] Evaluate Lgbm in the holdout set
probe_pred_lgb = lgb_model.predict(scaler.inverse_transform(probe))
roc_auc_score(probe_y, probe_pred_lgb)


0.8885450006107667

In [69]:
# [14] Nothing fancy, just a linear stacking
ensemble_probe_pred = np.add(np.multiply(probe_pred_lgb,0.96), (np.multiply(probe_pred,0.04).ravel()))
roc_auc_score(probe_y, ensemble_probe_pred)


0.8886615658398007

## Generate Final Predictions

In [71]:
# [15] Read Testing Data
test = pd.read_csv("testing_set.csv")
test.head()

Unnamed: 0,ID,var38,time_dif,month_x,month_y,numday_x,numday_y,var1,categoria
0,cli_100,36,766,8,6,4,29,0,tipo2
1,cli_100,36,766,8,6,4,29,0,tipo4
2,cli_100,36,788,8,6,26,29,0,tipo4
3,cli_100,36,788,8,6,26,29,0,tipo2
4,cli_100003,99,418,8,6,2,10,0,tipo2


In [72]:
# [16] One Hot Encode 'categoria'
test = pd.concat([test,pd.get_dummies(test['categoria'])],axis=1)
test.drop('categoria',axis=1,inplace=True)

# [17] Isolate IDs
test_id = test['ID']
test.drop('ID',axis=1,inplace=True)

In [75]:
# [18] Generate Lgb and DNN predictions
lgb_final_pred = lgb_model.predict(test)
dnn_final_pred = model.predict(scaler.transform(test))

In [76]:
# [19] Ensemble Predictions
ensemble_final_pred = np.add(np.multiply(lgb_final_pred,0.96), (np.multiply(dnn_final_pred,0.04).ravel()))

# [20] Create dataframe with IDs and export file
final_df = pd.DataFrame(ensemble_final_pred, index=test_id)
final_df.to_csv('final_pred.csv')

#### All done, thank you for the challenge.