In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from featuretools.primitives import AggregationPrimitive, make_agg_primitive
from datetime import datetime, timedelta
import re
import featuretools as ft
from sklearn.preprocessing import MinMaxScaler, Imputer

In [2]:
# Load the combined feature matrix; later cells split it on its 'set' column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(filepath_or_buffer="\\Users\\JoonH\\total_feature_matrix_spec.csv")

In [3]:
# Partition the rows by the value of the 'set' marker column.
train = data.loc[data['set'].eq('train')]
test = data.loc[data['set'].eq('test')]

In [4]:
# One-hot encode each partition independently.
train, test = pd.get_dummies(train), pd.get_dummies(test)

# Hold on to the labels, keep only the columns both frames share, then
# write the labels back onto the training frame.
train_labels = train['TARGET']
train, test = train.align(test, join='inner', axis=1)
train['TARGET'] = train_labels

In [5]:
# The test frame must not carry the label column.
test = test.drop(['TARGET'], axis=1)

In [6]:
# Divide the training data into three parts: train_A, train_B and train_C.
# test_size=0.6 leaves 40% of the rows in A; the remaining 60% is then
# split 70/30 into B and C. Both splits use the same fixed seed.
from sklearn.model_selection import train_test_split
train_A, train_BC = train_test_split(train, test_size=0.6, random_state=3)
train_B, train_C = train_test_split(train_BC, test_size=0.3, random_state=3)
del train_BC  # intermediate frame is not needed once B and C exist


In [7]:
# Sanity check: (rows, columns) of every partition on a single line.
print(*(frame.shape for frame in (train_A, train_B, train_C, test)))

(123004, 1157) (129154, 1157) (55353, 1157) (48744, 1156)


In [8]:
def process_data(data, imputer=None, scaler=None, fit=True):
    """Impute missing values and rescale a feature frame.

    The ``TARGET`` column, when present, is removed first so the label is
    never part of the returned features. The remaining columns go through
    the imputer and then the scaler.

    With the default arguments every call builds and fits a fresh median
    ``Imputer`` and a ``MinMaxScaler`` with range (0, 10), which is the
    historical behaviour. Be aware that this means each frame is transformed
    with statistics computed from that frame alone, so two frames processed
    by separate default calls are not on a common scale. To apply one
    consistent transform, create the transformers once, pass them in with
    ``fit=True`` for the training frame, and pass the same objects with
    ``fit=False`` for every other frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame, with or without a ``TARGET`` column. Not modified.
    imputer : object with ``fit_transform`` / ``transform``, optional
        Transformer applied first. Defaults to ``Imputer(strategy='median')``.
    scaler : object with ``fit_transform`` / ``transform``, optional
        Transformer applied second. Defaults to
        ``MinMaxScaler(feature_range=(0, 10))``.
    fit : bool, default True
        When True the transformers are fitted on ``data`` before being
        applied; when False their existing fitted state is reused.

    Returns
    -------
    The output of the scaler (a NumPy array for the default transformers).

    Raises
    ------
    ValueError
        If ``fit`` is False and ``imputer`` or ``scaler`` was not supplied,
        because a freshly created transformer has nothing to reuse.
    """
    if not fit and (imputer is None or scaler is None):
        raise ValueError(
            "fit=False requires already-fitted imputer and scaler objects"
        )

    # errors='ignore' covers frames (such as the test set) without a label.
    data_features = data.drop('TARGET', axis=1, errors='ignore')

    if imputer is None:
        imputer = Imputer(strategy='median')
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(0, 10))

    if fit:
        filled_data_features = imputer.fit_transform(data_features)
        scaled_data_features = scaler.fit_transform(filled_data_features)
    else:
        filled_data_features = imputer.transform(data_features)
        scaled_data_features = scaler.transform(filled_data_features)

    return scaled_data_features

In [9]:
# First layer models: 3 xgboost, 1 lightgbm, 2 neural nets
import xgboost as xgb

def train_xgb(clf, params, features, num_round):
    """Train one booster on a labelled feature frame.

    Parameters
    ----------
    clf : object exposing ``DMatrix`` and ``train``
        The calls in this notebook pass the ``xgboost`` module itself.
    params : dict
        Training parameters forwarded unchanged to ``clf.train``.
    features : pandas.DataFrame
        Must contain the ``TARGET`` column; it supplies the labels and the
        rest is preprocessed by ``process_data``.
    num_round : int
        Number of boosting rounds forwarded to ``clf.train``.

    Returns
    -------
    Whatever ``clf.train`` returns (the trained booster).
    """
    target = features['TARGET']
    design_matrix = clf.DMatrix(process_data(features), label=target)
    return clf.train(params, design_matrix, num_round)

In [10]:
# Booster 1 ("short tree"): max_depth 10, 1000 rounds on train_A.
param_1 = {
    'max_depth': 10,
    'eta': 0.001,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': ['auc', 'ams@0'],
}

bst_1 = train_xgb(xgb, param_1, train_A, 1000)

In [11]:
# Booster 2 ("medium tree"): max_depth 100, 1200 rounds on train_A.
param_2 = {
    'max_depth': 100,
    'eta': 0.0001,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': ['auc', 'ams@0'],
}

bst_2 = train_xgb(xgb, param_2, train_A, 1200)

In [12]:
# Booster 3 ("deep tree"): max_depth 750, 850 rounds on train_A.
param_3 = {
    'max_depth': 750,
    'eta': 0.0001,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': ['auc', 'ams@0'],
}

bst_3 = train_xgb(xgb, param_3, train_A, 850)

In [16]:
def predict_first_layer(xgb, clf, dataB, dataC, test):
    """Score three frames with one trained first-layer model.

    Parameters
    ----------
    xgb : object exposing ``DMatrix``
        The calls in this notebook pass the ``xgboost`` module.
    clf : object exposing ``predict``
        The calls in this notebook pass a booster returned by ``train_xgb``.
    dataB, dataC, test : pandas.DataFrame
        Frames to score. Each one is preprocessed by its own
        ``process_data`` call before being wrapped in a ``DMatrix``.

    Returns
    -------
    tuple
        ``(B_meta, C_meta, test_meta)`` — the outputs of ``clf.predict`` for
        the three frames, in argument order.
    """
    print('Preparing data...')
    matrices = [xgb.DMatrix(process_data(frame)) for frame in (dataB, dataC, test)]
    print('Data prepared, predicting...')
    B_meta, C_meta, test_meta = (clf.predict(matrix) for matrix in matrices)
    print('Done!')

    return B_meta, C_meta, test_meta

In [17]:
# Booster 1 predictions for train_B, train_C and the test frame.
bst_1_B_meta, bst_1_C_meta, bst_1_test_meta = predict_first_layer(
    xgb, bst_1, train_B, train_C, test
)

Preparing data...
Data prepared, predicting...
Done!


In [19]:
# Booster 2 predictions for train_B, train_C and the test frame.
bst_2_B_meta, bst_2_C_meta, bst_2_test_meta = predict_first_layer(
    xgb, bst_2, train_B, train_C, test
)

Preparing data...
Data prepared, predicting...
Done!


In [20]:
# Booster 3 predictions for train_B, train_C and the test frame.
bst_3_B_meta, bst_3_C_meta, bst_3_test_meta = predict_first_layer(
    xgb, bst_3, train_B, train_C, test
)

Preparing data...
Data prepared, predicting...
Done!


In [21]:
# Wrap each booster's train_B predictions in a single-column frame.
bst_1_B_meta = pd.DataFrame(data=bst_1_B_meta, columns=['eng_xgb_1'])
bst_2_B_meta = pd.DataFrame(data=bst_2_B_meta, columns=['eng_xgb_2'])
bst_3_B_meta = pd.DataFrame(data=bst_3_B_meta, columns=['eng_xgb_3'])

In [22]:
# Wrap each booster's train_C predictions in a single-column frame.
bst_1_C_meta = pd.DataFrame(data=bst_1_C_meta, columns=['eng_xgb_1'])
bst_2_C_meta = pd.DataFrame(data=bst_2_C_meta, columns=['eng_xgb_2'])
bst_3_C_meta = pd.DataFrame(data=bst_3_C_meta, columns=['eng_xgb_3'])

In [23]:
# Wrap each booster's test predictions in a single-column frame.
bst_1_test_meta = pd.DataFrame(data=bst_1_test_meta, columns=['eng_xgb_1'])
bst_2_test_meta = pd.DataFrame(data=bst_2_test_meta, columns=['eng_xgb_2'])
bst_3_test_meta = pd.DataFrame(data=bst_3_test_meta, columns=['eng_xgb_3'])

In [24]:
# Put the three boosters' predictions side by side, one column per booster.
B_meta_data = pd.concat(
    objs=[bst_1_B_meta, bst_2_B_meta, bst_3_B_meta],
    axis=1,
)
C_meta_data = pd.concat(
    objs=[bst_1_C_meta, bst_2_C_meta, bst_3_C_meta],
    axis=1,
)
test_meta_data = pd.concat(
    objs=[bst_1_test_meta, bst_2_test_meta, bst_3_test_meta],
    axis=1,
)

In [25]:
# Persist the xgboost meta-features as CSV (file names carry no extension).
for meta_frame, out_name in (
    (B_meta_data, 'eng_data_B_meta_xgb'),
    (C_meta_data, 'eng_data_C_meta_xgb'),
    (test_meta_data, 'eng_data_test_meta_xgb'),
):
    meta_frame.to_csv(out_name)

### Add neural network --- work on this part after logging out, as the local machine will most likely crash due to memory overload

In [None]:
# Run up to cell 7 first for the basic imports and data

In [None]:
from keras.models import *
from keras.layers import *
