In [1]:
import pandas as pd
from math import ceil
import numpy as np
import pydot as pt
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_curve, auc, roc_auc_score

import lightgbm as lgb
import os
#print(os.listdir("../input"))
import sys

In [3]:
# Load the raw competition files.
train_df = pd.read_csv('train.csv', low_memory = False)
test_df = pd.read_csv('test.csv')

# The first row of the training frame is used as the column header and is
# then discarded (row label 0), so only data rows remain.
columns_train = train_df.iloc[0].values
train_df.columns = columns_train

# Drop the header row and the identifier column, then coerce every remaining
# column to a numeric dtype.
train_df = (
    train_df
    .drop(labels = 0, axis = 'index')
    .drop(labels = 'ID_code', axis = 'columns')
    .apply(pd.to_numeric)
)
test_df = (
    test_df
    .drop(labels = 'ID_code', axis = 'columns')
    .apply(pd.to_numeric)
)

In [4]:
# Count how many training rows carry each target label (class balance check).
train_df.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train_df.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [6]:
# Every column other than the identifier and the label is a model input.
excluded_columns = ('ID_code', 'target')
features = [col for col in train_df.columns if col not in excluded_columns]
target = train_df['target']

# Parameter dictionary handed to lgb.train.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.38,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.045,
    'learning_rate': 0.0105,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1,
}

In [7]:
from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold splitter without shuffling.
# `random_state` is intentionally NOT passed: it has no effect while
# shuffle=False, and scikit-learn >= 0.24 raises a ValueError when a
# random_state is combined with shuffle=False. Fold assignment is therefore
# identical to what the previous call produced on older versions.
folds = StratifiedKFold(n_splits = 3, shuffle = False)  # author's note: split = 12

# Out-of-fold predictions (one slot per training row) and the accumulator for
# the fold-averaged test-set predictions.
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [8]:
# Cross-validated LightGBM training: for each fold, fit on the training rows
# with early stopping monitored on the validation rows, store the validation
# predictions in `oof`, and add this fold's share of the test predictions.
fold_splits = folds.split(train_df.values, target.values)
for fold_, (trn_idx, val_idx) in enumerate(fold_splits):
    print("Fold :%d" % (fold_ + 1))

    trn_frame = train_df.iloc[trn_idx]
    val_frame = train_df.iloc[val_idx]
    trn_data = lgb.Dataset(trn_frame[features], label = target.iloc[trn_idx])
    val_data = lgb.Dataset(val_frame[features], label = target.iloc[val_idx])

    clf = lgb.train(
        param,
        trn_data,
        100000,
        valid_sets = [trn_data, val_data],
        verbose_eval = 1000,
        early_stopping_rounds = 1000,
    )

    # Predict with the iteration that scored best on the validation set.
    best_iter = clf.best_iteration
    oof[val_idx] = clf.predict(val_frame[features], num_iteration = best_iter)
    predictions += clf.predict(test_df[features], num_iteration = best_iter) / folds.n_splits

Fold :1
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.907343	valid_1's auc: 0.883443
[2000]	training's auc: 0.919281	valid_1's auc: 0.89092
[3000]	training's auc: 0.927248	valid_1's auc: 0.894543
[4000]	training's auc: 0.933367	valid_1's auc: 0.896366
[5000]	training's auc: 0.938884	valid_1's auc: 0.897476
[6000]	training's auc: 0.943735	valid_1's auc: 0.898016
[7000]	training's auc: 0.948279	valid_1's auc: 0.898232
[8000]	training's auc: 0.952616	valid_1's auc: 0.89832
[9000]	training's auc: 0.956711	valid_1's auc: 0.898291
Early stopping, best iteration is:
[8235]	training's auc: 0.95358	valid_1's auc: 0.898349
Fold :2
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.906269	valid_1's auc: 0.883906
[2000]	training's auc: 0.918638	valid_1's auc: 0.892155
[3000]	training's auc: 0.926792	valid_1's auc: 0.895534
[4000]	training's auc: 0.933218	valid_1's auc: 0.897256
[5000]	training's auc: 0.938722	valid_1'

NameError: name 'roc_auc_score' is not defined

In [11]:
# AUC of the out-of-fold predictions against the true labels; the message is
# emitted both through sys.stdout.write (no newline) and through print.
cv_message = "CV score: {:<8.5f}".format(roc_auc_score(target, oof))
sys.stdout.write(cv_message)
print(cv_message)

CV score: 0.89921 CV score: 0.89921 


In [None]:
#sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
#sub["target"] = predictions
#sub.to_csv('submission.csv', index=False)

In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Display the column labels that were taken from the first row of the raw
# training frame.
columns_train

array(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10', 'var_11',
       'var_12', 'var_13', 'var_14', 'var_15', 'var_16', 'var_17',
       'var_18', 'var_19', 'var_20', 'var_21', 'var_22', 'var_23',
       'var_24', 'var_25', 'var_26', 'var_27', 'var_28', 'var_29',
       'var_30', 'var_31', 'var_32', 'var_33', 'var_34', 'var_35',
       'var_36', 'var_37', 'var_38', 'var_39', 'var_40', 'var_41',
       'var_42', 'var_43', 'var_44', 'var_45', 'var_46', 'var_47',
       'var_48', 'var_49', 'var_50', 'var_51', 'var_52', 'var_53',
       'var_54', 'var_55', 'var_56', 'var_57', 'var_58', 'var_59',
       'var_60', 'var_61', 'var_62', 'var_63', 'var_64', 'var_65',
       'var_66', 'var_67', 'var_68', 'var_69', 'var_70', 'var_71',
       'var_72', 'var_73', 'var_74', 'var_75', 'var_76', 'var_77',
       'var_78', 'var_79', 'var_80', 'var_81', 'var_82', 'var_83',
       'var_84', 'var_85', 'var_86', 'var_87', 'var

In [16]:
# Hold out 30% of the training rows for validation (fixed seed).
# columns_train[1] is the label column; columns_train[2:] are the inputs.
label_column = columns_train[1]
input_columns = columns_train[2:]
X_train, X_test, y_train, y_test = train_test_split(
    train_df[input_columns],
    train_df[label_column],
    test_size = 0.3,
    random_state = 2019,
)

In [17]:
# Sanity check: (rows, columns) of the training and hold-out feature frames.
print(X_train.shape, X_test.shape)

(140000, 200) (60000, 200)


In [18]:
# Baseline model on the hold-out split (X_train / y_train):
# logistic regression with C = 0.001 and balanced class weights.
logist_params = dict(C = 0.001, class_weight = 'balanced')
logist = LogisticRegression(**logist_params)

# Fit on the training portion; the fitted estimator is the cell's output.
logist.fit(X_train, y_train)

LogisticRegression(C=0.001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [19]:
# Score the hold-out split. Only column 1 of predict_proba is kept, i.e. the
# probability of the second entry in logist.classes_ (label 1 for a 0/1 target).
logist_pred = logist.predict_proba(X_test)[:,1]

In [20]:
def performance(Y_test, logist_pred, threshold = 0.5):
    """Print a confusion matrix and the ROC AUC for binary classifier scores.

    Parameters
    ----------
    Y_test : array-like
        True labels; label 1 is treated as the positive class for the ROC.
    logist_pred : array-like
        Continuous scores for the positive class (e.g. predicted
        probabilities). Despite the name, scores from any model work.
    threshold : float, default 0.5
        Cut-off used only for the confusion matrix: scores strictly below it
        become label 0, everything else label 1. The AUC does not depend on it.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Vectorised thresholding; `< threshold -> 0, else 1` mirrors the
    # original element-wise rule exactly.
    hard_labels = np.where(np.asarray(logist_pred) < threshold, 0, 1)
    print('Confusion Matrix:')
    print(confusion_matrix(Y_test, hard_labels))

    # The ROC curve is built from the raw scores, not the thresholded labels.
    fpr, tpr, _ = roc_curve(Y_test, logist_pred, pos_label = 1)
    print('AUC:')
    print(auc(fpr, tpr))

In [21]:
# Report confusion matrix and AUC of the logistic-regression scores on the
# hold-out split.
performance(y_test, logist_pred)

Confusion Matrix:
[[42214 11737]
 [ 1404  4645]]
AUC:
0.8541974369026266


In [22]:
# Single decision tree: balanced class weights, 70% of the features considered
# per split, at least 80 samples per leaf, fixed seed.
tree_params = dict(
    class_weight = 'balanced',
    random_state = 2019,
    max_features = 0.7,
    min_samples_leaf = 80,
)
tree_clf = DecisionTreeClassifier(**tree_params)

# Fit on the training portion; the fitted estimator is the cell's output.
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=80, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2019,
            splitter='best')

In [23]:
# Score the hold-out split with the decision tree (column 1 of predict_proba)
# and print its confusion matrix and AUC.
tree_preds = tree_clf.predict_proba(X_test)[:, 1]
performance(y_test, tree_preds)

Confusion Matrix:
[[35314 18637]
 [ 2629  3420]]
AUC:
0.6514727493199708


In [None]:
# Random forest with balanced class weights and a fixed seed.
# Alternative values noted by the author: n_estimators = 100,
# min_samples_leaf = 100.
forest_params = dict(
    n_estimators = 25,
    random_state = 2019,
    verbose = 1,
    class_weight = 'balanced',
    max_features = 0.5,
    min_samples_leaf = 25,
)
random_forest = RandomForestClassifier(**forest_params)

# Fit on the training portion; the fitted estimator is the cell's output.
random_forest.fit(X_train, y_train)

In [None]:
# Score the hold-out split with the random forest (column 1 of predict_proba)
# and print its confusion matrix and AUC.
forest_preds = random_forest.predict_proba(X_test)[:, 1]
performance(y_test, forest_preds)

In [None]:
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, RepeatVector, Lambda, Permute, Activation, Masking, Reshape
from keras.layers import recurrent, Input, TimeDistributed, add, concatenate, Multiply, Bidirectional
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, Callback, LearningRateScheduler
from keras.optimizers import SGD, Adam
from keras.layers.normalization import BatchNormalization

from keras.activations import softmax
from keras.metrics import categorical_accuracy
import keras.backend as K
from keras.regularizers import l2
from keras import initializers
#from keras.utils.visualize_util import plot
from keras.layers.core import Layer

from keras.activations import softmax, tanh, sigmoid, hard_sigmoid, relu

In [None]:
def create_models(in_shape):
    """Build and compile a small feed-forward softmax classifier.

    Architecture: Input(in_shape) -> Dense(100, relu) -> Dense(50, relu)
    -> Dropout(0.30) -> Dense(2, softmax).

    Parameters
    ----------
    in_shape : int
        Number of input features per sample.

    Returns
    -------
    Model
        Model compiled with categorical cross-entropy loss, the accuracy
        metric, and an Adam optimizer whose learning rate is 0.0003.
    """
    # Best learning rate found in previous experiments. This instance must be
    # handed to compile(); passing the string 'adam' would build a fresh
    # optimizer with the library's default learning rate instead.
    adam = Adam(lr = 0.0003)

    input_layer = Input(shape = (in_shape,), name = 'input_layer')
    dense = Dense(100, activation = 'relu')(input_layer)
    #dense = Dropout(0.35)(dense)
    dense = Dense(50, activation = 'relu')(dense)
    dense = Dropout(0.30)(dense)
    output = Dense(2, activation = 'softmax')(dense)

    model = Model(inputs = input_layer, outputs = output)
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
    #model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    # Disabled experiment, kept for reference:
    #tf_thetas = tf.get_variable("tf_thetas",
    #                        initializer=thetas)

    #sample_output = Lambda(lambda x: 
    #               gumbel_softmax(x, temperature,  hard = hard_val), 
    #               output_shape = (2,))(output)
    #cond_prob = Lambda(lambda x: tf.einsum('ai,ij->aj', x[0], tf_thetas*1.),
    #          output_shape = (2, ))([sample_output])

    #model_ask = Model(inputs = input_layer, outputs = cond_prob)
    #model_ask.compile(loss = 'categorical_crossentropy', 
    #              optimizer = adam, metrics = ['accuracy'])
    
    return model#, model_ask

In [None]:
# Draw a random ordering of all training row positions (sampling without
# replacement) and build row-shuffled copies of the features and labels.
n_train_rows = len(X_train)
random_idx = np.random.choice(n_train_rows, n_train_rows, replace = False)

X_train_nn, y_train_nn = X_train.iloc[random_idx], y_train.iloc[random_idx]

In [None]:
# Clear the Keras backend session, then build the network with one input per
# feature column of X_train.
K.clear_session()
model = create_models(X_train.shape[1])

In [None]:
from keras.utils import to_categorical
# One-hot encode the 0/1 labels so they match the network's 2-unit softmax
# output and its categorical cross-entropy loss.
y_binary_train = to_categorical(y_train)
y_binary_test = to_categorical(y_test)

In [None]:
# Idea for later: build several models (K-fold variants, SVM, NN, logistic
# regression, LightGBM, each in several configurations) and average them.
fit_options = dict(
    epochs = 50,
    batch_size = 25,
    verbose = 1,
    validation_data = (X_test, y_binary_test),
)
h = model.fit(X_train, y_binary_train, **fit_options)

In [None]:
# Network outputs for the hold-out split; each row holds the two softmax
# outputs of the final Dense(2) layer.
y_predicted_nn = model.predict(X_test)

In [None]:
# Turn each pair of network outputs into a hard label: 0 only when the first
# output is strictly larger than the second, otherwise 1 (ties go to 1).
temp_nn_predic = [0 if pair[0] > pair[1] else 1 for pair in y_predicted_nn]

In [None]:
# There is no gain with more layers!
# The ROC curve needs continuous scores. Feeding it thresholded 0/1 labels
# collapses the curve to a single operating point and yields an AUC that is
# not comparable with the probability-based AUCs of the other models, so the
# class-1 softmax output (column 1 of the one-hot encoding) is used instead.
nn_scores = y_predicted_nn[:, 1]
fpr_nn, tpr_nn, threshold_nn = roc_curve(y_test, nn_scores)
roc_auc_nn = auc(fpr_nn, tpr_nn)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr_nn, tpr_nn, 'b', label = 'AUC = %0.2f' % roc_auc_nn)
plt.legend(loc = 'lower right')
# Diagonal reference line (the ROC of a random classifier).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
#TODO: run several configurations for each model, then combine