In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
import gc

In [2]:
os.chdir('/home/luke/Desktop/kaggle/Home_Credit_Default_Risk')

In [24]:
application_test = pd.read_csv('application_test.csv')
application_train = pd.read_csv('application_train.csv')
merged_df = pd.read_csv('processed_input_data.csv')

In [27]:
labels = merged_df['TARGET']

In [28]:
def get_unique_high_corr_feature(df,first_n_features):
    
    #Geting sorted list of contributio of TARGET
    correlations = df.corr()['TARGET'].sort_values(ascending=False)
    
    abs_corr = abs(correlations).sort_values(ascending=False)
    
    #Many of the features contain(Max, Min and Mean) contribute the same, so dedup
    
    corr_top_unique_features = abs_corr.head(first_n_features).drop_duplicates()
    
    #Return a list of distince features contribute to the model
    
    return np.array(corr_top_unique_features.index)

In [57]:
name_list = get_unique_high_corr_feature(merged_df,600)

In [58]:
new_high_corr_data = merged_df[name_list]

In [59]:
def missing_values_tables(df):
    
    mis_val = df.isnull().sum()
    mis_val_percent = 100*df.isnull().sum()/len(df)
    mis_val_table = pd.concat([mis_val,mis_val_percent],axis = 1)
    
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0:'Missing Values',1:'% of Total Values'})
    
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values',ascending=False).round(1)
    
    #print('Your selected dataframe has '+str(df.shape[1])+" columns.\n" "There are " + str(mis_val_table_ren_columns.shape[0])+
    #    ' column that have missing values.')
    return mis_val_table_ren_columns

In [27]:
#missing_values_tables(new_high_corr_data)

In [60]:
def drop_toomuchmissing_columns(df,cutoff_percent):
    thresholds_list = missing_values_tables(df)['% of Total Values']<=cutoff_percent
    thresholds_list = thresholds_list[thresholds_list==True]
    
    maintain_list = np.array(thresholds_list.index)
    
    return df[maintain_list]

In [61]:
merged_data = drop_toomuchmissing_columns(new_high_corr_data,70)

In [62]:
#Label Encoding
def process_dataframe(input_df, encoder_dict=None):
    """ Process a dataframe into a form useable by LightGBM """

    # Label encode categoricals
    print('Label encoding categorical features...')
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))
    print('Label encoding complete.')

    return input_df, categorical_feats.tolist(), encoder_dict

In [63]:
process_dataframe(new_high_corr_data)

Label encoding categorical features...
Label encoding complete.


(        TARGET  EXT_SOURCE_3  EXT_SOURCE_2  EXT_SOURCE_1  \
 0          1.0      0.139376      0.262949      0.083037   
 1          0.0           NaN      0.622246      0.311267   
 2          0.0      0.729567      0.555912           NaN   
 3          0.0           NaN      0.650442           NaN   
 4          0.0           NaN      0.322738           NaN   
 5          0.0      0.621226      0.354225           NaN   
 6          0.0      0.492060      0.724000      0.774761   
 7          0.0      0.540654      0.714279           NaN   
 8          0.0      0.751724      0.205747      0.587334   
 9          0.0           NaN      0.746644           NaN   
 10         0.0      0.363945      0.651862      0.319760   
 11         0.0      0.652897      0.555183      0.722044   
 12         0.0      0.176653      0.715042      0.464831   
 13         0.0      0.770087      0.566907           NaN   
 14         0.0           NaN      0.642656      0.721940   
 15         0.0      0.6

In [64]:
#Get dummies
merged_df = pd.get_dummies(new_high_corr_data)

In [65]:
len_train = len(application_train)
app_train = new_high_corr_data[:len_train]
app_test = new_high_corr_data[len_train:]

In [66]:
# Make a new dataframe for polynomial features
poly_features = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

# imputer for handling missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

poly_target = poly_features['TARGET']

poly_features = poly_features.drop(columns = ['TARGET'])

# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures
                                  
# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree = 3)

In [67]:
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

Polynomial Features shape:  (307511, 35)


In [68]:
poly_transformer.get_feature_names(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:15]

['1',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DAYS_BIRTH',
 'EXT_SOURCE_1^2',
 'EXT_SOURCE_1 EXT_SOURCE_2',
 'EXT_SOURCE_1 EXT_SOURCE_3',
 'EXT_SOURCE_1 DAYS_BIRTH',
 'EXT_SOURCE_2^2',
 'EXT_SOURCE_2 EXT_SOURCE_3',
 'EXT_SOURCE_2 DAYS_BIRTH',
 'EXT_SOURCE_3^2',
 'EXT_SOURCE_3 DAYS_BIRTH',
 'DAYS_BIRTH^2']

In [69]:
# Create a dataframe of the features 
poly_features = pd.DataFrame(poly_features, 
                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))

EXT_SOURCE_2 EXT_SOURCE_3                -0.193939
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189605
EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176428
EXT_SOURCE_2 EXT_SOURCE_3^2              -0.172282
EXT_SOURCE_1 EXT_SOURCE_2                -0.166625
EXT_SOURCE_1 EXT_SOURCE_3                -0.164065
EXT_SOURCE_2                             -0.160295
EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156867
EXT_SOURCE_3                             -0.155892
EXT_SOURCE_1 EXT_SOURCE_3^2              -0.150822
Name: TARGET, dtype: float64
EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH    0.155891
EXT_SOURCE_2 DAYS_BIRTH                 0.156873
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH    0.181283
TARGET                                  1.000000
1                                            NaN
Name: TARGET, dtype: float64


In [70]:
# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test, 
                                  columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                                'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Merge polynomial features into training dataframe
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
app_train_poly = app_train.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Merge polnomial features into testing dataframe
poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
app_test_poly = app_test.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape:  ', app_test_poly.shape)

Training data with polynomial features shape:  (307511, 332)
Testing data with polynomial features shape:   (48744, 332)


In [82]:
app_train_poly.shape

(307511, 332)

In [73]:
from sklearn.preprocessing import MinMaxScaler, Imputer

# Drop the target from the training data
if 'TARGET' in app_train_poly:
    train = app_train_poly.drop(columns = ['TARGET'])
else:
    train = app_train_poly.copy()
features = list(train.columns)

# Copy of the testing data
if 'TARGET' in app_test_poly:
    test = app_test_poly.drop(columns = ['TARGET'])
else:
    test = app_test_poly.copy()

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 332)
Testing data shape:  (48744, 332)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [74]:

gc.collect()

77

In [56]:
# Format the training and testing data 
#train = np.array(train_df)
#test = np.array(test_df.drop(columns='TARGET',inplace= True))

#train_labels = train_df.pop('TARGET')

# 10 fold cross validation
folds = KFold(n_splits=5, shuffle=True, random_state=50)

# Validation and test predictions
valid_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])

# Iterate through each fold
for n_fold, (train_indices, valid_indices) in enumerate(folds.split(train)):
    # Training data for the fold
    train_fold, train_fold_labels = train[train_indices, :], labels[train_indices]
    
    # Validation data for the fold
    valid_fold, valid_fold_labels = train[valid_indices, :], labels[valid_indices]
    
    # LightGBM classifier with hyperparameters
    clf = LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.1,
        subsample=.8,
        max_depth=-1,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        boosting = 'dart',
        drop_rate = 0.02
    )
    
    # Fit on the training data, evaluate on the validation data
    clf.fit(train_fold, train_fold_labels, 
            eval_set= [(train_fold, train_fold_labels), (valid_fold, valid_fold_labels)], 
            eval_metric='auc', early_stopping_rounds=100, verbose = False
           )
    
    # Validation preditions
    valid_preds[valid_indices] = clf.predict_proba(valid_fold, num_iteration=clf.best_iteration_)[:, 1]
    
    # Testing predictions
    test_preds += clf.predict_proba(test_df, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    # Display the performance for the current fold
    print('Fold %d AUC : %0.6f' % (n_fold + 1, roc_auc_score(valid_fold_labels, valid_preds[valid_indices])))
    
    # Delete variables to free up memory
    del clf, train_fold, train_fold_labels, valid_fold, valid_fold_labels
    gc.collect()

Fold 1 AUC : 0.782187
Fold 2 AUC : 0.780728
Fold 3 AUC : 0.784836
Fold 4 AUC : 0.784356
Fold 5 AUC : 0.780832


In [93]:
train.shape,app_train.shape

((307511, 806), (307511, 807))

In [94]:
x_train = train
y_train = app_train.TARGET

In [95]:
x_train.shape,y_train.shape

((307511, 806), (307511,))

In [96]:
partial_x_train_merged = x_train[:300000]
x_val_merged = x_train[3000001:]

In [97]:
partial_y_train_merged = y_train[:300000]
y_val_merged = y_train[300001:]

In [98]:
from keras import models
from keras import layers
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier

In [101]:
%%time
# Function to create model, required for KerasClassifier
def create_model(optimizer, init,reg):
    model = models.Sequential()
    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),input_shape=(806,)))
    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),activation='relu'))
    model.add(layers.Dense(1, kernel_initializer=init, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])
    return(model)
model = KerasClassifier(build_fn=create_model, verbose=0)

# grid search epochs, batch size and optimizer
optimizers = ['adam']
init = ['normal']
reg = [0.001]
epochs = [20,25,50]
batches = [200,400,500]

param_grid = dict(optimizer = optimizers, init = init,epochs = epochs,batch_size = batches,reg =reg)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc')
grid_result = grid.fit(partial_x_train_merged, partial_y_train_merged)

CPU times: user 48min 21s, sys: 4min 9s, total: 52min 31s
Wall time: 32min 5s


In [102]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.762180 using {'batch_size': 500, 'epochs': 50, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761158 (0.003153) with: {'batch_size': 200, 'epochs': 20, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.760063 (0.003258) with: {'batch_size': 200, 'epochs': 25, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.760901 (0.003037) with: {'batch_size': 200, 'epochs': 50, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761003 (0.002995) with: {'batch_size': 400, 'epochs': 20, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761618 (0.003116) with: {'batch_size': 400, 'epochs': 25, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761124 (0.003346) with: {'batch_size': 400, 'epochs': 50, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761168 (0.003311) with: {'batch_size': 500, 'epochs': 20, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.001}
0.761258 (0.003373) with: {'batch_size': 500, 'epochs': 25, 'init': 'normal', 'optimizer': 'adam', 'r