In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import lightgbm as lgb
import re

In [24]:
train = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')

In [69]:
train.shape

(307511, 122)

In [70]:
test.shape

(48744, 121)

In [48]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(2)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [50]:
missing_values_table(train)

Your selected dataframe has 240 columns.
There are 61 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MODE,214865,69.87
COMMONAREA_MEDI,214865,69.87
COMMONAREA_AVG,214865,69.87
NONLIVINGAPARTMENTS_MODE,213514,69.43
NONLIVINGAPARTMENTS_AVG,213514,69.43
...,...,...
EXT_SOURCE_2,660,0.21
AMT_GOODS_PRICE,278,0.09
AMT_ANNUITY,12,0.00
CNT_FAM_MEMBERS,2,0.00


In [13]:
missing_values = missing_values_table(train)
missing_values.head()

Your selected dataframe has 122 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MEDI,214865,69.87
COMMONAREA_AVG,214865,69.87
COMMONAREA_MODE,214865,69.87
NONLIVINGAPARTMENTS_MEDI,213514,69.43
NONLIVINGAPARTMENTS_MODE,213514,69.43


In [25]:
###LABEL ENCODING

le = LabelEncoder()

for col in train:
    if train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train[col].unique())) <= 2:
            le.fit(train[col])
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

###ONE HOT ENCODING

train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [26]:
labels = train['TARGET']

In [27]:
train, test = train.align(test, join='inner', axis=1)
train['TARGET'] = labels

In [9]:
train.shape

(307511, 240)

In [10]:
test.shape

(48744, 239)

In [28]:
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(train.shape[1])

# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [29]:
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Fit the model twice to avoid overfitting
for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(train, labels, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.598139
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.598139


In [30]:
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

In [32]:
feature_importances.head(20)

Unnamed: 0,feature,importance
0,SK_ID_CURR,2.0
239,TARGET,1.0
6,AMT_CREDIT,0.5
2,FLAG_OWN_CAR,0.0
166,ORGANIZATION_TYPE_Agriculture,0.0
154,OCCUPATION_TYPE_Salesstaff,0.0
155,OCCUPATION_TYPE_Secretaries,0.0
156,OCCUPATION_TYPE_Securitystaff,0.0
157,OCCUPATION_TYPE_Waitersbarmenstaff,0.0
158,WEEKDAY_APPR_PROCESS_START_FRIDAY,0.0


In [None]:
for col in train:
    if train[col].dtype == 'int64':
        print(train[col].describe())

In [None]:
###FEATURE SELECTION

train_missing = (train.isnull().sum() / len(train)).sort_values(ascending = False)
train_missing.head(20)

In [None]:
test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing.head(20)

In [67]:
train_missing = (train.isnull().sum() / len(train)).sort_values(ascending = False)
train_missing.head(20)

test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing.head(20)

train_missing = train_missing.index[train_missing > 0.5]
test_missing = test_missing.index[test_missing > 0.5]

all_missing = list(set(set(train_missing) | set(test_missing)))
print('There are %d columns with more than 50%% missing values' % len(all_missing))

There are 38 columns with more than 50% missing values
