In [139]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import lightgbm as lgb
import re

In [143]:
train = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')

###LABEL ENCODING

le = LabelEncoder()

for col in train:
    if train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train[col].unique())) <= 2:
            le.fit(train[col])
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

###ONE HOT ENCODING

train = pd.get_dummies(train)
test = pd.get_dummies(test)

train = train.dropna(axis='columns')
test = test.dropna(axis='columns')

labels = train['TARGET']

###ALIGN TRAIN AND TEST

train, test = train.align(test, join='inner', axis=1)
train['TARGET'] = labels

###DEFINE X AND Y

X = train.drop(columns=['TARGET'])
y = train['TARGET']

In [144]:
###RANDOM FOREST

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [145]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.07998699238266045
Mean Squared Error: 0.07998699238266045
Root Mean Squared Error: 0.282819717103777


In [146]:
print(accuracy_score(y_test, y_pred))

0.9200130076173395


In [150]:
print(confusion_matrix(y_test,y_pred))

[[93362     0]
 [ 8117     0]]


In [None]:
a = pd.Series(y_pred)
a.value_counts()

In [None]:
labels.value_counts()

In [None]:
train.shape

In [None]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(2)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [None]:
missing_values_table(train)

In [None]:
missing_values = missing_values_table(train)
missing_values.head()

In [None]:
labels = train['TARGET']

In [None]:
train, test = train.align(test, join='inner', axis=1)
train['TARGET'] = labels

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(train.shape[1])

# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [None]:
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Fit the model twice to avoid overfitting
for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(train, labels, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

In [None]:
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

In [None]:
feature_importances.head(20)

In [None]:
for col in train:
    if train[col].dtype == 'int64':
        print(train[col].describe())

In [None]:
###FEATURE SELECTION

train_missing = (train.isnull().sum() / len(train)).sort_values(ascending = False)
train_missing.head(20)

In [None]:
test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing.head(20)

In [None]:
train_missing = (train.isnull().sum() / len(train)).sort_values(ascending = False)
train_missing.head(20)

test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing.head(20)

train_missing = train_missing.index[train_missing > 0.5]
test_missing = test_missing.index[test_missing > 0.5]

all_missing = list(set(set(train_missing) | set(test_missing)))
print('There are %d columns with more than 50%% missing values' % len(all_missing))