In [1]:
import pandas as pd
import numpy as np
#from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv("\\Users\\JoonH\\Desktop\\Home_Credit_Challenge\\all\\application_train.csv")
test = pd.read_csv("\\Users\\JoonH\\Desktop\\Home_Credit_Challenge\\all\\application_test.csv")

In [3]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)
train_labels= train['TARGET']
train, test = train.align(test, join = 'inner', axis = 1)
train['TARGET'] = train_labels

In [4]:
test_id = test['SK_ID_CURR']
train = train.drop('SK_ID_CURR', axis = 1)
test = test.drop('SK_ID_CURR', axis = 1)

In [5]:
# Devide train data into 3: train A, B, and C
from sklearn.model_selection import train_test_split
train_A, train_B = train_test_split(train, test_size = 0.6, random_state = 3)
train_B, train_C = train_test_split(train_B, test_size = 0.5, random_state = 3)


In [6]:
print(train_A.shape, train_B.shape, train_C.shape, test.shape)

(123004, 242) (92253, 242) (92254, 242) (48744, 241)


In [7]:
# First level models to use

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


In [8]:
ada = AdaBoostClassifier(n_estimators = 100, learning_rate = 0.01) # AdaBoost
bag = BaggingClassifier(n_estimators = 50, max_samples = 0.2,
                        max_features = 1.0, warm_start = True,
                        n_jobs = -1) # Bagging
gbc = GradientBoostingClassifier(learning_rate = 0.01,
                                 n_estimators = 600,
                                 max_depth = 10,
                                 subsample = 0.5) # Gradient Boosting Classifier
rf = RandomForestClassifier(n_estimators = 100,
                            n_jobs = -1)# Random Forest
gp = GaussianProcessClassifier(n_jobs = -1) # Gaussian Process
log = LogisticRegression(C = 0.1, n_jobs = -1) # Logistic Regression
rid = RidgeClassifier(alpha = 0.5) # Ridge Classifier
sgd = SGDClassifier(n_jobs = -1) # SGD Classifier
gnb = GaussianNB() # Gaussian Naive Bayes
knn = KNeighborsClassifier(n_neighbors = 2, n_jobs = -1) # K Neighbors Classifier

In [9]:
from sklearn.preprocessing import MinMaxScaler, Imputer

In [10]:
def train_first_layer_model(clf, train_data):
    train_labels = train_data['TARGET']
    train_features = train_data.drop('TARGET', axis = 1)
    #encoded_train_features = pd.get_dummies(train_features)

    imputer = Imputer(strategy = 'median')
    filled_train_features = imputer.fit_transform(train_features)
    scaler = MinMaxScaler(feature_range = (0, 1))
    scaled_train_features = scaler.fit_transform(filled_train_features)
    
    clf.fit(scaled_train_features, train_labels)
    
    return clf

In [None]:
# Train all the first level models

In [11]:
ada = train_first_layer_model(ada, train_A)

In [12]:
bag = train_first_layer_model(bag, train_A)

In [13]:
gbc = train_first_layer_model(gbc, train_A)

In [14]:
rf = train_first_layer_model(rf, train_A)

In [15]:
log = train_first_layer_model(log, train_A) #log

  " = {}.".format(self.n_jobs))


In [16]:
rid = train_first_layer_model(rid, train_A) #rid

In [17]:
sgd = train_first_layer_model(sgd, train_A) #sgd



In [18]:
gnb = train_first_layer_model(gnb, train_A) #gnb

In [19]:
knn = train_first_layer_model(knn, train_A) #knn

In [23]:
# make predictions of train_B, train_C, and test with first layer models
# build a meta model and train it with B, and validate with C for hypter parameter tuning
# train meta model on A,B,C and predict test

def process_data(data):
    
    if 'TARGET' in data.columns:
        data_features = data.drop('TARGET', axis = 1)
    else:
        data_features = data.copy()
        
    #encoded_data_features = pd.get_dummies(data_features)
    imputer = Imputer(strategy = 'median')
    filled_data_features = imputer.fit_transform(data_features)
    scaler = MinMaxScaler(feature_range = (0, 10))
    scaled_data_features = scaler.fit_transform(filled_data_features)
    
    return scaled_data_features

def predict_first_layer(clf, dataB, dataC, test):
    B_features = process_data(dataB)
    C_features = process_data(dataC)
    print(B_features.shape, C_features.shape)
    test_features = process_data(test)
    B_meta = clf.predict(B_features)
    C_meta = clf.predict(C_features)
    test_meta = clf.predict(test_features)
    
    return B_meta, C_meta, test_meta


In [335]:
ada_B_meta, ada_C_meta, ada_test_meta = predict_first_layer(ada, train_B, 
                                                            train_C, test)

(92253, 241) (92254, 241)


In [336]:
bag_B_meta, bag_C_meta, bag_test_meta = predict_first_layer(bag, train_B, 
                                                            train_C, test)

(92253, 241) (92254, 241)


In [337]:
#et_B_meta, et_C_meta, et_test_meta = predict_first_layer(et, train_B, 
#                                                            train_C, test)
# keeps generating model error during prediction, left untouched for now

In [338]:
gbc_B_meta, gbc_C_meta, gbc_test_meta = predict_first_layer(gbc, train_B, 
                                                            train_C, test)


(92253, 241) (92254, 241)


In [339]:
rf_B_meta, rf_C_meta, rf_test_meta = predict_first_layer(rf, train_B, 
                                                            train_C, test)


(92253, 241) (92254, 241)


In [340]:
log_B_meta, log_C_meta, log_test_meta = predict_first_layer(log, train_B, 
                                                            train_C, test)

(92253, 241) (92254, 241)


In [341]:
rid_B_meta, rid_C_meta, rid_test_meta = predict_first_layer(rid, train_B, 
                                                            train_C, test)
sgd_B_meta, sgd_C_meta, sgd_test_meta = predict_first_layer(sgd, train_B, 
                                                            train_C, test)
gnb_B_meta, gnb_C_meta, gnb_test_meta = predict_first_layer(gnb, train_B, 
                                                            train_C, test)
knn_B_meta, knn_C_meta, knn_test_meta = predict_first_layer(knn, train_B, 
                                                            train_C, test)

(92253, 241) (92254, 241)
(92253, 241) (92254, 241)
(92253, 241) (92254, 241)
(92253, 241) (92254, 241)


In [319]:
#Build meta_models, fit them to train B, validate on train C

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

In [342]:
#create metadata for meta_B

ada_B_meta = pd.DataFrame(ada_B_meta, columns = ['ada'])
bag_B_meta = pd.DataFrame(bag_B_meta, columns = ['bag'])
gbc_B_meta = pd.DataFrame(gbc_B_meta, columns = ['gbc'])
rf_B_meta = pd.DataFrame(rf_B_meta, columns = ['rf'])
log_B_meta = pd.DataFrame(log_B_meta, columns = ['log'])
rid_B_meta = pd.DataFrame(rid_B_meta, columns = ['rid'])
sgd_B_meta = pd.DataFrame(sgd_B_meta, columns = ['sgd'])
gnb_B_meta = pd.DataFrame(gnb_B_meta, columns = ['gnb'])
knn_B_meta = pd.DataFrame(knn_B_meta, columns = ['knn'])



In [350]:
ada_C_meta = pd.DataFrame(ada_C_meta, columns = ['ada'])
bag_C_meta = pd.DataFrame(bag_C_meta, columns = ['bag'])
gbc_C_meta = pd.DataFrame(gbc_C_meta, columns = ['gbc'])
rf_C_meta = pd.DataFrame(rf_C_meta, columns = ['rf'])
log_C_meta = pd.DataFrame(log_C_meta, columns = ['log'])
rid_C_meta = pd.DataFrame(rid_C_meta, columns = ['rid'])
sgd_C_meta = pd.DataFrame(sgd_C_meta, columns = ['sgd'])
gnb_C_meta = pd.DataFrame(gnb_C_meta, columns = ['gnb'])
knn_C_meta = pd.DataFrame(knn_C_meta, columns = ['knn'])

In [380]:
ada_test_meta = pd.DataFrame(ada_test_meta, columns = ['ada'])
bag_test_meta = pd.DataFrame(bag_test_meta, columns = ['bag'])
gbc_test_meta = pd.DataFrame(gbc_test_meta, columns = ['gbc'])
rf_test_meta = pd.DataFrame(rf_test_meta, columns = ['rf'])
log_test_meta = pd.DataFrame(log_test_meta, columns = ['log'])
rid_test_meta = pd.DataFrame(rid_test_meta, columns = ['rid'])
sgd_test_meta = pd.DataFrame(sgd_test_meta, columns = ['sgd'])
gnb_test_meta = pd.DataFrame(gnb_test_meta, columns = ['gnb'])
knn_test_meta = pd.DataFrame(knn_test_meta, columns = ['knn'])

In [381]:

B_meta_data = pd.concat([ada_B_meta, bag_B_meta, gbc_B_meta, rf_B_meta,
                         log_B_meta, rid_B_meta, sgd_B_meta,
                         gnb_B_meta, knn_B_meta],axis = 1)

C_meta_data = pd.concat([ada_C_meta, bag_C_meta, gbc_C_meta, rf_C_meta,
                         log_C_meta, rid_C_meta, sgd_C_meta,
                         gnb_C_meta, knn_C_meta],axis = 1)

test_meta_data = pd.concat([ada_test_meta, bag_test_meta, gbc_test_meta, 
                            rf_test_meta, log_test_meta, rid_test_meta,
                            sgd_test_meta, gnb_test_meta, knn_test_meta],axis = 1)

In [393]:
features = pd.concat([B_meta_data.copy(), labels],axis = 1)
test_features = test_meta_data.copy()

In [388]:
features = pd.get_dummies(features)
test_features = pd.get_dummies(test_features)
        
# Align the dataframes by the columns
features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
# No categorical indices to record
cat_indices = 'auto'
      
print('Training Data Shape: ', features.shape)
print('Testing Data Shape: ', test_features.shape)
    
# Extract feature names
feature_names = list(features.columns)
    
# Convert to np arrays
features = np.array(features)
test_features = np.array(test_features)
    
# Create the kfold object
k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
# Empty array for feature importances
feature_importance_values = np.zeros(len(feature_names))
    
# Empty array for test predictions
test_predictions = np.zeros(test_features.shape[0])
    
# Empty array for out of fold validation predictions
out_of_fold = np.zeros(features.shape[0])
    
# Lists for recording validation and training scores
valid_scores = []
train_scores = []
    
# Iterate through each fold
for train_indices, valid_indices in k_fold.split(features):
        
    # Training data for the fold
    train_features, train_labels = features[train_indices], labels[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
    # Create the model
    model = lgb.LGBMClassifier(n_estimators=1000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.001, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
    # Train the model
    model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
    # Record the best iteration
    best_iteration = model.best_iteration_
        
    # Record the feature importances
    feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
    # Make predictions
    test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
    # Record the out of fold predictions
    out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
    # Record the best score
    valid_score = model.best_score_['valid']['auc']
    train_score = model.best_score_['train']['auc']
        
    valid_scores.append(valid_score)
    train_scores.append(train_score)
        
    # Clean up memory
    gc.enable()
    del model, train_features, valid_features
    gc.collect()
        
# Make the submission dataframe
submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
# Make the feature importance dataframe
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
# Overall validation score
valid_auc = roc_auc_score(labels, out_of_fold)
    
# Add the overall scores to the metrics
valid_scores.append(valid_auc)
train_scores.append(np.mean(train_scores))
    
# Needed for creating dataframe of validation scores
fold_names = list(range(n_folds))
fold_names.append('overall')
    
# Dataframe of validation scores
metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
#return submission, feature_importances, metrics

Training Data Shape:  (92253, 9)
Testing Data Shape:  (48744, 9)


KeyError: '[    0     1     2 ... 92249 92250 92252] not in index'

In [396]:
# thank you Will Koehrsen for an amazing kernel / this method!

def model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    #train_ids = features['SK_ID_CURR']
    #test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    #features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    features = features.drop(columns = ['TARGET'])
    #test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=1000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.01, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    #submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    submission = pd.DataFrame({'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [397]:
submission, fi, metrics = model(features, test_features)
print('Baseline metrics')
print(metrics)

Training Data Shape:  (92253, 9)
Testing Data Shape:  (48744, 9)
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[67]	valid's auc: 0.526473	train's auc: 0.520415
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[98]	valid's auc: 0.51951	train's auc: 0.522189
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	valid's auc: 0.522222	train's auc: 0.521511
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	valid's auc: 0.51921	train's auc: 0.522252
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[68]	valid's auc: 0.520697	train's auc: 0.521894
Baseline metrics
      fold     train     valid
0        0  0.520415  0.526473
1        1  0.522189  0.519510
2        2  0.521511  0.522222
3        3  0.522252  0.519210
4        4  0.521894  0.520697
5

In [398]:
submission.to_csv('credit_meta_baseline.csv', index = False)

In [None]:
#metamodel - log
log_meta = LogisticRegression()
log_meta.fit(B_meta, B_labels)
#find validation score of baseline meta classifier with C_meta

#gridsearch, randomsearch on log_meta with C_meta, C labels



In [None]:
#fit meta models to A,B,C combined to train data again, predict on test

#### Plan:
##### 1. Break down training into A,B,C
##### 2. Build numerous various models (fitted to A) and form predictions for B,C.test
##### 3. Build a meta_model (lightgbm) that trains on B_meta and B_true_label
##### 4. Validate model on C_meta and C_true_label
##### 5. Make predictions on test with meta_model via test_meta