In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from featuretools.primitives import AggregationPrimitive, make_agg_primitive
from datetime import datetime, timedelta
import re
import featuretools as ft

In [3]:
# Load the engineered feature matrix; its `set` column marks each row as 'train' or 'test'.
feature_spec = pd.read_csv(filepath_or_buffer = 'total_feature_matrix_spec.csv')

In [4]:
# Recover the train / test partitions from the `set` marker column.
train = feature_spec.loc[feature_spec['set'].eq('train')]
test = feature_spec.loc[feature_spec['set'].eq('test')]

In [5]:
# One-hot encode both partitions, then keep only the columns they share.
train, test = (pd.get_dummies(frame) for frame in (train, test))

# Hold on to the labels so TARGET can be restored after the column alignment.
train_labels = train['TARGET']
train, test = train.align(test, axis = 1, join = 'inner')
train['TARGET'] = train_labels

In [6]:
# Divide the training data into three parts: A (40%), B (42%) and C (18%).
from sklearn.model_selection import train_test_split

# First cut: 40% goes to A, the remaining 60% is split again below.
train_A, remainder = train_test_split(train, test_size = 0.6, random_state = 3)

# Second cut: 70% of the remainder becomes B, 30% becomes C.
train_B, train_C = train_test_split(remainder, test_size = 0.3, random_state = 3)
del remainder


In [7]:
# Sanity check: (rows, columns) of each partition, in the order A, B, C, test.
print(*(frame.shape for frame in (train_A, train_B, train_C, test)))

(123004, 1157) (129154, 1157) (55353, 1157) (48744, 1157)


In [8]:
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

In [9]:
# thank you Will Koehrsen for an amazing kernel / this method!

def model(features, test_features, encoding = 'ohe', n_folds = 5):

    """Train a LightGBM classifier with K-fold cross validation and
    predict on a second dataframe.

    One classifier is fit per fold. Test probabilities and feature
    importances are averaged over the folds.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the TARGET column.
            Rows are always selected by position, so the index does
            not have to be a 0..n-1 range.
        test_features (pd.DataFrame):
            dataframe of features to predict on. A TARGET column,
            if present, is ignored. The caller's frame is not modified.
        encoding (str, default = 'ohe'):
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding.
        n_folds (int, default = 5):
            number of folds to use for cross validation.

    Return
    --------
        submission (pd.DataFrame):
            dataframe with a single `TARGET` column holding the
            fold-averaged predicted probability for each row of
            `test_features`, in the same row order.
        feature_importances (pd.DataFrame):
            dataframe with the fold-averaged feature importances.
        metrics (pd.DataFrame):
            dataframe with training and validation ROC AUC for each
            fold plus an 'overall' row.

    Raises
    --------
        ValueError:
            if `encoding` is neither 'ohe' nor 'le', or if (with 'le')
            `test_features` lacks a training column or contains a
            category that was not seen in training.

    """

    # Labels as a plain array: KFold yields row positions, and indexing a
    # pandas Series with them would look up index labels instead.
    labels = np.asarray(features['TARGET'])

    # `drop` returns new frames, so the caller's dataframes stay untouched
    features = features.drop(columns = ['TARGET'])
    test_features = test_features.drop(columns = ['TARGET'], errors = 'ignore')

    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)

        # No categorical indices to record
        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':

        # Imported here because the notebook's import cells do not provide it
        from sklearn.preprocessing import LabelEncoder

        missing = [col for col in features.columns if col not in test_features.columns]
        if missing:
            raise ValueError('test_features is missing training columns: {}'.format(missing[:10]))

        # Same columns in the same order, so the positional categorical
        # indices recorded below are valid for both frames
        test_features = test_features[list(features.columns)].copy()

        # List for storing categorical indices
        cat_indices = []

        # Iterate through each column
        for i, col in enumerate(features.columns):
            if features[col].dtype == 'object':
                # Fit on the training values only; `transform` raises
                # ValueError for a test category unseen during fitting
                label_encoder = LabelEncoder()
                features[col] = label_encoder.fit_transform(features[col].astype(str))
                test_features[col] = label_encoder.transform(test_features[col].astype(str))

                # Record the categorical indices
                cat_indices.append(i)

    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions (each row is filled exactly once)
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model (named `clf` so it does not shadow this function)
        clf = lgb.LGBMClassifier(n_estimators=1000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.01,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)

        # Train the model.
        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords
        # of LightGBM < 4.0; newer releases expect callbacks instead.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe (predictions only, in test row order)
    submission = pd.DataFrame({'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds)) + ['overall']

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [10]:
# Cross-validate on train_A and score train_B with the fold-averaged model.
# The index is reset so that rows are numbered 0..n-1 inside `model`.
submission_B, fi, metrics = model(features = train_A.reset_index(drop = True),
                                  test_features = train_B.reset_index(drop = True))
print('Baseline metrics', metrics, sep = '\n')

Training Data Shape:  (123004, 1156)
Testing Data Shape:  (129154, 1156)
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.748836	train's auc: 0.79306
[400]	valid's auc: 0.762898	train's auc: 0.826754
[600]	valid's auc: 0.768636	train's auc: 0.849862
[800]	valid's auc: 0.770298	train's auc: 0.868858
[1000]	valid's auc: 0.770781	train's auc: 0.884638
Did not meet early stopping. Best iteration is:
[998]	valid's auc: 0.770821	train's auc: 0.884496
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.745249	train's auc: 0.794775
[400]	valid's auc: 0.75815	train's auc: 0.828156
[600]	valid's auc: 0.763797	train's auc: 0.850979
[800]	valid's auc: 0.766018	train's auc: 0.869352
Early stopping, best iteration is:
[858]	valid's auc: 0.766346	train's auc: 0.874016
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.75719	train's auc: 0.790891
[400]	valid's auc: 0.773682	train's auc: 0.824592
[600]	v

In [11]:
# Cross-validate on train_A and score train_C with the fold-averaged model.
# NOTE: this passes the same training frame as the train_B cell, so only
# the prediction set differs; `fi` and `metrics` are overwritten.
submission_C, fi, metrics = model(features = train_A.reset_index(drop = True),
                                  test_features = train_C.reset_index(drop = True))
print('Baseline metrics', metrics, sep = '\n')

Training Data Shape:  (123004, 1156)
Testing Data Shape:  (55353, 1156)
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.748836	train's auc: 0.79306
[400]	valid's auc: 0.762898	train's auc: 0.826754
[600]	valid's auc: 0.768636	train's auc: 0.849862
[800]	valid's auc: 0.770298	train's auc: 0.868858
[1000]	valid's auc: 0.770781	train's auc: 0.884638
Did not meet early stopping. Best iteration is:
[998]	valid's auc: 0.770821	train's auc: 0.884496
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.745249	train's auc: 0.794775
[400]	valid's auc: 0.75815	train's auc: 0.828156
[600]	valid's auc: 0.763797	train's auc: 0.850979
[800]	valid's auc: 0.766018	train's auc: 0.869352
Early stopping, best iteration is:
[858]	valid's auc: 0.766346	train's auc: 0.874016
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.75719	train's auc: 0.790891
[400]	valid's auc: 0.773682	train's auc: 0.824592
[600]	va

In [12]:
# Cross-validate on train_A and score the held-out test partition.
# `fi` and `metrics` from earlier `model` calls are overwritten here.
submission_test, fi, metrics = model(features = train_A.reset_index(drop = True),
                                     test_features = test)
print('Baseline metrics', metrics, sep = '\n')

Training Data Shape:  (123004, 1156)
Testing Data Shape:  (48744, 1156)
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.748836	train's auc: 0.79306
[400]	valid's auc: 0.762898	train's auc: 0.826754
[600]	valid's auc: 0.768636	train's auc: 0.849862
[800]	valid's auc: 0.770298	train's auc: 0.868858
[1000]	valid's auc: 0.770781	train's auc: 0.884638
Did not meet early stopping. Best iteration is:
[998]	valid's auc: 0.770821	train's auc: 0.884496
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.745249	train's auc: 0.794775
[400]	valid's auc: 0.75815	train's auc: 0.828156
[600]	valid's auc: 0.763797	train's auc: 0.850979
[800]	valid's auc: 0.766018	train's auc: 0.869352
Early stopping, best iteration is:
[858]	valid's auc: 0.766346	train's auc: 0.874016
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.75719	train's auc: 0.790891
[400]	valid's auc: 0.773682	train's auc: 0.824592
[600]	va

In [19]:
# Give the single prediction column of every frame the same name.
submission_B.columns = submission_C.columns = submission_test.columns = ['eng_gbm']

In [22]:
# Persist the predictions; the default row index is written as the first CSV column.
submission_B.to_csv(path_or_buf = 'eng_data_gbm_B.csv')
submission_C.to_csv(path_or_buf = 'eng_data_gbm_C.csv')
submission_test.to_csv(path_or_buf = 'eng_data_gbm_test.csv')