In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf 
from os import listdir
# for display control
from IPython.display import display
# Gradient boosting using LightBGM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Use LabelEncoder that seems to yield better result
from sklearn.preprocessing import LabelEncoder
# For parallel multi-threading
from multiprocessing import Pool, cpu_count, Array
# Garbage collection
import gc
gc.enable()
# Lock pseudo-number seed
randSeed = 1
np.random.seed(randSeed)

In [2]:
# Global verbose control
PREVIEW_DATASET = 0
ADD_STATS_FEATURES = 1

In [3]:
# Utility function to transform all catagorical fields using one hot ending
def oneHotEncoding(df):
    # Get list categorical features
    catFeatures = [col for col in df.columns if df[col].dtype == 'object']
    # Convert to one hot encoding
    ohe = pd.get_dummies(df, columns=catFeatures)
    return ohe

# Utility function to encode catagorical columns using LabelEncoder
le = LabelEncoder()
def encodeLabel(field):
    return le.fit_transform(field.astype(str))

# Utility function to add descriptive statistics as secondary fields in the dataframe
def addStatsFields(df, field):
    df['MEAN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .mean()[field]
    )
    df['MEDIAN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .median()[field]
    )
    df['MAX_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .max()[field]
    )
    df['MIN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .min()[field]
    )
    df['SUM_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .sum()[field]
    )
    df['VAR_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .var()[field]
    )
    df['CNT_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .count()[field]
    )
    # Drop current field
    df.drop(
        [field], 
        axis = 1, 
        inplace = True
    )
    return df

In [5]:
# Load all data
dataPath = '/media/ephemeral0/data/home-credit-default-risk/'
dataFiles = listdir(f"{dataPath}")
for filename in dataFiles:
    print(f'loading {filename} ...')
    if '.csv.zip' in filename:
        # compressed data file
        locals()[filename.rstrip('.csv.zip')] = pd.read_csv(
            f'{dataPath}/{filename}',
            compression='zip', 
            header=0, 
            sep=',', 
            quotechar='"'
        )

# Get output label and remove it from feature list
dataTrain = application_train
dataTest = application_test
y = dataTrain['TARGET']

# Transform using One Hot Encoding 
# (using only the training dataset features as reference)
catFeatures = [
    col 
    for col in dataTrain.columns 
    if dataTrain[col].dtype == 'object'
]
ohe = pd.concat([dataTrain,dataTest], sort=False)
ohe = pd.get_dummies(ohe, columns = catFeatures)
dataTrain = ohe.iloc[:dataTrain.shape[0],:]
dataTest = ohe.iloc[dataTrain.shape[0]:,]
del dataTrain['TARGET']
del dataTest['TARGET']

# Summarize dataset
featureCnt = len(dataTrain.keys()) - 1
numSamples = len(dataTrain)
print(f'Training dataset has {numSamples} samples, and {featureCnt} features')
featureCnt = len(dataTest.keys()) - 1
numSamples = len(dataTest)
print(f'Testing dataset has {numSamples} samples, and {featureCnt} features')

print('done')

loading bureau_balance.csv.zip ...
loading HomeCredit_columns_description.csv ...
loading installments_payments.csv.zip ...
loading previous_application.csv.zip ...
loading application_test.csv.zip ...
loading POS_CASH_balance.csv.zip ...
loading credit_card_balance.csv.zip ...
loading application_train.csv.zip ...
loading bureau.csv.zip ...
loading sample_submission.csv.zip ...
Training dataset has 307511 samples, and 244 features
Testing dataset has 48744 samples, and 244 features
done


In [6]:
## Preprocess bureau datasets
if PREVIEW_DATASET:
    print('Raw bureau_balance dataset')
    display(bureau_balance.head(5))

# Count by status
bureauBalance = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .STATUS
        .value_counts(normalize = False)
        .unstack('STATUS')
)
# Rename columns to avoid conflict
renameDict = {}
for col in bureauBalance.columns:
    renameDict[col] = 'STATUS_' + col
bureauBalance.rename(columns = renameDict)

# Add months balance data as new features
bureauBalance['MONTHS_COUNT'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU') 
        .MONTHS_BALANCE          
        .size()
)
bureauBalance['MONTHS_MAX'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .max()
)
bureauBalance['MONTHS_MIN'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .min()
)
if PREVIEW_DATASET:
    print('Formatted')
    display(bureauBalance.head(5))

# Finally, merge the two bureau table together 
bureauData = bureau.join(bureauBalance, how='left', on='SK_ID_BUREAU')

# Transform features
bureauData = oneHotEncoding(bureauData).groupby('SK_ID_CURR').mean()
bureauData['CNT_BURO'] = (
    bureau[['SK_ID_BUREAU', 'SK_ID_CURR']]
        .groupby('SK_ID_CURR')
        .count()['SK_ID_BUREAU']
)
del bureauData['SK_ID_BUREAU']
if PREVIEW_DATASET:
    print('Merged and transformed')
    display(bureauData.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = bureauData.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = bureauData.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'bureau datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del bureauBalance
del bureauData
del bureau_balance
del bureau
gc.collect();

bureau datasets processed, 47 new features added


In [7]:
## Preproces previous_application
# Transform with one hot encoding
prevApplication = oneHotEncoding(previous_application)

# Compute number of previous applications by counting SK_ID_PREV
prevApplicationCnt = (
    prevApplication[['SK_ID_CURR', 'SK_ID_PREV']]
        .groupby('SK_ID_CURR')
        .count()
        .SK_ID_PREV
)
# Group by mean
prevApplication = prevApplication.groupby('SK_ID_CURR').mean()
# Add back the number of previous applications
prevApplication['CNT_PREV_APPLICATION'] = prevApplicationCnt
# Remove the SK_ID_PREV feature because the average of it is meaningless
del prevApplication['SK_ID_PREV']

# Display merged dataset
if PREVIEW_DATASET:
    print('Merged dataset')
    display(prevApplication.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = prevApplication.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = prevApplication.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'previous_application datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del prevApplicationCnt
del prevApplication
del previous_application
gc.collect();

previous_application datasets processed, 163 new features added


In [8]:
## Preproces POS_CASH_balance
# Encode using LabelEncoder
posCashBal = POS_CASH_balance
posCashBal.NAME_CONTRACT_STATUS = encodeLabel(POS_CASH_balance.NAME_CONTRACT_STATUS)
posCashBal['CNT_UNIQUE_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .nunique()
        .NAME_CONTRACT_STATUS
)
posCashBal['MAX_UNIQUE_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .max()
        .NAME_CONTRACT_STATUS
)

# Add some secondary features
if ADD_STATS_FEATURES:
    posCashBal = addStatsFields(posCashBal, 'SK_DPD')
    posCashBal = addStatsFields(posCashBal, 'SK_DPD_DEF')
    posCashBal = addStatsFields(posCashBal, 'CNT_INSTALMENT_FUTURE')
    posCashBal = addStatsFields(posCashBal, 'CNT_INSTALMENT')

# Group by UserID
#del posCashBal['SK_ID_PREV']
posCashBal.drop(
    ['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], 
    axis = 1, 
    inplace = True
)
posCashBal = posCashBal.groupby('SK_ID_CURR').mean()

# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
    display(posCashBal.head())
    
# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = posCashBal.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = posCashBal.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'POS_CASH_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del posCashBal
del POS_CASH_balance
gc.collect();

POS_CASH_balance dataset processed, 31 new features added


In [9]:
## Preproces credit_card_balance
# Transform with one hot encoding
# Rename fields that have already used in POS_CASH_balance
creditCardBal = credit_card_balance.rename(
    index = str, 
    columns = {
        'NAME_CONTRACT_STATUS' : 'NAME_CONTRACT_STATUS_CREDIT',
        'SK_DPD' : 'SK_DPD_CREDIT',
        'SK_DPD_DEF' : 'SK_DPD_DEF_CREDIT'
    }
)
#creditCardBal = oneHotEncoding(creditCardBal)
creditCardBal.NAME_CONTRACT_STATUS_CREDIT = encodeLabel(
    creditCardBal.NAME_CONTRACT_STATUS_CREDIT
)
creditCardBal['CNT_UNIQUE_STATUS_CREDIT'] = (
    creditCardBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS_CREDIT']]
        .groupby('SK_ID_CURR')
        .nunique()
        .NAME_CONTRACT_STATUS_CREDIT
)
creditCardBal['MAX_UNIQUE_STATUS_CREDIT'] = (
    creditCardBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS_CREDIT']]
        .groupby('SK_ID_CURR')
        .max()
        .NAME_CONTRACT_STATUS_CREDIT
)
# Add some secondary features
if ADD_STATS_FEATURES:
    creditCardBal = addStatsFields(creditCardBal, 'AMT_BALANCE')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_CREDIT_LIMIT_ACTUAL')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_ATM_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_OTHER_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_POS_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_INST_MIN_REGULARITY')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_PAYMENT_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_PAYMENT_TOTAL_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_RECEIVABLE_PRINCIPAL')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_RECIVABLE')
    creditCardBal = addStatsFields(creditCardBal, 'AMT_TOTAL_RECEIVABLE')
    creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_ATM_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_OTHER_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_POS_CURRENT')
    creditCardBal = addStatsFields(creditCardBal, 'CNT_INSTALMENT_MATURE_CUM')
    creditCardBal = addStatsFields(creditCardBal, 'SK_DPD_CREDIT')
    creditCardBal = addStatsFields(creditCardBal, 'SK_DPD_DEF_CREDIT')

# Group by ID
creditCardBal.drop(
    ['SK_ID_PREV', 'NAME_CONTRACT_STATUS_CREDIT'], 
    axis = 1, 
    inplace = True
)
creditCardBal = creditCardBal.groupby('SK_ID_CURR').mean()

# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
    display(creditCardBal.head())
    
# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = creditCardBal.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = creditCardBal.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'credit_card_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del creditCardBal
del credit_card_balance
gc.collect();

credit_card_balance dataset processed, 136 new features added


In [11]:
colDictMean = {}
colDictMin = {}
colDictMax = {}
colDictMedian = {}
colDictVar = {}
for col in installments_payment.columns:
    if col not in ['SK_ID_CURR', 'SK_ID_PREV']:
        colDictMean[col] = 'MEAN_' + col
        colDictMin[col] = 'MIN_' + col
        colDictMax[col] = 'MAX_' + col
        colDictMedian[col] = 'MEDIAN_' + col
        colDictVar[col] = 'VAR_' + col

# Add mean
installPayment = installments_payment.groupby('SK_ID_CURR').mean()
del installPayment['SK_ID_PREV']
installPayment = installPayment.rename(
    columns = colDictMean
)
# Add min
installPaymentMin = installments_payment.groupby('SK_ID_CURR').min()
del installPaymentMin['SK_ID_PREV']
installPaymentMin = installPaymentMin.rename(
    columns = colDictMin
)
installPayment = installPayment.merge(
    right = installPaymentMin,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMin
# Add max
installPaymentMax = installments_payment.groupby('SK_ID_CURR').max()
del installPaymentMax['SK_ID_PREV']
installPaymentMax = installPaymentMax.rename(
    columns = colDictMax
)
installPayment = installPayment.merge(
    right = installPaymentMax,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMax
# Add median
installPaymentMedian = installments_payment.groupby('SK_ID_CURR').median()
del installPaymentMedian['SK_ID_PREV']
installPaymentMedian = installPaymentMedian.rename(
    columns = colDictMedian
)
installPayment = installPayment.merge(
    right = installPaymentMedian,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMedian
# Add variance
installPaymentVar = installments_payment.groupby('SK_ID_CURR').var()
del installPaymentVar['SK_ID_PREV']
installPaymentVar = installPaymentVar.rename(
    columns = colDictVar
)
installPayment = installPayment.merge(
    right = installPaymentVar,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentVar

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = installPayment.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = installPayment.reset_index(),
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'installments_payment dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del installPayment
del installments_payment
gc.collect();

installments_payment dataset processed, 30 new features added


In [12]:
## Final processing
# Remove features too many missing values
dataTestFinal  = dataTest[dataTest.columns[dataTrain.isnull().mean() < 0.85]]
dataTrainFinal = dataTrain[dataTrain.columns[dataTrain.isnull().mean() < 0.85]]

# Delete SK_ID_CURR field (not a feature) 
del dataTestFinal['SK_ID_CURR']
del dataTrainFinal['SK_ID_CURR']

# Summarize dataset
featureCnt = len(dataTrainFinal.keys()) - 1
numSamples = len(dataTrainFinal)
print(f'Training dataset has {numSamples} samples, and {featureCnt} features')
featureCnt = len(dataTestFinal.keys()) - 1
numSamples = len(dataTestFinal)
print(f'Testing dataset has {numSamples} samples, and {featureCnt} features')

Training dataset has 307511 samples, and 508 features
Testing dataset has 48744 samples, and 508 features


In [104]:
# Split training data randomly using train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    dataTrainFinal, y, 
    test_size = 0.15,
    random_state = randSeed,
    shuffle = True
)

# Create lgb dataset
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)

# Free up memory
gc.collect();

### Parameter Tuning and Performance References
1. [LightGBM - Parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst)
2. [LightGBM - 
Parallel Learning Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst)
3. [LightGBM - Parameters Tuning Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst)
4. [Kaggle - GBM vs. XGBoost vs. LightGBM](https://www.kaggle.com/nschneider/gbm-vs-xgboost-vs-lightgbm)

In [105]:
params = {
    'task': 'train',
    'device' : 'cpu',
    'nthread': 8,            # [CPU] number of OpenMP threads
    'tree_learner' : 'feature',
    'gpu_use_dp' : 'false',  # [GPU] set to 1 to enable 64bit float point
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 64,
    'metric': 'auc',
    'reg_alpha': 2,
    'reg_lambda': 4,
    'learning_rate': 0.05,
    'max_bin': 384,
    'max_depth' : 11,
    'min_split_gain': 0.5,
    'min_child_weight': 1e-0,
    'min_child_samples': 2,
    'subsample_for_bin': 200000,
    'subsample': 1,
    'subsample_freq': 2,
    'colsample_bytree': 0.8,
    'verbose': 0
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round = 5000,
    valid_sets = lgb_test,
    early_stopping_rounds = 50,
    verbose_eval = 25
)

Training until validation scores don't improve for 50 rounds.
[25]	valid_0's auc: 0.747366
[50]	valid_0's auc: 0.757548
[75]	valid_0's auc: 0.768338
[100]	valid_0's auc: 0.777192
[125]	valid_0's auc: 0.782582
[150]	valid_0's auc: 0.785385
[175]	valid_0's auc: 0.78696
[200]	valid_0's auc: 0.788181
[225]	valid_0's auc: 0.789046
[250]	valid_0's auc: 0.789768
[275]	valid_0's auc: 0.790552
[300]	valid_0's auc: 0.790796
[325]	valid_0's auc: 0.790991
[350]	valid_0's auc: 0.79109
[375]	valid_0's auc: 0.79094
[400]	valid_0's auc: 0.791312
[425]	valid_0's auc: 0.791466
[450]	valid_0's auc: 0.791721
[475]	valid_0's auc: 0.791994
[500]	valid_0's auc: 0.792069
[525]	valid_0's auc: 0.792007
[550]	valid_0's auc: 0.791727
Early stopping, best iteration is:
[507]	valid_0's auc: 0.792155


In [None]:
# Predict and save to csv
predResult = gbm.predict(dataTestFinal)
submissionDataset = sample_submission
submissionDataset.TARGET = predResult
submissionDataset.to_csv('./lowpass_submission_v2.csv', index = False)

## Research zone for parameter optimization

### Attempt 0 - Cross-validation using AUC

In [118]:
gc.enable()
gc.collect()
nCVSamps = 24
nThread = min(nCVSamps, 6) # cpu_count()
# Cross validation
def crossValidate(seed, cvData, idx):
    # Generate new set of test dataset
    _, x_test, _, y_test = train_test_split(
        dataTrainFinal, y, 
        test_size = 0.50,
        random_state = seed,
        shuffle = True
    )
    predResult = gbm.predict(x_test)
    cvData[idx] = roc_auc_score(y_true=y_test, y_score=predResult)

# The following code submits parallel jobs using multiprocessing 
# to speed up the execution
cvData = Array('f', np.empty([nCVSamps]))
pool = Pool(processes=nThread)
jobs = []
for it in range(nCVSamps):
    if len(jobs) == nThread:
        print("All thread used, waiting until finish... ...")
        # Wait until all the active thread to finish
        for job in jobs:
            job.join()
        print("Simulation Batch done!")
        # Clear the pool
        jobs = []

    # Start a new job thread
    p = pool.Process(target=crossValidate, args=(it, cvData, it))
    p.start()
    jobs.append(p)

    print("Starting new simulation thread ({0:d}/{1:d}), progress {2:.2f}% ({3:d}/{4:d})".format(
        len(jobs), nThread,
        100.0*(it+1)/nCVSamps, it+1, nCVSamps))

print("Waiting for remaining jobs to finish... ...")
# Wait until all the active thread to finish
for job in jobs:
    job.join()
# Retreve data from shared memory
cvData = np.array(cvData[:])
# Clean up
pool.close()
gc.collect()
print("Simulation done!")

Starting new simulation thread (1/6), progress 4.17% (1/24)
Starting new simulation thread (2/6), progress 8.33% (2/24)
Starting new simulation thread (3/6), progress 12.50% (3/24)
Starting new simulation thread (4/6), progress 16.67% (4/24)
Starting new simulation thread (5/6), progress 20.83% (5/24)
Starting new simulation thread (6/6), progress 25.00% (6/24)
All thread used, waiting until finish... ...
Simulation Batch done!
Starting new simulation thread (1/6), progress 29.17% (7/24)
Starting new simulation thread (2/6), progress 33.33% (8/24)
Starting new simulation thread (3/6), progress 37.50% (9/24)
Starting new simulation thread (4/6), progress 41.67% (10/24)
Starting new simulation thread (5/6), progress 45.83% (11/24)
Starting new simulation thread (6/6), progress 50.00% (12/24)
All thread used, waiting until finish... ...
Simulation Batch done!
Starting new simulation thread (1/6), progress 54.17% (13/24)
Starting new simulation thread (2/6), progress 58.33% (14/24)
Startin

In [119]:
trace1 = go.Scatter(
    y = cvData,
    mode = 'lines+markers',
    name = 'lines+markers'
)
layout = dict(
    title = 'ROC AUC value by rand number seed',
    xaxis = dict(title = 'Random number seed'),
    yaxis = dict(title = 'AUC')
)
py.iplot(dict(data=[trace1], layout=layout))

### Attempt 1 - Grid search with cross-validation using `GridSearchCV`

#### TODO
- Use `GridSearchCV` to do some simple parameter tuning


#### Reference
1. [Scikit Learn Doc - GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)
2. [Example kernal 1 found on Kaggle](https://www.kaggle.com/garethjns/microsoft-lightgbm-with-parameter-tuning-0-823/code)