In [None]:
#Save "SageMaker ModelTraining.py" as ModelTraining.py (or adjust training_job_entry_point below) in the same folder as this notebook
#Save "SageMaker Prediction.py" as inference.py in the same folder as this notebook. The name inference.py is mandatory and cannot be changed

In [1]:
%%time
UseSavedIfExists = False
#Provide the model files in S3 (models_data below) if UseSavedIfExists is True
#There is a way to check whether deployed models exist, but
#it feels safer to check that the actual model file exists,
#and I did not find a way to get the model reference from the list
#client = boto3.client('sagemaker') -> list_models() -> get_model???
models_data = list()
#local folder where the training and testing data files are located
#NOTE(review): absolute, user-specific path - adjust before running on another machine
DataDir = '/home/kate/Research/Property/Data/'
#Temporary local folder to save files before moving them to S3
TmpDir = '/home/kate/Research/Property/Notebooks/SageMaker/tmp/'
#training and testing data file names (the '.csv' extension is added where the files are read)
training_dataset_name='property_wcf_class_training_basemodel0'
testing_dataset_name='property_wcf_testing'
#file name templates for each fold to be saved in S3 (the fold number is appended)
training_dataset_fold_filename='training_dataset_fold'
#NOTE: 'validateion' is a typo, but the variable is referenced under this exact name by the fold-preparation cell
validateion_dataset_fold_filename='validation_dataset_fold'
#model target column
target_column='hasclaim'
#model featureset (the column order is the order written to the csv files)
featureset  = [
'roofcd_encd',
'sqft',  
'usagetype_encd',
'yearbuilt',
'water_risk_3_blk',
'landlordind',
'multipolicyind'  
]
#model hyperparameters, passed unchanged to the estimator of every fold
hyperparameters = {
'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'silent': True,
        'booster': 'gbtree',
        'seed': 42,
        'scale_pos_weight':0.3,
        'colsample_bylevel': 0.8,
        'colsample_bytree': 0.8,
        'eta': 0.01,
        'max_depth': 6,
        'num_round':'5000'
}
#S3 bucket and folders to save files
bucket = 'sagemaker-wc-class'
#folder in the bucket where the datasets and the fold files are saved (note the trailing slash)
data_folder_in_bucket = 'data/folds/'
#prefix of the folders in the bucket where models are saved ('_<fold number>' is appended)
model_folder_in_bucket = 'model/fold'
#content type of the training and validation files
content_type='text/csv'
#Script to be loaded in the training instance
training_job_entry_point='ModelTraining.py'
#Training job name prefix (fold number and timestamp are appended).
#It can be changed, but there is a mandatory pattern for job names
training_job_name = 'basemodel0-class-XGB'
#instance type to be created for training and transformation jobs. 
#They can be different types if needed
instance_type='ml.c5.xlarge'
#Transformation jobs
#A model is created for each fold; model_name is just a template, the fold number is appended
model_name='property-wcf-class-basemodel0'
#Script to be loaded in the transformation (prediction) instance
transformation_job_entry_point='inference.py'
#same folder as data_folder_in_bucket, but without the trailing slash
transformation_input_folder = 'data/folds'
transformation_output_folder = 'output'
#output prefix of the transformation jobs (the fold number is appended)
s3_batch_output = 's3://%s/%s/%s_fold_'%(bucket,transformation_output_folder,model_name)
#whole training and testing datasets used as transformation job input
s3_batch_input_training = 's3://%s/%s/%s.csv'%(bucket,transformation_input_folder,training_dataset_name)
s3_batch_input_testing = 's3://%s/%s/%s.csv'%(bucket,transformation_input_folder,testing_dataset_name)   
#names of the columns with predicted data to be added to the datasets:
#the average over all folds and the per-fold prediction (the fold number is appended)
prediction_column_cv='sm_basemodel0_class_xgb_cv'
prediction_column_fold = 'sm_basemodel0_class_xgb_fold'

CPU times: user 29 µs, sys: 0 ns, total: 29 µs
Wall time: 37.2 µs


In [2]:
%%time
#packages used in the notebook
import time
import sys
import boto3
import re
import sagemaker
from sagemaker.session import s3_input
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.xgboost.model import XGBoostModel
import pandas as pd
import numpy as np
import s3fs

CPU times: user 482 ms, sys: 36.2 ms, total: 518 ms
Wall time: 517 ms


In [28]:
%%time
#Run this cell before any cell that talks to AWS
#The role ARN has to be given explicitly when the notebook is run from a local machine
#NOTE(review): the value below looks truncated/redacted - presumably a full role ARN is required; confirm
role = 'arn:aws:iam::'
#One boto3 session provides both the region name and the SageMaker client
aws_session = boto3.Session()
region = aws_session.region_name
smclient = aws_session.client('sagemaker')
#File-system style access to S3 (existence checks)
s3 = s3fs.S3FileSystem()

CPU times: user 34.1 ms, sys: 4.06 ms, total: 38.2 ms
Wall time: 37.4 ms


In [4]:
%%time
#To compare training in local mode with training in SageMaker,
#the same fold split of the training data has to be used in both modes
#The local training script loops through the training dataset in memory
#SageMaker works only with data in an S3 bucket
#Each training data fold is saved as a file in the bucket
#Only the required columns are saved (target and featureset for training, featureset only for prediction)
#No header

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs


In [5]:
%%time
#Read the training and the testing datasets from the local data folder.
#Malformed lines are skipped (error_bad_lines=False) and no column is used as the index.
#NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0 - confirm the pandas version in use
training_dataset, testing_dataset = (
    pd.read_csv('%s%s.csv'%(DataDir,dataset_name), error_bad_lines=False, index_col=False)
    for dataset_name in (training_dataset_name, testing_dataset_name)
)

CPU times: user 3.69 s, sys: 304 ms, total: 3.99 s
Wall time: 3.99 s


In [6]:
%%time
#Upload the training data to the S3 bucket: featureset columns only, no header, no index.
#The dataset as a whole is needed for prediction and for calculating the model metrics.
training_s3_path = 's3://%s/%s%s.csv'%(bucket,data_folder_in_bucket,training_dataset_name)
#`and` short-circuits, so S3 is queried only when reuse of a saved file was requested
if UseSavedIfExists and s3.exists(training_s3_path):
    print('%s.csv exists in S3'%training_dataset_name)
else:
    #the file is (re)uploaded either because it is missing or because UseSavedIfExists is False
    print('Uploading %s.csv to S3...'%training_dataset_name)
    training_dataset[featureset].to_csv(training_s3_path,header=False,index=False)

property_wcf_class_training_basemodel0.csv does not exist in S3. Loading...
CPU times: user 1.29 s, sys: 17.1 ms, total: 1.31 s
Wall time: 9.34 s


In [7]:
%%time
#Upload the testing data to the S3 bucket: featureset columns only, no header, no index.
#The dataset is needed for prediction and for calculating the model metrics.
testing_s3_path = 's3://%s/%s%s.csv'%(bucket,data_folder_in_bucket,testing_dataset_name)
#`and` short-circuits, so S3 is queried only when reuse of a saved file was requested
if UseSavedIfExists and s3.exists(testing_s3_path):
    print('%s.csv exists in S3'%testing_dataset_name)
else:
    #the file is (re)uploaded either because it is missing or because UseSavedIfExists is False
    print('Uploading %s.csv to S3...'%testing_dataset_name)
    testing_dataset[featureset].to_csv(testing_s3_path,header=False,index=False)

property_wcf_testing.csv does not exist in S3. Loading...
CPU times: user 1.21 s, sys: 16.5 ms, total: 1.23 s
Wall time: 9.83 s


In [8]:
%%time
#Split the training data into folds, upload each fold to the bucket and start the training jobs
#The script does not wait for a training job to finish:
#all training jobs run in parallel
#kfold is the number of folds; it is used by the training loop and when the predictions are averaged
kfold = 5

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


In [9]:
%%time
#For every fold: make sure the fold's training/validation files are in S3, then start a
#training job unless a saved model file is to be reused. Jobs are started with wait=False,
#so they run in parallel; training_jobs/estimators collect what was started in this run.
training_jobs = list()
estimators = list()
for i in range(0,kfold):
    print(' fold: {}  of  {} : '.format(i+1, kfold))
    #Preparing fold data in s3: rows with fold_<i> > 0 are the training part,
    #rows with fold_<i> == 0 are the validation part
    fold_parts = (
        (training_dataset_fold_filename, training_dataset['fold_%s'%i]>0),
        (validateion_dataset_fold_filename, training_dataset['fold_%s'%i]==0),
    )
    for fold_filename, fold_rows in fold_parts:
        fold_s3_path = 's3://%s/%s%s%s.csv'%(bucket,data_folder_in_bucket,fold_filename,i)
        #`and` short-circuits, so S3 is queried only when reuse of saved files was requested
        if UseSavedIfExists and s3.exists(fold_s3_path):
            print('%s%s.csv exists in S3.'%(fold_filename,i))
        else:
            #(re)uploaded either because it is missing or because UseSavedIfExists is False
            print('Uploading %s%s.csv to S3...'%(fold_filename,i))
            #target column first, then the featureset; no header, no index
            training_dataset[fold_rows][[target_column] + featureset].to_csv(fold_s3_path,header=False,index=False)
    #Estimator and training
    #A model file known for this fold (if any); None instead of a fake path, so S3 is never probed for it
    model_data = models_data[i] if i < len(models_data) else None
    if UseSavedIfExists and model_data is not None and s3.exists(model_data):
        print('%s model exists in S3'%model_data)
        continue
    xgb_script_mode_estimator_fold = XGBoost(
        entry_point=training_job_entry_point,
        hyperparameters=hyperparameters,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        framework_version="1.0-1",
        output_path='s3://%s/%s_%s'%(bucket, model_folder_in_bucket,i)
    )
    estimators.append(xgb_script_mode_estimator_fold)
    train_input = s3_input('https://s3-%s.amazonaws.com/%s/%s%s%s.csv'%(region, bucket,data_folder_in_bucket,training_dataset_fold_filename,i), content_type=content_type)
    validation_input = s3_input('https://s3-%s.amazonaws.com/%s/%s%s%s.csv'%(region, bucket,data_folder_in_bucket,validateion_dataset_fold_filename,i), content_type=content_type)
    #Training
    #The job name contains the fold number and a UTC timestamp
    job_name_fold=training_job_name +'-%s-'%i+ time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
    training_jobs.append(job_name_fold)
    #Path where the model file of this job is expected, recorded in models_data for the prediction cells
    model_data='s3://%s/%s_%s/%s/output/model.tar.gz'%(bucket, model_folder_in_bucket,i,job_name_fold)
    print('%s model does not exists. Training...'%model_data)
    if i < len(models_data):
        models_data[i]=model_data
    else:
        models_data.append(model_data)
    #wait=False: do not block, the next fold's job is started right away
    xgb_script_mode_estimator_fold.fit({'train': train_input, 'validation': validation_input}, job_name=job_name_fold, wait=False)

 fold: 1  of  5 : 
training_dataset_fold0.csv does not exist in S3. Loading...
validation_dataset_fold0.csv does not exist in S3. Loading...


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


s3://sagemaker-wc-class/model/fold_0/basemodel0-class-XGB-0-2020-10-29-22-41-41/output/model.tar.gz model does not exists. Training...
 fold: 2  of  5 : 
training_dataset_fold1.csv does not exist in S3. Loading...
validation_dataset_fold1.csv does not exist in S3. Loading...


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


s3://sagemaker-wc-class/model/fold_1/basemodel0-class-XGB-1-2020-10-29-22-41-54/output/model.tar.gz model does not exists. Training...
 fold: 3  of  5 : 
training_dataset_fold2.csv does not exist in S3. Loading...
validation_dataset_fold2.csv does not exist in S3. Loading...


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


s3://sagemaker-wc-class/model/fold_2/basemodel0-class-XGB-2-2020-10-29-22-42-07/output/model.tar.gz model does not exists. Training...
 fold: 4  of  5 : 
training_dataset_fold3.csv does not exist in S3. Loading...
validation_dataset_fold3.csv does not exist in S3. Loading...


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


s3://sagemaker-wc-class/model/fold_3/basemodel0-class-XGB-3-2020-10-29-22-42-19/output/model.tar.gz model does not exists. Training...
 fold: 5  of  5 : 
training_dataset_fold4.csv does not exist in S3. Loading...
validation_dataset_fold4.csv does not exist in S3. Loading...


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


s3://sagemaker-wc-class/model/fold_4/basemodel0-class-XGB-4-2020-10-29-22-42-31/output/model.tar.gz model does not exists. Training...
CPU times: user 7.72 s, sys: 126 ms, total: 7.84 s
Wall time: 1min


In [10]:
%%time
#Poll the training jobs started above until none of them is 'InProgress'
#If no jobs were started in this run, the waiting loop does not start at all
#Training should take 5-6 min; predictions could be started as soon as
#a particular training job is complete, but here we wait for all of them
check_every_sec=10
#statuses are printed on every print_every_n_output-th poll only
print_every_n_output=6
n = 0
#t counts the polls; despite its name, minutes_to_wait is the maximum number of polls
#(10 minutes of sleeping). If the jobs are still running after that, it's better to look into the logs
t = 0
minutes_to_wait=10*60/check_every_sec
EstimatorsFlg=len(estimators)>0
JobsFlg=len(training_jobs)>0
while EstimatorsFlg and JobsFlg:
    n += 1
    report_now = n==print_every_n_output
    statuses = [e.sagemaker_session.describe_training_job(j)['TrainingJobStatus'] for e,j in zip(estimators,training_jobs)]
    if report_now:
        for j,status in zip(training_jobs,statuses):
            print('Training job %s status: %s'%(j,status))
    if 'InProgress' not in statuses:
        #nothing is running any more: either everything completed or at least one job did not
        if set(statuses)=={'Completed'}:
            print('All Training Jobs are Completed')
        else:
            print('Something went wrong.')
        break
    if report_now:
        print('Continue waiting...')
        n = 0
    t += 1
    if t>minutes_to_wait:
        print('Something went wrong. Training jobs are still running.')
        break
    time.sleep(check_every_sec)

Training job basemodel0-class-XGB-0-2020-10-29-22-41-41 status: InProgress
Training job basemodel0-class-XGB-1-2020-10-29-22-41-54 status: InProgress
Training job basemodel0-class-XGB-2-2020-10-29-22-42-07 status: InProgress
Training job basemodel0-class-XGB-3-2020-10-29-22-42-19 status: InProgress
Training job basemodel0-class-XGB-4-2020-10-29-22-42-31 status: InProgress
Continue waiting...
Training job basemodel0-class-XGB-0-2020-10-29-22-41-41 status: InProgress
Training job basemodel0-class-XGB-1-2020-10-29-22-41-54 status: InProgress
Training job basemodel0-class-XGB-2-2020-10-29-22-42-07 status: InProgress
Training job basemodel0-class-XGB-3-2020-10-29-22-42-19 status: InProgress
Training job basemodel0-class-XGB-4-2020-10-29-22-42-31 status: InProgress
Continue waiting...
Training job basemodel0-class-XGB-0-2020-10-29-22-41-41 status: InProgress
Training job basemodel0-class-XGB-1-2020-10-29-22-41-54 status: InProgress
Training job basemodel0-class-XGB-2-2020-10-29-22-42-07 stat

In [11]:
%%time
#Evaluation metric to be used in tuning
from sklearn.metrics import roc_auc_score,confusion_matrix
#To estimate models performance we need a custom gini function
def gini(y, pred):
    """Return the (un-normalized) Gini score of predictions `pred` against actual values `y`.

    Rows are ordered by prediction, highest first; rows with equal predictions keep
    their original order. The score is derived from the cumulative sum of the actual
    values in that order. Divide by gini(y, y) to get the normalized Gini.

    y    - sequence of actual target values
    pred - sequence of predicted scores, same length as y

    There is no guard for a `y` that sums to zero (the division then yields nan/inf).
    """
    #columns: 0 - actual value, 1 - prediction, 2 - original row position
    #builtin float instead of np.float: the alias was removed in NumPy 1.24, the dtype is the same
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=float)
    #lexsort uses the last key as the primary one: prediction descending, then row position
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

CPU times: user 122 ms, sys: 0 ns, total: 122 ms
Wall time: 121 ms


In [12]:
%%time
#Prediction on the training dataset with a transformation job
#Only 5 transformation jobs can be started at a time,
#so there are separate loops for the training and the testing datasets
#XGBoostModel for prediction can be created directly from xgb_script_mode_estimator_fold,
#but models can be trained and predictions run in different runs:
#instead of training, already created model files can be used,
#so XGBoostModel is created from these files
#In that case the models_data list should be populated manually
#Of course, models can be created once and reused
print('Models to use in prediction:', models_data, sep='\n')
#The transformation steps are functions because the process is the same for the training and the testing datasets

Models to use in prediction:
['s3://sagemaker-wc-class/model/fold_0/basemodel0-class-XGB-0-2020-10-29-22-41-41/output/model.tar.gz', 's3://sagemaker-wc-class/model/fold_1/basemodel0-class-XGB-1-2020-10-29-22-41-54/output/model.tar.gz', 's3://sagemaker-wc-class/model/fold_2/basemodel0-class-XGB-2-2020-10-29-22-42-07/output/model.tar.gz', 's3://sagemaker-wc-class/model/fold_3/basemodel0-class-XGB-3-2020-10-29-22-42-19/output/model.tar.gz', 's3://sagemaker-wc-class/model/fold_4/basemodel0-class-XGB-4-2020-10-29-22-42-31/output/model.tar.gz']
CPU times: user 143 µs, sys: 0 ns, total: 143 µs
Wall time: 122 µs


In [29]:
%%time
#Build one XGBoostModel object per model file in models_data.
#The same models are used for prediction based on training and testing data
models = list()
for i, m in enumerate(models_data):
    #Try to delete the SageMaker model with this name (if any) and create a new model based on the model file
    name=model_name+'-%s'%i
    try:
        smclient.delete_model(ModelName=name)
        print('%s model was deleted'%name)
    except smclient.exceptions.ClientError as err:
        #Best effort: the service refused the deletion (e.g. there is no such model).
        #The real reason is reported; anything that is not a service error is not swallowed any more
        print('%s model was not deleted: %s'%(name, err))
    xgb_inference_model = XGBoostModel(
        name=name,
        model_data=m,
        role=role,
        entry_point=transformation_job_entry_point,
        framework_version="1.0-1",
    )
    models.append(xgb_inference_model)
    print('%s model was created'%name)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


property-wcf-class-basemodel0-0 model does not exist
property-wcf-class-basemodel0-0 model was created


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


property-wcf-class-basemodel0-1 model was deleted
property-wcf-class-basemodel0-1 model was created


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


property-wcf-class-basemodel0-2 model was deleted
property-wcf-class-basemodel0-2 model was created


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


property-wcf-class-basemodel0-3 model was deleted
property-wcf-class-basemodel0-3 model was created


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


property-wcf-class-basemodel0-4 model was deleted
property-wcf-class-basemodel0-4 model was created
CPU times: user 215 ms, sys: 20.2 ms, total: 235 ms
Wall time: 10.4 s


In [14]:
%%time
def RunModelsTransformJobs(s3_batch_input):
    """Start one transformation job per model in the notebook-level `models` list.

    s3_batch_input - S3 location of the csv data to predict on

    The output of the model at position k goes to s3_batch_output + str(k).
    The function does not wait for the jobs to finish.
    Returns a tuple (list of transformers, list of job names), both in `models` order.
    """
    started_transformers = []
    started_job_names = []
    for fold_number, fold_model in enumerate(models):
        #Create transform job
        fold_transformer = fold_model.transformer(
            instance_count=1,
            instance_type=instance_type,
            output_path=s3_batch_output+'%s'%fold_number,
            accept='text/csv',
            strategy='MultiRecord',
            assemble_with='Line'
        )
        started_transformers.append(fold_transformer)
        fold_transformer.transform(data=s3_batch_input, content_type='text/csv',split_type='Line')
        started_job_name = fold_transformer.latest_transform_job.name
        started_job_names.append(started_job_name)
        print('Job %s started'%started_job_name)
    return (started_transformers, started_job_names)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [15]:
%%time
def WaitForTransformJobs(tranformers, tranform_jobs, check_every_sec,print_every_n_output):
    """Poll the transformation jobs until none of them is 'InProgress'.

    tranformers          - transformers whose sagemaker_session is used to query the job status
    tranform_jobs        - transformation job names, in the same order as tranformers
    check_every_sec      - seconds to sleep between two polls
    print_every_n_output - the job statuses are printed on every n-th poll only

    Prints the outcome and returns None. Polling stops after more than
    6*60/check_every_sec polls (about six minutes of sleeping); if the jobs are
    still running by then, it's better to look into the logs.
    """
    max_polls = 6*60/check_every_sec
    polls_done = 0
    polls_since_report = 0
    while True:
        polls_since_report += 1
        report_now = polls_since_report==print_every_n_output
        statuses = []
        for transformer, job_name in zip(tranformers, tranform_jobs):
            status = transformer.sagemaker_session.describe_transform_job(job_name)['TransformJobStatus']
            if report_now:
                print('Transform job %s status: %s'%(job_name,status))
            statuses.append(status)
        if 'InProgress' not in statuses:
            #nothing is running any more: either everything completed or at least one job did not
            if set(statuses)=={'Completed'}:
                print('All Transform Jobs are Completed')
            else:
                print('Something went wrong.')
            return
        if report_now:
            print('Continue waiting...')
            polls_since_report = 0
        polls_done += 1
        if polls_done>max_polls:
            print('Something went wrong. Transform jobs are still running.')
            return
        time.sleep(check_every_sec)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [30]:
%%time
#Predict on the whole training dataset with every fold model, then wait for the jobs:
#status is checked every 10 seconds and printed on every 6th check
(training_tranformers, training_tranform_jobs) = RunModelsTransformJobs(s3_batch_input_training)
WaitForTransformJobs(
    training_tranformers,
    training_tranform_jobs,
    check_every_sec=10,
    print_every_n_output=6,
)

Job property-wcf-class-basemodel0-0-2020-10-29-23-03-44-991 started
Job property-wcf-class-basemodel0-1-2020-10-29-23-03-48-574 started
Job property-wcf-class-basemodel0-2-2020-10-29-23-03-50-197 started
Job property-wcf-class-basemodel0-3-2020-10-29-23-03-51-883 started
Job property-wcf-class-basemodel0-4-2020-10-29-23-03-53-886 started
Transform job property-wcf-class-basemodel0-0-2020-10-29-23-03-44-991 status: InProgress
Transform job property-wcf-class-basemodel0-1-2020-10-29-23-03-48-574 status: InProgress
Transform job property-wcf-class-basemodel0-2-2020-10-29-23-03-50-197 status: InProgress
Transform job property-wcf-class-basemodel0-3-2020-10-29-23-03-51-883 status: InProgress
Transform job property-wcf-class-basemodel0-4-2020-10-29-23-03-53-886 status: InProgress
Continue waiting...
Transform job property-wcf-class-basemodel0-0-2020-10-29-23-03-44-991 status: InProgress
Transform job property-wcf-class-basemodel0-1-2020-10-29-23-03-48-574 status: InProgress
Transform job pro

In [31]:
%%time
#Predict on the testing dataset with every fold model, then wait for the jobs:
#status is checked every 10 seconds and printed on every 6th check
(testing_tranformers, testing_tranform_jobs) = RunModelsTransformJobs(s3_batch_input_testing)
WaitForTransformJobs(
    testing_tranformers,
    testing_tranform_jobs,
    check_every_sec=10,
    print_every_n_output=6,
)

Using already existing model: property-wcf-class-basemodel0-0


Job property-wcf-class-basemodel0-0-2020-10-29-23-08-06-018 started


Using already existing model: property-wcf-class-basemodel0-1


Job property-wcf-class-basemodel0-1-2020-10-29-23-08-09-336 started


Using already existing model: property-wcf-class-basemodel0-2


Job property-wcf-class-basemodel0-2-2020-10-29-23-08-10-726 started


Using already existing model: property-wcf-class-basemodel0-3


Job property-wcf-class-basemodel0-3-2020-10-29-23-08-13-767 started


Using already existing model: property-wcf-class-basemodel0-4


Job property-wcf-class-basemodel0-4-2020-10-29-23-08-14-896 started
Transform job property-wcf-class-basemodel0-0-2020-10-29-23-08-06-018 status: InProgress
Transform job property-wcf-class-basemodel0-1-2020-10-29-23-08-09-336 status: InProgress
Transform job property-wcf-class-basemodel0-2-2020-10-29-23-08-10-726 status: InProgress
Transform job property-wcf-class-basemodel0-3-2020-10-29-23-08-13-767 status: InProgress
Transform job property-wcf-class-basemodel0-4-2020-10-29-23-08-14-896 status: InProgress
Continue waiting...
Transform job property-wcf-class-basemodel0-0-2020-10-29-23-08-06-018 status: InProgress
Transform job property-wcf-class-basemodel0-1-2020-10-29-23-08-09-336 status: InProgress
Transform job property-wcf-class-basemodel0-2-2020-10-29-23-08-10-726 status: InProgress
Transform job property-wcf-class-basemodel0-3-2020-10-29-23-08-13-767 status: InProgress
Transform job property-wcf-class-basemodel0-4-2020-10-29-23-08-14-896 status: InProgress
Continue waiting...
Tr

In [32]:
%%time
#Read the transformation job output of every fold model and add it to the datasets:
#one column per fold plus prediction_column_cv with the average over all folds
training_dataset[prediction_column_cv]=0
testing_dataset[prediction_column_cv]=0
for i in range(0,kfold):
    print('Reading predicted data fold: {}  of  {} : '.format(i+1, kfold))
    fold_column = '%s_%s'%(prediction_column_fold,i)
    #the training dataset first, then the testing dataset
    for dataset, dataset_name in ((training_dataset, training_dataset_name), (testing_dataset, testing_dataset_name)):
        sm_predicted_data= pd.read_csv('s3://%s/%s/%s_fold_%s/%s.csv.out'%(bucket,transformation_output_folder,model_name,i,dataset_name), names=[fold_column], error_bad_lines=False, index_col=False)
        #predictions are assigned by position (.values), not by index
        dataset[fold_column] = sm_predicted_data[fold_column].values
        #average: every fold contributes 1/kfold of its prediction
        dataset[prediction_column_cv]+=   dataset[fold_column]/(kfold)

Reading predicted data fold: 1  of  5 : 
Reading predicted data fold: 2  of  5 : 
Reading predicted data fold: 3  of  5 : 
Reading predicted data fold: 4  of  5 : 
Reading predicted data fold: 5  of  5 : 
CPU times: user 1.64 s, sys: 223 ms, total: 1.87 s
Wall time: 9.99 s


In [33]:
%%time
#Scores of the averaged (cv) prediction on the training and the testing datasets
#Each list gets a single value; the lists become the columns of the Scores table
train_actual = training_dataset[target_column]
train_predicted = training_dataset[prediction_column_cv]
test_actual = testing_dataset[target_column]
test_predicted = testing_dataset[prediction_column_cv]
#Gini is normalized by the gini of the target against itself
Train_Gini_l = [gini(train_actual,train_predicted)/gini(train_actual,train_actual)]
Test_Gini_l = [gini(test_actual,test_predicted)/gini(test_actual,test_actual)]
Train_ROC_l = [roc_auc_score(train_actual, train_predicted)]
Test_ROC_l = [roc_auc_score(test_actual, test_predicted)]

CPU times: user 196 ms, sys: 0 ns, total: 196 ms
Wall time: 195 ms


In [34]:
%%time
#One row per collected score set: Gini and ROC AUC for the training and the testing datasets
score_columns = ['Train_Gini', 'Test_Gini','Train_ROC_AUC', 'Test_ROC_AUC']
score_rows = list(zip(Train_Gini_l,Test_Gini_l,Train_ROC_l,Test_ROC_l))
Scores = pd.DataFrame(score_rows, columns=score_columns)
Scores

CPU times: user 628 µs, sys: 0 ns, total: 628 µs
Wall time: 580 µs


Unnamed: 0,Train_Gini,Test_Gini,Train_ROC_AUC,Test_ROC_AUC
0,0.431514,0.346827,0.715759,0.673413


In [35]:
%%time
#Saving new predictions locally: both datasets, now with the prediction columns,
#are written with a header and without the index.
#NOTE: the files are written to DataDir under the dataset names, i.e. the local data files are overwritten
for dataset, dataset_name in ((training_dataset, training_dataset_name), (testing_dataset, testing_dataset_name)):
    dataset.to_csv('%s%s.csv'%(DataDir,dataset_name),header=True,index=False)

CPU times: user 32.8 s, sys: 393 ms, total: 33.1 s
Wall time: 33.6 s
