In [1]:
import argparse
import os
import lightgbm as lgb
import numpy as np
import pandas as pd
from pprint import pprint
from azureml.core import Run
import joblib
from sklearn.metrics import (confusion_matrix, f1_score, accuracy_score,
                             precision_score, recall_score)
from imblearn.metrics import geometric_mean_score
from mlpackage.utility import ml_model as mf


In [2]:
# Column names of the identifier and timestamp fields; both are used further down
# to pick the extra columns that accompany the predictions in the evaluation frame.
ID_COLUMN = 'id'
DATE_COLUMN = 'dateTime'
# NOTE(review): LABEL_COLUMN is not referenced in the visible cells; the label
# column is spelled 'Label' inline where it is needed.
LABEL_COLUMN = 'Label'

In [3]:
# Root folder for the default --input_dir / --output_dir locations.
# The original value is one developer's absolute path, which does not exist on other
# machines; set the ML_BASE_DIR environment variable to point somewhere else.
# The previous path is kept as the fallback so existing setups behave the same.
base_dir = os.environ.get('ML_BASE_DIR', '/Users/anders.swanson/temp/output')

In [4]:
# parameters
def _str2bool(text):
    """Parse a command-line boolean.

    argparse's `type=bool` calls bool() on the raw string, so any non-empty
    value (including 'False') becomes True. This converter maps the usual
    spellings explicitly and rejects anything else.
    """
    if isinstance(text, bool):
        return text
    lowered = text.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # the empty string stays False, as it was under bool('')
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(text))


parser = argparse.ArgumentParser()
# When run inside Jupyter this receives the kernel connection file path
# (see the 'file' entry in the printed args); it is not a model parameter.
parser.add_argument('-file')
# params: Estimator
parser.add_argument('--input_dir', dest="input_dir",
                    default=os.path.join(base_dir, 'split_data3'))
parser.add_argument('--output_dir', dest="output_dir",
                    default=os.path.join(base_dir, 'trained_model3'))
parser.add_argument('--data_name', dest="data_name", default="deal_data3")
parser.add_argument('--model_name', dest="model_name", default="deal_model3")

# params: LightGBM (from AutoML)
parser.add_argument('--boosting_type', type=str,
                    dest='boosting_type', default="gbdt")
parser.add_argument('--colsample_bytree', type=float,
                    dest='colsample_bytree', default=0.7922222222222222)
parser.add_argument('--importance_type', type=str, dest='importance_type', default='split')
parser.add_argument('--learning_rate', type=float,
                    dest='learning_rate', default=0.05263631578947369)
parser.add_argument('--max_bin', type=int, dest='max_bin', default=50)
parser.add_argument('--max_depth', type=int, dest='max_depth', default=8)
parser.add_argument('--min_child_samples', type=int,
                    dest='min_child_samples', default=351)
# float (was int): integer values given on the command line still parse
parser.add_argument('--min_child_weight', type=float,
                    dest='min_child_weight', default=2)
# float (was int): the default itself is fractional, and type=int rejected
# any fractional value supplied on the command line
parser.add_argument('--min_split_gain', type=float,
                    dest='min_split_gain', default=0.9473684210526315)
parser.add_argument('--n_estimators', type=int,
                    dest='n_estimators', default=800)
parser.add_argument('--n_jobs', type=int,
                    dest='n_jobs', default=1)
parser.add_argument('--num_leaves', type=int, dest='num_leaves', default=170)
parser.add_argument('--objective', type=str,
                    dest='objective', default=None)
parser.add_argument('--random_state', type=int,
                    dest='random_state', default=None)
parser.add_argument('--reg_alpha', type=float,
                    dest='reg_alpha', default=0.7894736842105263)
parser.add_argument('--reg_lambda', type=float,
                    dest='reg_lambda', default=0.05263157894736842)
# _str2bool (was type=bool): '--silent False' now really yields False
parser.add_argument('--silent', type=_str2bool,
                    dest='silent', default=True)
parser.add_argument('--subsample', type=float, dest='subsample', default=0.8910526315789474)
parser.add_argument('--subsample_for_bin', type=int,
                    dest='subsample_for_bin', default=200000)
parser.add_argument('--subsample_freq', type=int,
                    dest='subsample_freq', default=0)
parser.add_argument('--verbose', type=int,
                    dest='verbose', default=-10)

args = parser.parse_args()
print("all args: ")
pprint(vars(args))

# create training args dict: everything that is left after removing the
# estimator/bookkeeping arguments is forwarded to the LightGBM classifier.
params_dict = dict(**vars(args))
# 'file' is the Jupyter kernel connection file, not a LightGBM parameter,
# so it must not be forwarded to the classifier either.
est_params = ['input_dir', 'output_dir', 'model_name', 'stage', 'data_name', 'file']
for value in est_params:
    params_dict.pop(value, None)



all args: 
{'boosting_type': 'gbdt',
 'colsample_bytree': 0.7922222222222222,
 'data_name': 'deal_data3',
 'file': 'C:\\Users\\lincoln.rychecky\\AppData\\Roaming\\jupyter\\runtime\\kernel-a4f9508e-2c5d-43d3-8998-9a2cf01da365.json',
 'importance_type': 'split',
 'input_dir': '/Users/anders.swanson/temp/output\\split_data3',
 'learning_rate': 0.05263631578947369,
 'max_bin': 50,
 'max_depth': 8,
 'min_child_samples': 351,
 'min_child_weight': 2,
 'min_split_gain': 0.9473684210526315,
 'model_name': 'deal_model3',
 'n_estimators': 800,
 'n_jobs': 1,
 'num_leaves': 170,
 'objective': None,
 'output_dir': '/Users/anders.swanson/temp/output\\trained_model3',
 'random_state': None,
 'reg_alpha': 0.7894736842105263,
 'reg_lambda': 0.05263157894736842,
 'silent': True,
 'subsample': 0.8910526315789474,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'verbose': -10}


In [5]:


def f1_score_wrapper(y_true, y_pred):
    """
    *Custom evaluation metric: weighted f1-score of the scores cut at 0.5

    *Args:
        *y_true (Array): Actual labels
        *y_pred (Array): Predicted scores; values >= 0.5 count as class 1, everything else as class 0

    *Returns:
        *tuple: ('f1-weighted', score, 1) - metric name, weighted f1 value and a trailing flag of 1
    """
    hard_labels = [int(score >= 0.5) for score in y_pred]
    weighted_f1 = f1_score(y_true, hard_labels, average='weighted')
    return ('f1-weighted', weighted_f1, 1)


def calc_recall_rate(df, probability):
    """
    *Find the score cut-off that selects roughly `probability` of the rows and report the
    * positive rate among the rows at or above that cut-off

    *Args:
        *df (Dataframe): Must contain the columns 'Probability' and 'TMP_LABEL'
        *probability (Numeric): Fraction of the rows the cut-off should select

    *Returns:
        *rate: Share of the rows with Probability >= threshold whose TMP_LABEL is >= 1
        *threshold: The chosen cut-off (a multiple of 0.01 between 0 and 0.99)
    """
    scores = df['Probability']
    wanted_rows = len(df) * probability

    # Try the cut-offs 0.00, 0.01, ..., 0.99 and keep the first one whose number of
    # selected rows is closest to the wanted number of rows.
    gaps = [abs((scores >= cutoff).sum() - wanted_rows)
            for cutoff in np.arange(0, 1, 0.01)]
    threshold = np.argmin(gaps) / 100
    print('LIFT: for probability={} threshold is {}'.format(probability, threshold))

    selected = scores >= threshold
    positives = df['TMP_LABEL'] >= 1
    return (positives & selected).sum() / selected.sum(), threshold


def train_model(args_dict, data, early_stopping_rounds=None):
    """
    *Train a gradient boosted decision tree model

    *Args:
        *args_dict (dict): Keyword arguments forwarded to lgb.LGBMClassifier
        *data (dict-like): Must provide the entries 'X', 'y', 'x_valid' and 'y_valid'
        *early_stopping_rounds (Numeric): Passed straight to fit(); None (default) disables early stopping

    *Returns:
        *clf: Trained classifier
    """
    # Use the argument that was passed in. The previous body read the notebook-level
    # `params_dict` here, so `args_dict` was silently ignored.
    clf = lgb.LGBMClassifier(metric='None', **args_dict)
    clf.fit(
        X=data['X'],
        y=data['y'],
        # the validation split is the only evaluation set
        eval_set=[(data['x_valid'], data['y_valid'])],
        eval_metric=f1_score_wrapper,
        early_stopping_rounds=early_stopping_rounds
    )
    return clf


def make_preevaluation_df(clf, x, xtra_cols, y, cols):
    """
    *Build the frame used for evaluation: the selected extra columns plus the predicted
    * probability and the label for every row.

    *Args:
        *clf (): Fitted classifier exposing predict_proba
        *x (Dataframe): The feature set to score
        *xtra_cols (Dataframe): Extra columns belonging to the same rows as `x`
        *y (np.ndarray): The true label. If `None`, the 'Label' column is filled with 0.5
        *cols (List(String)): Columns of `xtra_cols` to keep, or `None` to keep all of them

    *Returns:
        * (Dataframe): the kept columns followed by 'Probability' (second column of
            predict_proba) and 'Label'
    """
    # subset where needed
    kept = xtra_cols if cols is None else xtra_cols[cols]
    # predict on every row; column 1 of predict_proba is used as the probability
    positive_proba = clf.predict_proba(x)[:, 1]
    # TODO : verify that we aren't predicting for the current month....
    labels = 0.5 if y is None else y
    return kept.assign(Probability=positive_proba).assign(Label=labels)


def get_metrics(df_preeval):
    """
    * Get accuracy metrics for a given set of observations+labels

    *Args:
        *df_preeval: output of make_preevaluation_df(); the columns 'Label' and
            'Probability' are read

    *Returns:
        *metrics: dict containing accuracy, precision, recall, g-mean and f1
        *conf_to_log: json struct for logging to ML Service
        *conf: the confusion matrix as a DataFrame (rows = actual, columns = predicted)
    """

    y_test = df_preeval["Label"]
    # probabilities below 0.5 become class 0, everything else class 1
    y_pred = [0.0 if x < 0.5 else 1.0 for x in df_preeval["Probability"]]
    # Pin the label order so the matrix is always 2x2. Without `labels`, a sample in
    # which only one class occurs yields a 1x1 matrix and the 2x2 DataFrame below
    # raises a shape error.
    conf_pre = confusion_matrix(y_test, y_pred, labels=[0, 1])

    conf_to_log = {
        "schema_type": "confusion_matrix",
        "schema_version": "v1",
        "data": {
            "class_labels": ["0", "1"],
            "matrix": conf_pre.tolist()}
    }
    conf = pd.DataFrame(
        conf_pre,
        index=["actualFALSE", "actualTRUE"],
        columns=["predFALSE", "predTRUE"]).reset_index()

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='binary'),
        'recall': recall_score(y_test, y_pred, average='binary'),
        'geometric_mean': geometric_mean_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred, average='binary'),
    }

    return metrics, conf_to_log, conf



In [6]:
def geometric_mean_score_wrapper(y_true, y_pred):
    """
    *Custom evaluation metric: g-mean of the scores cut at 0.5

    *Args:
        *y_true (Array): Actual labels
        *y_pred (Array): Predicted scores; values >= 0.5 count as class 1, everything else as class 0

    *Returns:
        *tuple: ('geometric mean', score, 1) - metric name, the value returned by
            geometric_mean_score and a trailing flag of 1
    """
    hard_labels = [int(score >= 0.5) for score in y_pred]
    g_mean = geometric_mean_score(y_true, hard_labels)
    return ('geometric mean', g_mean, 1)

def train_model(args_dict, data, eval_func_wrapper=None, early_stopping_rounds=None):
    """
    *Train a gradient boosted decision tree model

    *Args:
        *args_dict (dict): Keyword arguments forwarded to lgb.LGBMClassifier
        *data (dict-like): Must provide the entries 'X', 'y', 'x_valid' and 'y_valid'
        *eval_func_wrapper (callable, optional): Custom evaluation metric handed to fit()
            as eval_metric; with the default None no custom metric is passed
        *early_stopping_rounds (Numeric): Passed straight to fit(); None (default) disables early stopping

    *Returns:
        *clf: Trained classifier
    """
    clf = lgb.LGBMClassifier(metric='None', **args_dict)
    clf.fit(
        X=data['X'],
        y=data['y'],
        # the validation split is the only evaluation set
        eval_set=[(data['x_valid'], data['y_valid'])],
        # Previously commented out, so the required argument was silently ignored.
        # It is optional now, which also keeps calls that omit it working.
        eval_metric=eval_func_wrapper,
        early_stopping_rounds=early_stopping_rounds
    )
    return clf

In [7]:
# INPUTS
# load train and test set into numpy arrays
# The file is <input_dir>/<data_name>.pkl. Other cells index the loaded object with the
# keys 'X', 'y', 'x_valid', 'y_valid', 'x_test', 'y_test' and 'xtra_cols_test'.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load files from a trusted source.
data = joblib.load(os.path.join(args.input_dir, args.data_name + '.pkl'))


FileNotFoundError: [Errno 2] No such file or directory: '/Users/anders.swanson/temp/output\\split_data3\\deal_data3.pkl'

In [8]:
def demo_score(prob, act):
    """
    *Mean of the squared differences between `prob` and `act`

    *Args:
        *prob (Array): Predicted values
        *act (Array): Actual values

    *Returns:
        *mean squared difference
    """
    residual = prob - act
    return (residual ** 2).mean()

def crazy_wrapper(y_true, y_pred):
    """
    *Demo custom evaluation metric: mean squared difference between the predicted
    * scores and the actual labels (see demo_score)

    *Args:
        *y_true (Array): Actual labels
        *y_pred (Array): Predicted scores

    *Returns:
        *tuple: ('crazy_metric', value, False) - metric name, the demo_score value and
            the higher-is-better flag
    """
    # demo_score is a squared error, so a smaller value is the better one. The flag was
    # 1 (higher is better), which makes early stopping keep the worst iteration.
    return ('crazy_metric', demo_score(y_pred, y_true), False)

In [9]:
# Fit the classifier directly (not through train_model) so that the demo metric
# crazy_wrapper is the evaluation metric on the validation split.
clf = lgb.LGBMClassifier(metric='None', **params_dict)
clf.fit(
    X=data['X'],
    y=data['y'],
    # the validation split is the only evaluation set
    eval_set=[(data['x_valid'], data['y_valid'])],
    eval_metric=crazy_wrapper,
    early_stopping_rounds=50
)

NameError: name 'data' is not defined

In [17]:
# train model (w/ params from CL args)
# early_stopping_rounds=50 is forwarded to fit(); the log below reports
# "Training until validation scores don't improve for 50 rounds".
clf = train_model(params_dict, data, early_stopping_rounds=50)

[1]	valid_0's binary_error: 0.403105
Training until validation scores don't improve for 50 rounds
[2]	valid_0's binary_error: 0.403105
[3]	valid_0's binary_error: 0.403105
[4]	valid_0's binary_error: 0.393905
[5]	valid_0's binary_error: 0.352789
[6]	valid_0's binary_error: 0.321737
[7]	valid_0's binary_error: 0.305923
[8]	valid_0's binary_error: 0.295285
[9]	valid_0's binary_error: 0.283784
[10]	valid_0's binary_error: 0.278033
[11]	valid_0's binary_error: 0.273145
[12]	valid_0's binary_error: 0.269695
[13]	valid_0's binary_error: 0.267683
[14]	valid_0's binary_error: 0.265095
[15]	valid_0's binary_error: 0.26107
[16]	valid_0's binary_error: 0.258482
[17]	valid_0's binary_error: 0.254169
[18]	valid_0's binary_error: 0.251581
[19]	valid_0's binary_error: 0.253019
[20]	valid_0's binary_error: 0.249569
[21]	valid_0's binary_error: 0.250719
[22]	valid_0's binary_error: 0.250431
[23]	valid_0's binary_error: 0.250144
[24]	valid_0's binary_error: 0.250144
[25]	valid_0's binary_error: 0.248706

In [8]:
# HACK: due to early stopping bug, re-run model on best # iterations
# Overwrites n_estimators in params_dict (in place) with the best iteration of the
# previous fit, then refits without early stopping (early_stopping_rounds is left at
# its default of None).
# NOTE(review): params_dict and clf are both replaced here, so running this cell a
# second time reads best_iteration_ from the refit model - confirm that is intended.
params_dict['n_estimators'] = clf.best_iteration_
clf = train_model(params_dict, data)

[1]	valid_0's geometric mean: 0
Training until validation scores don't improve for 50 rounds
[2]	valid_0's geometric mean: 0
[3]	valid_0's geometric mean: 0
[4]	valid_0's geometric mean: 0.153383
[5]	valid_0's geometric mean: 0.373714
[6]	valid_0's geometric mean: 0.494396
[7]	valid_0's geometric mean: 0.542873
[8]	valid_0's geometric mean: 0.575456
[9]	valid_0's geometric mean: 0.606023
[10]	valid_0's geometric mean: 0.623658
[11]	valid_0's geometric mean: 0.639595
[12]	valid_0's geometric mean: 0.652282
[13]	valid_0's geometric mean: 0.658584
[14]	valid_0's geometric mean: 0.667219
[15]	valid_0's geometric mean: 0.677638
[16]	valid_0's geometric mean: 0.683231
[17]	valid_0's geometric mean: 0.690949
[18]	valid_0's geometric mean: 0.69505
[19]	valid_0's geometric mean: 0.697233
[20]	valid_0's geometric mean: 0.703351
[21]	valid_0's geometric mean: 0.702576
[22]	valid_0's geometric mean: 0.704013
[23]	valid_0's geometric mean: 0.705681
[24]	valid_0's geometric mean: 0.707615
[25]	valid

In [21]:
# Score the test split and keep the id, timestamp and 'AvaOpportunityId' columns next
# to the predicted probability and the true label.
df_preeval_test = make_preevaluation_df(
    clf, data['x_test'], data['xtra_cols_test'], data['y_test'],
    [ID_COLUMN, DATE_COLUMN] + ['AvaOpportunityId'])
# NOTE(review): this displays the whole frame; .head() would keep the output small.
df_preeval_test

Unnamed: 0,id,dateTime,AvaOpportunityId,Probability,Label
17387,110fb940-9a06-41af-a533-3fb9615293cd,2020-02-26 00:45:21,A000227053,0.903614,1
17388,559adcce-9adf-4dbf-8b19-a2378f572942,2020-02-26 04:29:15,A000227059,0.403573,0
17389,3704d4fc-7e89-44d2-9b00-6e363885d3f3,2020-02-26 09:56:41,A000227068,0.941018,1
17390,733cf839-6a48-436f-801e-863a8a18bacd,2020-02-26 10:04:49,A000227069,0.901812,1
17391,dc981c3b-020d-4734-939f-71a9e8bb3538,2020-02-26 10:07:59,A000227070,0.512002,1
...,...,...,...,...,...
18037,819403b7-2b04-476d-b9e0-6aeedcb532e1,2020-06-23 00:22:25,A000236549,0.929687,0
18038,072bbdf3-9e2e-4980-a37b-ae955693a744,2020-06-23 16:54:10,A000236630,0.939531,1
18039,1942d87c-8cbe-4198-97cf-4dfc80aed195,2020-06-23 18:01:22,A000236633,0.425460,0
18040,fec55fb7-9bbf-48d3-8db6-cd101124773a,2020-06-24 07:23:29,A000236662,0.336744,0


In [22]:
# Sanity check: g-mean on the test split with the probabilities cut at 0.5
# (the same cut-off the evaluation wrappers use).
geometric_mean_score(df_preeval_test['Label'], [1 if y >= 0.5 else 0 for y in df_preeval_test['Probability']])

0.7627762243310613

In [None]:
# Scratch cell: a literal tuple in the (name, value, flag) layout that the custom
# evaluation wrappers return, filled with the g-mean computed in the previous cell.
('metric name', 0.7627762243310613, 1)

In [13]:
# calculate accuracy metrics
# get_metrics returns the metrics dict, the confusion matrix in the JSON layout used
# for logging, and the same matrix as a DataFrame.
metrics, conf_to_log, conf = get_metrics(df_preeval_test)
print("confusion matrix:\n", conf)
# last expression of the cell, so the metrics dict is displayed
metrics

confusion matrix:
          index  predFALSE  predTRUE
0  actualFALSE        147        61
1   actualTRUE         79       368


{'accuracy': 0.7862595419847328,
 'precision': 0.8578088578088578,
 'recall': 0.8232662192393736,
 'geometric_mean': 0.7627762243310613,
 'f1': 0.8401826484018265}

In [None]:
# OUTPUTS
# log metrics
run = Run.get_context()
# Log the parsed arguments as text instead of handing run.log the raw
# argparse.Namespace object.
run.log("Arguments", str(vars(args)))
# class balance of the validation split
run.log('majority_obs', np.sum(data['y_valid'] == 0))
run.log('minority obs', np.sum(data['y_valid'] == 1))
run.log('precision', metrics['precision'])
run.log('recall', metrics['recall'])
run.log('f1', metrics['f1'])
run.log('geometric mean', metrics['geometric_mean'])

# `lift` was logged without ever being computed (NameError on a fresh kernel).
# Lift = positive rate among the top 10% of scored rows divided by the positive rate
# of all rows. calc_recall_rate reads the label from a 'TMP_LABEL' column, so the
# test labels are copied into that column first.
df_lift = df_preeval_test.assign(TMP_LABEL=df_preeval_test['Label'])
top_rate, _ = calc_recall_rate(df_lift, 0.1)
overall_rate = (df_lift['TMP_LABEL'] >= 1).mean()
lift = float(top_rate / overall_rate)
run.log('LIFT', lift)


In [None]:
# save model
# make sure both target folders exist before writing
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs('outputs', exist_ok=True)

# The fitted classifier is written twice as <model_name>.pkl: once into the relative
# 'outputs' folder (presumably the folder the run tracks - TODO confirm) and once into
# the --output_dir location.
joblib.dump(value=clf, filename=os.path.join('outputs', args.model_name + '.pkl'))
joblib.dump(value=clf, filename=os.path.join(args.output_dir, args.model_name + '.pkl'))
