## ML Notebook

- Dev (cv based iterations on 2017+2018)
- Final model and object packing
- Prediction (for 2019)

In [253]:
# initialization

%reset -f

import sys, pandas as pd, numpy as np, inspect, re as re, functools as functools, pickle, glob, warnings, os
import itertools
from tqdm import tqdm

# sklearn packages
import sklearn
import sklearn.metrics as skm
from sklearn.base import clone

# notebook-wide options/variables
randomseed = 1 # the value for the random state used at various points in the pipeline
pd.options.display.max_rows = 50 # max rows pandas prints before truncating; raise it to see full output in cells
pd.options.display.max_columns = 200

# display every bare expression in a cell (not only the last one) without using print/display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# append the scripts path to sys.path so the custom modules imported below can be found
sys.path.append('./scripts/')

# silence ALL warnings for the rest of the session
# NOTE(review): this also hides pandas SettingWithCopyWarning and library deprecation warnings
warnings.filterwarnings('ignore')

# plot inline
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# # import the various ml modules
import xgboost as xgb

############################################## import the custom modules ################################
import helperfuncs as helper
import feateng as fte
import misc as miscfun
from misc import ce_encodings, DataFrameImputer, scalers

# instantiate the helper classes from the custom modules imported above
# (helperfuncs, feateng, misc).
# NOTE(review): none of these three instances is referenced again in the visible
# cells; presumably the scripts executed via `%run -i` rely on them -- confirm.
helpers = helper.helper_funcs()
cust_funcs = fte.custom_funcs()
feat_sel = miscfun.feat_selection()

#############################################################################################################
# global function to flatten columns after a grouped operation and aggregation
# outside all classes since it is added as an attribute to pandas DataFrames
def __my_flatten_cols(self, how="_".join, reset_index=True):
    """Collapse MultiIndex column labels into single-level labels.

    Intended to be attached to ``pd.DataFrame`` (see the assignment below) so it
    can be chained after a grouped aggregation.

    Parameters
    ----------
    how : callable or the string "last"
        Called with an iterable of the non-empty level labels (as strings) of
        one column and must return the flattened label. The default joins the
        labels with "_". The string "last" keeps only the last non-empty label.
    reset_index : bool
        If True, return a new frame with the index replaced by a default
        integer index (the old index is dropped). If False, return ``self``.

    Notes
    -----
    When the columns are a MultiIndex they are overwritten on ``self`` in
    place, even when a new frame is returned because of ``reset_index``.
    """
    if how == "last":
        def combine(labels):
            return list(labels)[-1]
    else:
        combine = how

    # frames with single-level columns are left untouched
    if isinstance(self.columns, pd.MultiIndex):
        flattened = []
        for levels in self.columns.values:
            non_empty_labels = filter(None, map(str, levels))
            flattened.append(combine(non_empty_labels))
        self.columns = flattened

    if reset_index:
        return self.reset_index(drop=True)
    return self
pd.DataFrame.my_flatten_cols = __my_flatten_cols

In [254]:
from azure.datalake.store import core, lib, multithread

# Service-principal settings used to authenticate against Azure Data Lake Store.
tenant = 'cef04b19-7776-4a94-b89b-375c77a8f936'
resource = 'https://datalake.azure.net/'
client_id = 'e9aaf06a-9856-42a8-ab3c-c8b0d3a9b110'

# SECURITY: the client secret must never be stored in the notebook. It is read
# from the ADLS_CLIENT_SECRET environment variable (`os` is imported in the
# initialization cell). The secret that was previously hardcoded here has been
# exposed through the notebook and should be rotated.
client_secret = os.environ.get('ADLS_CLIENT_SECRET')
if not client_secret:
    raise RuntimeError(
        "ADLS_CLIENT_SECRET is not set; export the service-principal secret "
        "as an environment variable before running this notebook."
    )

# obtain a token for the service principal
adlcreds = lib.auth(tenant_id = tenant,
                   client_secret = client_secret,
                   client_id = client_id,
                   resource = resource)

subs_id = '73f88e6b-3a35-4612-b550-555157e7059f'  # not used in the visible cells
adls = 'edhadlsanasagbdev'  # data lake store name

# file-system client used by the load/save cells below
adlsfsc = core.AzureDLFileSystem(adlcreds, store_name=adls)

# base folder for all project inputs/outputs on the store (note the trailing '/')
path = '/root/anasandbox/people/opr10x/'

In [255]:
# Load the analytical dataset (ADS) from the data lake.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load this file from a store whose contents are trusted.
# NOTE(review): `path` already ends with '/', so the joined path contains '//';
# the string is kept unchanged -- confirm the store tolerates it.
with adlsfsc.open(path + '/2019/Data/Output_Data/ads/final_ads.pickle', 'rb') as f:
    # the `with` block closes the handle on exit, so no explicit close is needed
    ads = pickle.load(f)

In [256]:
# frequency of each `opr` value in the loaded dataset (value_counts skips NaN by default)
ads['opr'].value_counts()

2.0    6470
3.0    4318
4.0    1712
1.0     902
5.0     537
0.0     472
Name: opr, dtype: int64

In [257]:
# Final PROCESSING
#
# NOTE(review): this cell reassigns `ads` from itself, so it is NOT idempotent:
# running it a second time without reloading the pickle drops the new class 0
# and shifts every label down again. Re-run the load cell before re-running it.

# keep only rows with opr > 0, then shift the labels down by one so that the
# remaining classes start at 0.
# NOTE(review): the comparison is False for NaN, so rows with a missing `opr`
# are removed here as well -- including any such rows for the prediction year.
# Confirm that is intended.
# `.copy()` makes the filtered frame independent, so the column assignment below
# is not a write on a slice of the original frame.
ads = ads.loc[ads['opr'] > 0].copy()
ads['opr'] = ads['opr'] - 1
ads = ads.reset_index(drop=True)

## split into train+valid/prediction sets
# training rows need both an id and a label; prediction rows only need an id
traindf = (
    ads.loc[ads['year'].isin([2017, 2018])]
    .dropna(subset=['global_id', 'opr'], how='any')
    .reset_index(drop=True)
)
preddf = (
    ads.loc[ads['year'] == 2019]
    .dropna(subset=['global_id'], how='any')
    .reset_index(drop=True)
)

# pull the response out as an array, then remove it from both feature frames
ytraindf = np.array(traindf['opr'])

traindf = traindf.drop(columns=['opr'])
preddf = preddf.drop(columns=['opr'])

In [258]:
# two-column table: class label | number of training rows with that label
class_labels, class_counts = np.unique(ytraindf, return_counts=True)
np.column_stack((class_labels, class_counts))

array([[0.000e+00, 6.920e+02],
       [1.000e+00, 4.980e+03],
       [2.000e+00, 3.385e+03],
       [3.000e+00, 1.317e+03],
       [4.000e+00, 4.100e+02]])

# ORDINAL CLASSIFICATION APPROACH

## FEATURE LIST

In [290]:
# list the feature columns available to the model
# NOTE(review): this cell's execution count (290) is higher than the cells that
# follow it, i.e. it was run out of order; re-check with Restart & Run All.
traindf.columns

Index(['global_id', 'employee_band', 'ebm_level', 'year', 'function',
       'mr_pers_compgroup_year_comp_score_mean_functional_competencies',
       'mr_pers_compgroup_year_comp_score_mean_leadership_competencies',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_develop_people',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_dream_big',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_live_our_culture',
       'net_target', 'teamsize', 'teamsize_delta', 'index_average',
       'position_velocity', 'emp_time_in_band1', 'count_of_belts',
       'talentpool_renomination', 'talentpool', 'engagement_score',
       'manager_effectiveness_score', 'fs_prom', 'fs_ho', 'fs_adherant_perc',
       'fs_to_overall', 'dr_prom', 'dr_ho', 'dr_adherant_perc',
       'dr_to_overall', 'mean_team_tenure', 'lc_count', 'fc_count',
       'position_tenure', 'zone', 'target_delta', 'prev_opr', 'prev_prev_opr'],
      dtype='object')

### SPLIT INTO TRAIN==2017 AND VALID==2018

In [259]:
# train_0to5_2017 = traindf.copy()
# train_0to5_2017['response'] = ytraindf
# train_0to5_2017 = train_0to5_2017[train_0to5_2017['year'].isin([2017])]
# ytrain_0to5_2017 = np.array(train_0to5_2017.response)
# train_0to5_2017.drop(columns=['response'], inplace=True)

# valid_0to5_2018 = traindf.copy()
# valid_0to5_2018['response'] = ytraindf
# valid_0to5_2018 = valid_0to5_2018[valid_0to5_2018['year']==2018]
# yvalid_0to5_2018 = np.array(valid_0to5_2018.response)
# valid_0to5_2018.drop(columns=['response'], inplace=True)

In [284]:
# base estimator handed to OrdinalClassifier together with the grid below
clf = xgb.XGBClassifier()
clf  # echoed because ast_node_interactivity is set to "all" in the initialization cell

# hyper-parameter grid: every value is a list of candidates.
# Only max_depth (2), scale_pos_weight (3) and base_score (2) have more than
# one candidate, which gives 2 * 3 * 2 = 12 combinations.
params = dict(
    objective=['binary:logistic'],
    learning_rate=[0.1],
    max_depth=[3, 10],
    min_child_weight=[7],
    silent=[1],
    subsample=[0.85],
    colsample_bytree=[0.85],
    n_estimators=[10],
    seed=[1],
    scale_pos_weight=[1, 3, 5],
    base_score=[0.35, 0.45],
    max_bin=[300],
    gamma=[0.01],
    reg_alpha=[0.01],
    reg_lambda=[0.03],
)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [285]:
## use below script to iterate on train=2017, valid=2018

# %run -i ./scripts/ordinal_classifier_with_valid.py

In [287]:
## use below script to create cross-validated model on 2017+2018 or 2017 only or 2018 only

# `%run -i` executes the script inside the notebook's own namespace.
# NOTE(review): presumably this script defines `OrdinalClassifier` and relies on
# names created in earlier cells -- confirm against the script.
%run -i ./scripts/ordinal_classifier_without_valid.py

# copies are passed so the classifier cannot modify the notebook-level
# `traindf` / `ytraindf`; `clf` and `params` come from the cell above
xgb_ordinal = OrdinalClassifier(train=traindf.copy(), ytrain=ytraindf.copy(),
                               clf=clf, params=params)

# fit the binary classifiers
xgb_ordinal.fit()

category encoding is happening ... 

category encoding completed 











  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   19.3s finished








 25%|█████████████████████                                                               | 1/4 [00:19<00:59, 19.87s/it]

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    9.5s finished








 50%|██████████████████████████████████████████                                          | 2/4 [00:30<00:34, 17.03s/it]

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    9.4s finished








 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:40<00:15, 15.06s/it]

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    9.0s finished








100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:50<00:00, 13.35s/it]









In [288]:
# Persist the fitted ordinal classifier object to the data lake.
# NOTE(review): `path` already ends with '/', so the joined path contains '//';
# the string is kept unchanged -- confirm the store tolerates it.
with adlsfsc.open(path + '/2019/Data/Output_Data/ml_prediction_objects/ordinal_classifier_objects.pickle', 'wb') as f:
    # the `with` block flushes and closes the handle on exit, so no explicit close is needed
    pickle.dump(xgb_ordinal, f)

# MULTICLASS APPROACH

## FEATURE LIST

In [268]:
# traindf.columns

In [269]:
# %run -i ./scripts/ml_xgboost_5class.py

# xgb_model, xgb_feat_names, xgtrain, xgtest, xgbpred, xgbpredprobs = quick_model_xgb(train=train_0to5_2017.copy(),
#                                                                                     valid=valid_0to5_2018.copy(),
#                                                                                     ytrain=ytrain_0to5_2017.copy(),
#                                                                                     yvalid=yvalid_0to5_2018.copy())

In [270]:
# def model_params(space_size=3, randomseed=123):
#     depth = np.arange(15, 20, dtype=int)
#     delta_step = np.arange(3, 6, dtype=int)
#     base_score = np.arange(0.35, 0.5, 0.05, dtype=float)
#     colsample_level = np.arange(0.7, 0.9, 0.05, dtype=float)
#     colsample_tree = np.arange(0.7, 0.9, 0.05, dtype=float)
#     subsample = np.arange(0.75, 0.9, 0.05, dtype=float)
    
#     model_params_grid = list(itertools.product(depth, delta_step, base_score, 
#                                           colsample_level, colsample_tree, subsample))
#     model_params_grid = pd.DataFrame(model_params_grid, 
#                 columns=['maxdepth', 'maxdeltastep', 'basescore', 'colsamplebylevel', 'colsamplebytree', 'subsample'])
#     model_params_grid = model_params_grid.sample(space_size, random_state=randomseed).reset_index(drop=True)
    
#     return model_params_grid

# mpg = model_params()

In [271]:
# %run -i ./scripts/ml_xgboost_5class_cv.py

# xgb_model={}
# xgb_featnames={}
# xgb_cv_results={}
# encoderobj={}
# scalerobj={}

# for i in tqdm(range(mpg.shape[0])):
#     xgb_model[i], xgb_featnames[i], xgb_cv_results[i], encoderobj[i], scalerobj[i] = quick_model_xgb(train=traindf.copy(), 
#                                                                                         ytrain=ytraindf.copy(),
#             params={'objective':'multi:softprob', 'eval_metric':['mlogloss', 'merror'], 'tree_method':'exact', 
#                     'silent':1, 'nthread':-1, 'num_class':5, 'learning_rate':0.1,  
#                     'n_jobs': -1, 'seed':1, 'grow_policy':'lossguide', 'max_bin':500, 
#                     'alpha': 0.02, 'gamma': 0.03, 'lambda': 0.01, 'min_child_weight': 9,
#                     ## params from the param grid are below
#                     'max_depth': mpg.loc[i, 'maxdepth'], 
#                     'max_delta_step': mpg.loc[i, 'maxdeltastep'], 
#                     'base_score': mpg.loc[i, 'basescore'], 
#                     'colsample_bylevel': mpg.loc[i, 'colsamplebylevel'],
#                     'colsample_bytree': mpg.loc[i, 'colsamplebytree'], 
#                     'subsample': mpg.loc[i, 'subsample']}, num_round=100)

In [272]:
# xgb.plot_importance(xgb_model[0], max_num_features=10)

In [273]:
# %run -i ./scripts/ml_xgboost_tuning.py

# xgbmod = xgbclass(train=train_0to5.copy(), ytrain=ytrain_0to5.copy())
# xgbtrials=pd.DataFrame(xgbmod.trials.results)
# #xgbmod.get_xgb_imp().head(5)

In [274]:
# # save all objects in a pickle for prediction (run this cell if the modelling code used was xgboost-multiclass)

# with adlsfsc.open(path + '/2019/Data/Output_Data/ml_prediction_objects/ml_model_scaler_encoder.pickle', 'wb') as f:
#     pickle.dump(xgb_model, f)
#     pickle.dump(encoderobj, f)
#     pickle.dump(scalerobj, f)
#     pickle.dump(xgb_featnames, f)
#     f.close()