## Prediction Notebook

- predict suggestions for 2019 set

In [87]:
# initialization

%reset -f

import sys, pandas as pd, numpy as np, inspect, re as re, functools as functools, pickle, glob, warnings, os

# sklearn packages
import sklearn.metrics as skm

# some options/variables
randomseed = 1 # the value for the random state used at various points in the pipeline
pd.options.display.max_rows = 50 # raise this if you want the full output in cells rather than the truncated list
pd.options.display.max_columns = 200

# to display multiple outputs in a cell without using print/display:
# every top-level expression in a cell is echoed, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# append the scripts path to pythonpath so the custom modules imported below resolve
sys.path.append('./scripts/')

# ignore warnings (only if you are the kind that would code when the world is burning)
# NOTE(review): this silences every warning category for the whole session,
# including pandas chained-assignment warnings raised by later cells
warnings.filterwarnings('ignore')

# import the various ml modules
import xgboost as xgb

############################################## import the custom modules ################################
import helperfuncs as helper
import feateng as fte
from misc import ce_encodings, DataFrameImputer, scalers

# instantiate the classes
# `helpers` supplies csv_read, used further down to read the peer-group CSV
helpers = helper.helper_funcs()
# NOTE(review): cust_funcs is not referenced in the visible cells; it may be used by the %run scripts
cust_funcs = fte.custom_funcs()

#############################################################################################################
# global function to flatten columns after a grouped operation and aggregation
# outside all classes since it is added as an attribute to pandas DataFrames
def __my_flatten_cols(self, how="_".join, reset_index=True):
    """Collapse MultiIndex column labels into flat labels, in place.

    how: callable that receives the non-empty, stringified level values of
         one column and returns the flat label (default joins them with "_");
         the string "last" keeps only the innermost level value.
    reset_index: when True, return the frame with a fresh 0..n-1 index
         (the old index is dropped); otherwise return the frame itself.

    Frames whose columns are not a MultiIndex keep their labels untouched.
    """
    if how == "last":
        combine = lambda parts: list(parts)[-1]
    else:
        combine = how

    if isinstance(self.columns, pd.MultiIndex):
        flat_labels = []
        for levels in self.columns.values:
            # drop empty level values before combining
            non_empty = filter(None, map(str, levels))
            flat_labels.append(combine(non_empty))
        self.columns = flat_labels

    if reset_index:
        return self.reset_index(drop=True)
    return self


# expose the helper as a DataFrame method
pd.DataFrame.my_flatten_cols = __my_flatten_cols

In [88]:
# run the dictionary/column definitions script inside this notebook's namespace (-i)
# NOTE(review): presumably the source of rev_dep_dict_without1B used by later cells — confirm
%run -i ./scripts/dicts_cols.py

In [89]:
from azure.datalake.store import core, lib, multithread

# Azure AD service-principal settings for the Data Lake Store connection.
# Tenant and client ids may be overridden through the environment; the
# defaults are the ids this notebook has always used.
tenant = os.environ.get('ADLS_TENANT_ID', 'cef04b19-7776-4a94-b89b-375c77a8f936')
resource = 'https://datalake.azure.net/'
client_id = os.environ.get('ADLS_CLIENT_ID', 'e9aaf06a-9856-42a8-ab3c-c8b0d3a9b110')

# SECURITY: the client secret must not be stored in the notebook or in
# version control. It is read from the environment instead; the secret that
# was previously committed here should be treated as leaked and rotated.
client_secret = os.environ.get('ADLS_CLIENT_SECRET')
if not client_secret:
    raise RuntimeError('ADLS_CLIENT_SECRET is not set; export the service-principal '
                       'secret before running this notebook')

# authenticate the service principal against Azure AD
adlcreds = lib.auth(tenant_id = tenant,
                   client_secret = client_secret,
                   client_id = client_id,
                   resource = resource)

subs_id = '73f88e6b-3a35-4612-b550-555157e7059f'
adls = 'edhadlsanasagbdev'

# filesystem handle used by every read/write cell below
adlsfsc = core.AzureDLFileSystem(adlcreds, store_name=adls)

# root folder on the store; later cells append '/2019/...' to it
path = '/root/anasandbox/people/opr10x/'

In [90]:
# Load the full analytical dataset (ADS) from the data lake.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load objects written by this pipeline from a trusted store.
with adlsfsc.open(path + '/2019/Data/Output_Data/ads/final_ads.pickle', 'rb') as f:
    # the context manager closes the handle; no explicit close() needed
    ads = pickle.load(f)

# Final PROCESSING
# keep the 2019 rows that have a global_id, renumber them, and drop the
# 'opr' column from the prediction frame
preddf = ads[ads['year']==2019].copy()
preddf.dropna(subset=['global_id'], inplace=True, how='any')
preddf.reset_index(drop=True, inplace=True)
preddf.drop(columns=['opr'], inplace=True)

In [91]:
# get the scope ids

# read the peer-group file: one row per in-scope employee with the hiring date
with adlsfsc.open(path + '/2019/Data/Output_Data/peer_group/peer_group_30april.csv', 'rb') as f:
    # the context manager closes the handle; no explicit close() needed
    scope = helpers.csv_read(f, cols_to_keep=['global_id', 'emp_hiring_date'])

scope['scopeflag'] = 1

# predflag is added on a temporary copy so preddf itself is never mutated
pred_scope = preddf.assign(predflag=0).merge(scope, on=['global_id'], how='outer')
pred_scope = pred_scope[pred_scope['scopeflag']==1].copy()
pred_scope['emp_hiring_date'] = pd.to_datetime(pred_scope['emp_hiring_date'], format='%Y-%m-%d')

# cutoff parsed with the same year-month-day format as the column
# (the previous '%Y-%d-%m' swapped day and month and only worked for 01-01)
hire_cutoff = pd.to_datetime('2019-01-01', format='%Y-%m-%d')
# rows with a missing hiring date compare False and therefore stay in scope
pred_scope['hire_flag'] = np.where(pred_scope['emp_hiring_date'] > hire_cutoff, 1, 0)
pred_scope = pred_scope[pred_scope['hire_flag']==0]
scope_ids = pred_scope.global_id

# explicit copy so later in-place operations do not act on a slice of the old frame
preddf = preddf[preddf['global_id'].isin(scope_ids)].copy()
preddf.shape

(4184, 2)


(4080, 37)

## DISTRIBUTION

In [92]:
# DISTRIBUTION SNIPPET

## zonal distribution df
### use below snippet to use for distribution when doing zone level distribution
# distribution_df = ads[['zone', 'year', 'opr', 'global_id']].copy()
# distribution_df = distribution_df[distribution_df['year'].isin([2018])]
# distribution_df = distribution_df[distribution_df['opr']>0]
# distribution_df['opr'] = distribution_df['opr']-1
# distribution_df = distribution_df.groupby(['zone', 'year', 'opr'])['global_id'].count().reset_index()
# distribution_df = distribution_df.groupby(['zone', 'opr'])['global_id'].mean().reset_index()

# global distribution df
# head-count per (year, opr) for 2018
class_counts = (
    ads.loc[ads['year'].isin([2018]), ['year', 'opr', 'global_id']]
    .groupby(['year', 'opr'])['global_id']
    .count()
    .reset_index()
)

# keep only opr > 0 and shift the classes down by one
class_counts = class_counts[class_counts['opr'] > 0].assign(opr=lambda d: d['opr'] - 1)

# average count per class, expressed as a share of the total
global_distribution_df = class_counts.groupby(['opr'])['global_id'].mean().reset_index()
global_distribution_df.columns = ['oprclass', 'distr']
global_distribution_df['distr'] = (
    global_distribution_df['distr'] / global_distribution_df['distr'].sum()
)

# highest class first, with the running share rounded to 4 decimals
global_distribution_df = global_distribution_df.sort_values(by='oprclass', ascending=False)
global_distribution_df['cumdistr'] = global_distribution_df['distr'].cumsum().round(4)
global_distribution_df = global_distribution_df.reset_index(drop=True)

# USE BELOW SNIPPETS FOR PREDICTION USING THE ORDINAL APPROACH

In [93]:
%run -i ./scripts/ordinal_classifier_without_valid.py

with adlsfsc.open(path + '/2019/Data/Output_Data/ml_prediction_objects/ordinal_classifier_objects.pickle', 'rb') as f:
    xgb_ordinal = pickle.load(f)
    f.close()

In [94]:
# run the ordinal prediction script in this namespace
# NOTE(review): presumably defines predict_opr_ordinal — confirm
%run -i ./scripts/opr_prediction_ordinal.py

# score a copy of the in-scope frame with the loaded classifier objects;
# the result is unpacked as (probs, preds, clfs_predict_class1), where the
# next cell uses probs as a 5-column table and clfs_predict_class1 as a dict
probs, preds, clfs_predict_class1 = predict_opr_ordinal(preddf.copy(), xgb_ordinal)

In [95]:
# Build the per-classifier score table (columns keyed 0..3) and attach ids.
# global_id is assigned positionally, so the scores are taken to be in preddf row order.
clfs_predict_df = pd.DataFrame.from_dict(clfs_predict_class1)
clfs_predict_df['global_id'] = np.array(preddf.global_id)
# NOTE(review): probs is assumed to be in the same row order as preddf — confirm against predict_opr_ordinal
probs_df = pd.DataFrame(data=probs, columns=['1A', '3B', '3A', '4B', '4A'])

# Remember every row's original position. The loop below re-sorts the frame
# four times, and the probabilities must later be re-ordered the same way.
clfs_predict_df['_orig_pos'] = np.arange(len(clfs_predict_df))

# Apply the target shape: for classifier 3 down to 0, rank employees by the
# classifier's score and flag the top share given by the cumulative 2018
# distribution (row i of global_distribution_df, column 'cumdistr').
for i in range(4):
    clf_key = 3 - i
    prefix = str(clf_key)
    # stable sort: ties keep the order produced by the previous classifier
    clfs_predict_df.sort_values(by=[clf_key], inplace=True, ascending=False, kind='mergesort')
    clfs_predict_df.reset_index(inplace=True, drop=True)
    clfs_predict_df[prefix + '_new_index'] = clfs_predict_df.index
    clfs_predict_df[prefix + '_index_perc'] = clfs_predict_df[prefix + '_new_index'].rank(pct=True)
    clfs_predict_df[prefix + '_flag'] = np.where(
        clfs_predict_df[prefix + '_index_perc'] < global_distribution_df.iloc[i, 2], 1, 0)

# the highest flagged classifier wins; nobody flagged -> class 0
clfs_predict_df['predictions'] = np.select(
    [clfs_predict_df['3_flag'] == 1,
     clfs_predict_df['2_flag'] == 1,
     clfs_predict_df['1_flag'] == 1,
     clfs_predict_df['0_flag'] == 1],
    [4, 3, 2, 1],
    default=0)

# Re-order the probabilities to match the sorted frame before concatenating.
# Previously probs_df (original order) was glued positionally onto the
# re-sorted ids, attaching each probability row to the wrong global_id.
sorted_to_orig = clfs_predict_df.pop('_orig_pos').values
aligned_probs_df = probs_df.iloc[sorted_to_orig].reset_index(drop=True)

shape_predictions_df = clfs_predict_df[['global_id', 'predictions']].copy()
shape_predictions_df = pd.concat([shape_predictions_df.reset_index(drop=True),
                                 aligned_probs_df], axis=1)

In [96]:
# save the pickle and flatfile of the ML predictions from ordinal approach
# (the context managers close the handles; no explicit close() needed)
with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs_ordinal.pickle', 'wb') as f:
    pickle.dump(shape_predictions_df, f)

with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs_ordinal.csv', 'wb') as f:
    # the handle is binary, so the CSV text is encoded before writing
    pred_str = shape_predictions_df.to_csv()
    f.write(pred_str.encode('utf-8'))
    
# with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs_ordinal.pickle', 'rb') as f:
#     shape_predictions_df = pickle.load(f)
#     f.close()

296037

In [97]:
%run -i ./scripts/rules_shape_rules.py

df = apply_rules(dset=preddf.copy(), predictions=shape_predictions_df.copy(), prev_prev_opr_col='prev_prev_opr', prev_opr_col='prev_opr',
                pred_col='predictions', tib_col='emp_time_in_band1', mei_col='engagement_score',
                ta_col='net_target', mean_ca_col='mr_pers_compgroup_year_comp_score_mean_leadership_competencies')

final_prediction_df = df.copy()
final_prediction_df = final_prediction_df.merge(shape_predictions_df[['global_id', 'predictions']].copy(), 
                                                on='global_id', how='left')
final_prediction_df['predictions'] = final_prediction_df['predictions'].map(rev_dep_dict_without1B)
final_prediction_df['predictions'] = np.where(final_prediction_df['rules_prediction'],
                                             final_prediction_df['predictions'],
                                             final_prediction_df['rules_prediction'])
final_prediction_df = final_prediction_df[['global_id', 'zone', 'predictions']]

In [115]:
# Build a single global ML ranking: employees are ordered by predicted class
# (4 first), and within a class by their position in that class's classifier
# ordering. Classes 1 and 0 share the last band and use classifier 0's order.
ml_rank_df = clfs_predict_df[['global_id', 'predictions', '3_new_index', '2_new_index', '1_new_index', '0_new_index']].copy()

# Band width is taken from the row count instead of a fixed 10000: every
# *_new_index lies in [0, n-1], so bands k*n .. k*n+n-1 can never overlap,
# whatever the population size. Ordering is unchanged for n <= 10000.
band_width = len(ml_rank_df)
ml_rank_df['rank'] = np.where(ml_rank_df['predictions']==4, 1*band_width+ml_rank_df['3_new_index'],
                             np.where(ml_rank_df['predictions']==3, 2*band_width+ml_rank_df['2_new_index'],
                                     np.where(ml_rank_df['predictions']==2, 3*band_width+ml_rank_df['1_new_index'],
                                              4*band_width+ml_rank_df['0_new_index'])))

# turn the band key into a dense 0..n-1 rank
ml_rank_df.sort_values(by=['rank'], ascending=True, inplace=True)
ml_rank_df.reset_index(drop=True, inplace=True)
ml_rank_df['rank'] = ml_rank_df.index

# map the ML class through rev_dep_dict_without1B and drop the numeric class
ml_rank_df['predictions_ML'] = ml_rank_df['predictions'].map(rev_dep_dict_without1B)
ml_rank_df.drop(columns=['predictions'], inplace=True)

# NOTE: this renames the column on final_prediction_df itself, so the final
# CSV written at the end of the notebook carries 'predictions_SHAPE_RULES'
# once this cell has run.
final_prediction_df.rename(columns={'predictions':'predictions_SHAPE_RULES'}, inplace=True)
ml_rank_df = ml_rank_df.merge(final_prediction_df, on=['global_id'], how='left')
ml_rank_df = ml_rank_df[['global_id', 'zone', 'rank', 'predictions_ML', 'predictions_SHAPE_RULES']]

In [116]:
# write the ML ranking as CSV; the handle is binary, so the text is encoded
# first, and the context manager closes it (no explicit close() needed)
with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_rank.csv', 'wb') as f:
    mlrank_str = ml_rank_df.to_csv()
    f.write(mlrank_str.encode('utf-8'))

165189

# USE BELOW SNIPPETS FOR PREDICTION USING THE MULTICLASS APPROACH

In [36]:
# %run -i ./scripts/opr_prediction_multiclass.py

# # create the prediction dictionary to append and take mean of all the probabilities
# probs={}

# for i in model.keys():
#     probs[i], pred_ids = predict_opr_multiclass(preddf.copy(), model[i], encoderobj[i], scalerobj[i], featnames[i])
# probsfull = pd.concat(probs.values()).groupby(level=0).mean()

# # convert the probabilities to labels (classes)
# pred_labels = pd.DataFrame(np.argmax(np.array(probsfull), 1))
# pred_labels.columns = ['predictions']
# pred_labels['global_id'] = pred_ids
# predictions_df = pd.concat([probsfull, pred_labels], axis=1)

In [37]:
# # save the pickle and flatfile of the ML predictions from multiclass approach
# with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs.pickle', 'wb') as f:
#     pickle.dump(predictions_df, f)
#     f.close()
    
# with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs.csv', 'wb') as f:
#     pred_str = predictions_df.to_csv()
#     f.write(str.encode(pred_str))
#     f.close()
    
# # with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/2019_predictions_probs.pickle', 'rb') as f:
# #     predictions_df = pickle.load(f)
# #     f.close()

## RULES

In [40]:
# %run -i ./scripts/rules_shape_rules.py

# df = apply_rules(dset=preddf.copy(), predictions=predictions_df.copy(), prev_prev_opr_col='prev_prev_opr', prev_opr_col='prev_opr',
#                 pred_col='predictions', tib_col='emp_time_in_band1', mei_col='engagement_score',
#                 ta_col='net_target', mean_ca_col='mr_pers_compgroup_year_comp_score_mean_leadership_competencies')

## WEIGHTED SHAPE LOGIC

In [38]:
# pred_withzone = predictions_df.merge(preddf[['global_id', 'zone']], how='left', on=['global_id'])
# pred_withzone['score'] = (pred_withzone['1A']*120567.4303 + 
#                           pred_withzone['3B']*120570.5456 + 
#                           pred_withzone['3A']*120572.7815 +
#                           pred_withzone['4B']*120575.186 + 
#                           pred_withzone['4A']*120578.4064)

# # use the distribution to apply shape
# pred_withzone.sort_values(by=['score'], ascending=False, inplace=True)
# pred_withzone.reset_index(drop=True, inplace=True)
# pred_withzone['newindex'] = pred_withzone.index
# pred_withzone['rank'] = pred_withzone['newindex'].rank(pct=True)
# pred_withzone['flag'] = np.where(pred_withzone['rank'] < global_distribution_df.iloc[0,2], 4,
#                         np.where(pred_withzone['rank'] < global_distribution_df.iloc[1,2], 3,
#                                 np.where(pred_withzone['rank'] < global_distribution_df.iloc[2,2], 2,
#                                         np.where(pred_withzone['rank'] < global_distribution_df.iloc[3,2], 1, 0))))

# # merge the output from rules and the shape modules giving precedence to rules
# df = df.merge(pred_withzone[['global_id', 'flag']], on=['global_id'], how='left')
# df['rules_prediction'] = df['rules_prediction'].map(dep_dict_without1B)
# df['predictions'] = np.where(df['rules_prediction'].isna(), df['flag'], df['rules_prediction'])
# df['predictions'] = df['predictions'].astype(int)
# df.drop(columns=['rules_prediction', 'flag'], inplace=True)
# df['predictions'] = df['predictions'].map(rev_dep_dict_without1B)

## RULES again

In [39]:
# %run -i ./scripts/rules_shape_rules.py

# df = apply_rules(dset=df.copy(), prev_prev_opr_col='prev_prev_opr', prev_opr_col='prev_opr',
#                 pred_col='predictions', tib_col='emp_time_in_band1', mei_col='engagement_score',
#                 ta_col='net_target', mean_ca_col='mr_pers_compgroup_year_comp_score_mean_leadership_competencies',
#                 iteration='second')

# final_prediction_df = df.copy()
# final_prediction_df['predictions'] = np.where(final_prediction_df['rules_prediction'].isna(),
#                                              final_prediction_df['predictions'],
#                                              final_prediction_df['rules_prediction'])
# final_prediction_df = final_prediction_df[['global_id', 'zone', 'predictions']]

# FIN

In [474]:
# write the final (rules + shape) predictions as CSV; the handle is binary,
# so the text is encoded first, and the context manager closes it
# (no explicit close() needed)
with adlsfsc.open(path + '/2019/Data/Output_Data/ml_output/final_predictions_df.csv', 'wb') as f:
    fpdf_str = final_prediction_df.to_csv()
    f.write(fpdf_str.encode('utf-8'))