# GLOBAL OPR PREDICTION

- Scope: All global banded employees (bands 0-5)
- Zones: All zones (and ZXV)
- ML perspective:
    - multiclass (main)
    - regression (discontinued)
- ML process: xgboost/catboost
- Feature perspective:
    - opr as response
    - historical opr
    - all blueprint, competency, target, movement features
    - train=2017
    - valid=2018
    - deployed model = 2018 or 2017+2018

# INITIALIZATION

In [1]:
%reset -f

import sys, pandas as pd, numpy as np, inspect, re as re, functools as functools, pickle, glob, warnings, os

from tqdm import tqdm

# sklearn packages
import sklearn.metrics as skm

# session-wide options/variables
randomseed = 1 # the value for the random state used at various points in the pipeline
pd.options.display.max_rows = 50 # maximum number of rows pandas shows in a cell before truncating the output
pd.options.display.max_columns = 200 # maximum number of columns pandas shows before truncating the output

# to display multiple outputs in a cell without using print/display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# append the scripts path to pythonpath so the custom modules imported below can be found
sys.path.append('./scripts/')

# ignore warnings (only if you are the kind that would code when the world is burning)
# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

# plot inline
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# # import the various ml modules
import xgboost as xgb

############################################## import the custom modules ################################
import helperfuncs as helper
import feateng as fte
import misc as miscfun
import oprfunctions as oprfun
from misc import ce_encodings, DataFrameImputer, scalers
from oprfunctions import demo_fn, salary_process

# instantiate one object per custom-module class (helperfuncs, feateng, misc);
# these names are module-level globals of the notebook session
helpers = helper.helper_funcs()
cust_funcs = fte.custom_funcs()
feat_sel = miscfun.feat_selection()

#############################################################################################################
# global function to flatten columns after a grouped operation and aggregation
# outside all classes since it is added as an attribute to pandas DataFrames
def __my_flatten_cols(self, how="_".join, reset_index=True):
    """Collapse MultiIndex column labels of a DataFrame into flat labels.

    The frame is modified in place (its ``columns`` attribute is reassigned).

    Parameters
    ----------
    how : callable or "last"
        Callable that receives an iterator over the non-empty, stringified
        level values of one column and returns the flat label.  The string
        ``"last"`` is a shortcut that keeps only the last non-empty level.
        Defaults to joining the levels with ``"_"``.
    reset_index : bool
        When true, return ``self.reset_index(drop=True)`` (a new frame whose
        index is discarded); otherwise return ``self``.

    Returns
    -------
    pandas.DataFrame
        The frame with flattened column labels.
    """
    def _keep_last(parts):
        return list(parts)[-1]

    combine = _keep_last if how == "last" else how

    flat_labels = self.columns
    if isinstance(flat_labels, pd.MultiIndex):
        # empty level values are dropped before the levels are combined
        flat_labels = [
            combine(part for part in map(str, levels) if part)
            for levels in flat_labels.values
        ]
    self.columns = flat_labels

    if reset_index:
        return self.reset_index(drop=True)
    return self


# expose the helper as a method on every pandas DataFrame
pd.DataFrame.my_flatten_cols = __my_flatten_cols

Using TensorFlow backend.


In [2]:
%run -i ./scripts/dicts_cols.py

## SETUP THE ADLS

In [3]:
import os

from azure.datalake.store import core, lib, multithread

# Service-principal identifiers used to authenticate against Azure Data Lake Store.
tenant = 'cef04b19-7776-4a94-b89b-375c77a8f936'
resource = 'https://datalake.azure.net/'
client_id = 'e9aaf06a-9856-42a8-ab3c-c8b0d3a9b110'

# SECURITY: the client secret must not live in the notebook source. It is read
# from the environment instead; the value that was previously committed here
# should be treated as compromised and rotated.
client_secret = os.environ.get('ADLS_CLIENT_SECRET')
if not client_secret:
    raise RuntimeError(
        "Set the ADLS_CLIENT_SECRET environment variable to the service "
        "principal's client secret before running this cell."
    )

# obtain the credentials object used by the file-system client below
adlcreds = lib.auth(tenant_id = tenant,
                   client_secret = client_secret,
                   client_id = client_id,
                   resource = resource)

subs_id = '73f88e6b-3a35-4612-b550-555157e7059f'
adls = 'edhadlsanasagbdev'

# file-system client bound to the `adls` store
adlsfsc = core.AzureDLFileSystem(adlcreds, store_name=adls)

# root folder on the store; later cells append relative paths to it
path = '/root/anasandbox/people/opr10x/'

## OPR files initialization

In [4]:
# %run -i ./scripts/opr_script.py

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Miscellaneous/opr_backup_17to18.pickle', 'rb') as f:
#     opr_reshaped = pickle.load(f)
#     f.close()

In [5]:
# Load `opr_reshaped` from the local pickle backup. The context manager closes
# the handle even if unpickling raises.
# NOTE: only unpickle files from a trusted location - pickle can execute code on load.
with open('E:/ADLS/pickles/opr_backup_17to18.pickle', 'rb') as oprfull:
    opr_reshaped = pickle.load(oprfull)

## BluePrint

In [6]:
# %run -i ./scripts/blueprint_script.py

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Blueprint/bp_backup_16to19_processed.pickle', 'rb') as f:
#     bp_full = pickle.load(f)
#     f.close()

In [7]:
# Load `bp_full` from the local pickle backup. The context manager closes the
# handle even if unpickling raises.
with open('E:/ADLS/pickles/bp_backup_16to19_processed.pickle', 'rb') as bpfull:
    bp_full = pickle.load(bpfull)

## MISC

### Features
- MRS features (compare ratio)
- Salary
- TeamSize and TeamSize_Difference

In [8]:
%run -i ./scripts/misc_script.py

(21975, 10)


## Competency

In [9]:
# %run -i ./scripts/competency_script_new.py

# compfull = open('E:/ADLS/pickles/competency_16to18_raw.pickle', 'rb')
# comp_full = pickle.load(compfull)
# compfull.close()

# Load `comp_full` from the processed competency pickle. The context manager
# closes the handle even if unpickling raises.
with open('E:/ADLS/pickles/competency_16to18_processed.pickle', 'rb') as compfull2:
    comp_full = pickle.load(compfull2)

In [10]:
# %run -i ./scripts/comp_2019.py

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Navigate/competency_full_2019.pickle', 'rb') as f:
#     comp2019 = pickle.load(f)
#     f.close()

# Load `comp2019` from the local pickle. The context manager closes the handle
# even if unpickling raises.
with open('E:/ADLS/pickles/competency_full_2019.pickle', 'rb') as comp_2019:
    comp2019 = pickle.load(comp_2019)

## Target

In [11]:
# %run -i ./scripts/target_script.py

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Miscellaneous/target_backup.pickle', 'rb') as f:
#     tar_reshaped = pickle.load(f)
#     f.close()

# Load `tar_reshaped` from the local pickle backup. The context manager closes
# the handle even if unpickling raises.
with open('E:/ADLS/pickles/target_backup.pickle', 'rb') as tarpkl:
    tar_reshaped = pickle.load(tarpkl)

## Movement

### Features
- time in band
- career velocity features

In [12]:
# %run -i ./scripts/movements_script.py
# %run ../working/Career_Velocity_Metric_eg_2018_used.ipynb

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Movements/career_velocity.pkl', 'rb') as f:
#     cv_full = pickle.load(f)
#     pv_full = pickle.load(f)
#     tib_full = pickle.load(f)
#     f.close()
    
# The file is read with three consecutive pickle.load calls, so the objects
# must be loaded in this exact order. The context manager closes the handle
# even if one of the loads raises.
with open('E:/ADLS/pickles/career_velocity.pkl', 'rb') as cvpkl:
    cv_full = pickle.load(cvpkl)
    pv_full = pickle.load(cvpkl)
    tib_full = pickle.load(cvpkl)

## NAVIGATE DATA and related

### Features
- BELTS
- CAREER ASPIRATIONS
- COMPETENCY APPRAISAL
- GMT-GMBA
- GTP / PeopleBets / ZTP
- EDUCATION (nothing for now)
- ENGAGEMENT (not present for 2016)
- PDP (a lot more work is needed here: time since PDP and related features)
- TURNOVER/BUSINESS IMPACT
- MOBILITY

In [13]:
# %run -i ./scripts/navigate_script.py

# with adlsfsc.open(path + '/2019/Data/Raw_Data/pickle_files/Navigate/navigate.pkl', 'rb') as f:
#     belts_grp = pickle.load(f)
#     tp_full = pickle.load(f)
#     engfull = pickle.load(f)
#     pdp_full = pickle.load(f)
#     f.close()

# The file is read with four consecutive pickle.load calls, so the objects
# must be loaded in this exact order. The context manager closes the handle
# even if one of the loads raises.
with open('E:/ADLS/pickles/navigate.pickle', 'rb') as navigate:
    belts_grp = pickle.load(navigate)
    tp_full = pickle.load(navigate)
    engfull = pickle.load(navigate)
    pdp_full = pickle.load(navigate)

## Org Chart Features

### Features
- Manager summary - DR
- Manager summary - FS
- Team average Tenure

In [14]:
%run -i ./scripts/org_chart_features.py

(7005, 102)
(6977, 102)


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

(24982, 7)


 25%|█████████████████████                                                               | 1/4 [00:04<00:12,  4.03s/it]

(31819, 7)


 50%|██████████████████████████████████████████                                          | 2/4 [00:08<00:08,  4.29s/it]

(36929, 7)


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:17<00:05,  5.59s/it]

(29330, 7)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:26<00:00,  6.46s/it]


## ADS creation

List of dataframes to be merged
- opr_reshaped
- bp_full
- teamsize_and_diff_df
- comp_full
- tar_reshaped
- cv_full, pv_full, tib_full
- belts_grp, gmt, gmba, tp_full, engfull
- mngrkpis_fs, mngrkpis_dr, tt_full

In [15]:
%run -i ./scripts/ads_prepare_new.py

(31, 2)
ads size before zone filter:  (23425, 62)
ads size after zone filter:  (21750, 62)


In [16]:
# save backup - the context manager flushes and closes the file even if the
# dump fails, so the reload below never reads from a still-open handle
with open('E:/ADLS/pickles/ads_backup.pkl', 'wb') as ads_backup:
    pickle.dump(ads, ads_backup)

# load backup
with open('E:/ADLS/pickles/ads_backup.pkl', 'rb') as ads_backup:
    ads = pickle.load(ads_backup)

In [17]:
# Write `ads` as a pickle to the data lake. The `with` block closes the remote
# handle on exit, so no explicit close() is needed.
with adlsfsc.open(path + '/2019/Data/Output_Data/ads/final_ads.pickle', 'wb') as f:
    pickle.dump(ads, f)
    
# with adlsfsc.open(path + '/2019/Data/Output_Data/ads/final_ads.pickle', 'rb') as f:
#     ads = pickle.load(f)
#     f.close()

In [18]:
# Serialise first, so the remote file is only opened for writing once the CSV
# text has been produced successfully.
ads_str = ads.to_csv()

# The handle is opened in binary mode, hence the explicit UTF-8 encoding. The
# `with` block closes the handle on exit, so no explicit close() is needed.
with adlsfsc.open(path + '/2019/Data/Output_Data/ads/final_ads.csv', 'wb') as f:
    f.write(ads_str.encode('utf-8'))

5141420

In [19]:
# display the column labels of the `ads` DataFrame
ads.columns

Index(['global_id', 'employee_band', 'ebm_level', 'year', 'function',
       'mr_pers_compgroup_year_comp_score_mean_functional_competencies',
       'mr_pers_compgroup_year_comp_score_mean_leadership_competencies',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_develop_people',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_dream_big',
       'mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_live_our_culture',
       'net_target', 'opr', 'teamsize', 'teamsize_delta', 'index_average',
       'position_velocity', 'emp_time_in_band1', 'count_of_belts',
       'talentpool_renomination', 'talentpool', 'engagement_score',
       'manager_effectiveness_score', 'fs_prom', 'fs_ho', 'fs_adherant_perc',
       'fs_to_overall', 'dr_prom', 'dr_ho', 'dr_adherant_perc',
       'dr_to_overall', 'mean_team_tenure', 'lc_count', 'fc_count',
       'position_tenure', 'zone', 'target_delta', 'prev_opr', 'prev_prev_opr'],
      dtype='object')

In [20]:
# preview the first five rows of the `ads` DataFrame
ads.head(5)

Unnamed: 0,global_id,employee_band,ebm_level,year,function,mr_pers_compgroup_year_comp_score_mean_functional_competencies,mr_pers_compgroup_year_comp_score_mean_leadership_competencies,mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_develop_people,mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_dream_big,mr_pers_compgroupl1_year_comp_score_mean_leadership_competencies_live_our_culture,net_target,opr,teamsize,teamsize_delta,index_average,position_velocity,emp_time_in_band1,count_of_belts,talentpool_renomination,talentpool,engagement_score,manager_effectiveness_score,fs_prom,fs_ho,fs_adherant_perc,fs_to_overall,dr_prom,dr_ho,dr_adherant_perc,dr_to_overall,mean_team_tenure,lc_count,fc_count,position_tenure,zone,target_delta,prev_opr,prev_prev_opr
0,1001929,11,-1,2016,sales,3.0,3.06911,3.07739,3.00571,3.086,96.0,1.0,90,0.0,-530.48,148.995,10.4975,,,,,,,,,,,,,,4.074315,,,3834.0,ZONE EUROPE,,1.0,
1,1001929,11,-1,2017,sales,3.0,3.16947,3.04333,3.154,3.27375,76.0,0.0,72,-18.0,-590.49,154.99,11.496667,,,,52.2222,95.0,0.0,0.0,,0.0,0.0,0.0,,0.0,5.074315,1.0,1.0,4199.0,ZONE EUROPE,-20.0,0.0,1.0
2,1001929,11,-1,2018,sales,3.0,3.449,3.38667,3.47167,3.47875,100.0,0.0,166,0.0,-650.5,160.985,12.495833,,,,80.3571,97.5,0.0,0.0,,0.0,0.0,0.0,,0.0,6.418395,1.0,1.0,4564.0,ZONE EUROPE,24.0,0.0,0.0
3,1001929,11,-1,2019,sales,2.714286,2.86111,2.66667,2.83333,3.08333,100.0,,160,-6.0,-700.5,165.95,13.328333,,,,46.875,90.0,0.0,0.0,,0.0,0.0,0.0,,0.0,4.807877,1.0,1.0,4929.0,ZONE EUROPE,0.0,0.0,0.0
4,1002651,9,3,2016,supply,3.0,3.265,3.17455,3.4,3.305,52.0,2.0,20,0.0,-29.68,35.888333,0.249167,,,,,,,,,,,,,,3.056963,,,91.0,ZONE EUROPE,,2.0,
