# Final_ADS_preparation.ipynb
> Project: **ABI Turnover**  
> Turnover Process Phase: **2**  
> Author: **Varun V**  
> Location: **GCC**  
> Team: **People Analytics**

## Importing the packages

In [3]:
## Imports and notebook-wide configuration

# optional: clear the workspace before running (left disabled)
#%reset -f

# print the list of files in the current working directory
import os
print('The files in the folder are: ', os.listdir())

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect, re
import itertools as itertools

randomseed = 1 # seed value intended for the random_state arguments used in the pipeline
pd.options.display.max_rows = 1000 # show up to 1000 rows in cell output instead of the truncated view
pd.options.display.max_columns = 200 # show up to 200 columns in cell output

# display the value of every top-level expression in a cell, not only the last one,
# so multiple outputs appear without explicit print/display calls
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Prepare the train and validation dataset with the labels

- The turnover files ('16 and '17)
    - returns turnover dataframe with leavers and their termination date
    - returns the unique list of leavers
- The raw ads files (from June16 to Dec17 monthly files)
    - all of them are aggregated
    - split into two (active/leavers)
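    - the `m` most recent records of each leaver (default `m`=6) -> label=1
    - `n` random records per employee (default `n`=1), drawn from the records of active employees and from leavers' records older than the last `m` -> label=0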
    
**P.S.:** This module applies only when you have monthly datasets. If you have yearly datasets, this module is not needed: add the labels and use the datasets directly in the next notebook (`Turnover_modelling_FW.ipynb`).

In [5]:
%fs

ls /mnt/datalake/OUTPUT/MONTHLY_DATASETS/

path,name,size
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201605.csv,201605.csv,3736913
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201606.csv,201606.csv,3697835
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201607.csv,201607.csv,4364859
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201608.csv,201608.csv,4282142
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201609.csv,201609.csv,4618496
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201610.csv,201610.csv,4609998
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201611.csv,201611.csv,4554487
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201612.csv,201612.csv,4439758
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201701.csv,201701.csv,4316376
dbfs:/mnt/datalake/OUTPUT/MONTHLY_DATASETS/201702.csv,201702.csv,4282068


In [6]:
class prepare_ads:
  """Create the labelled TRAIN and VALID dataframes from the raw monthly datasets.

  Instantiating the class runs the whole pipeline:

  1. ``to_read_all`` reads the turnover file and derives the leaver ids for the
     training window and for the validation window.
  2. ``read_all`` stacks the monthly files of the training window, labels the
     ``m`` most recent records of every training leaver with 1, draws ``n``
     random label-0 records per employee from the remaining records, and labels
     the validation month file.
  3. ``self.train`` and ``self.valid`` are written as CSV files to
     ``/dbfs/mnt/datalake/OUTPUT/``.

  Parameters
  ----------
  n : int
      Number of random label-0 records drawn per employee (with replacement).
  m : int
      Number of most recent records of a leaver that receive label 1.
  train_start, train_threshold : int
      Yearmonth bounds (e.g. 201601) of the training window. Both bounds are
      exclusive: a monthly file is used when train_start < yearmonth < train_threshold.
  valid_month : int
      Yearmonth of the monthly file used as the validation dataset.
  to_threshold : int
      Validation leavers are those whose termination yearmonth is
      > valid_month and <= to_threshold.
  random_state : int or None
      Seed for the label-0 sampling. The default (1) makes the TRAIN file
      reproducible; pass None for a different draw on every run.
  """

  def __init__(self, n=1, m=6, train_start=201601, train_threshold=201801, valid_month=201801, to_threshold=201806,
               random_state=1):
    # month-name lookups; only needed by the alternative naming convention
    # that is kept as commented-out code in to_read_all / read_all
    self.dict_of_months = {'jan': 'January', 'feb': 'February', 'mar': 'March', 'april': 'April', 'may': 'May', 'june': 'June', 'july': 'July',
                      'aug': 'August', 'sep': 'September', 'oct': 'October', 'nov': 'November', 'dec': 'December'}
    self.dict_of_months_ids = {'jan': 1, 'feb': 2, 'mar': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'aug': 8,
                          'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    self.n = n
    self.m = m
    self.trainstart = train_start
    self.trainthreshold = train_threshold
    self.validmonth = valid_month
    self.tothreshold = to_threshold
    self.random_state = random_state

    self.to_read_all()
    self.read_all()

    # the file names encode the parameters that produced them
    train_name = 'TRAIN_' + str(train_threshold) + '_' + str(n) + '_' + str(m) + '.csv'
    valid_name = 'VALID_' + str(valid_month) + '_' + str(to_threshold) + '.csv'
    self.train.to_csv('/dbfs/mnt/datalake/OUTPUT/' + train_name, index=False)
    self.valid.to_csv('/dbfs/mnt/datalake/OUTPUT/' + valid_name, index=False)

  # turnover file creation function
  def to_read_all(self):
    """Read the turnover file and derive the leaver ids.

    Sets ``self.to`` (filtered turnover frame, one row per employee, with the
    key columns renamed to match the monthly datasets), ``self.to_train_ids``
    (termination yearmonth < train_threshold) and ``self.to_valid_ids``
    (valid_month < termination yearmonth <= to_threshold).
    """
    # psg bands to filter
    psg_bands = ['vi_b', 'vii_a', 'vii_b', 'v_a', 'viii_a', 'iii_a', 'v_b', 'vi_a', 'iv_b',
   'x_a', 'iv_a', 'viii_b', 'ii_b', 'ii_a', 'ix_b', 'ix_a', 'iii_b', 'i_b', 'i_a', 'x_b', 'xi_b', 'xi_a']
    # reasons for action that are kept as turnover
    leaving_reasons = ['resignation_general', 'resignation _ personal reason', 'resignation _ others',
                       'mutual agreement', 'early retirement', 'resign _ career growth_opportu']

    # read in the turnover main file
    to = pd.read_csv('/dbfs/mnt/datalake/OUTPUT/turnover_labels_df.csv')
    to = to[['employee_id', 'pay_scale_group', 'name_of_action_type', 'name_of_reason_for_action', 'termination_date_yearmonth', 'year']]
    keep = to['name_of_reason_for_action'].isin(leaving_reasons) & to['pay_scale_group'].isin(psg_bands)
    # below commented lines to be used for another context where you have a different naming convention. use as & when necessary
    #to['term_year'] = to['termination_date'].dt.year
    #to['term_month'] = to['termination_date'].dt.month_name()
    #to['term_monthid'] = to['termination_date'].dt.month

    # one row per employee: the earliest termination yearmonth wins.
    # Chained (not in-place) so we never mutate a filtered slice.
    to = (to.loc[keep]
            .sort_values(by=['employee_id', 'termination_date_yearmonth'])
            .drop_duplicates(subset='employee_id')
            .reset_index(drop=True))

    term = to['termination_date_yearmonth']
    to_train_ids = to.loc[term < self.trainthreshold, 'employee_id'].unique()
    to_valid_ids = to.loc[(term > self.validmonth) & (term <= self.tothreshold), 'employee_id'].unique()

    # align the key column names with the monthly datasets
    to = to.rename(columns={'employee_id': 'employee_personnal_number_pa', 'termination_date_yearmonth': 'yearmonth'})

    self.to = to
    self.to_train_ids = to_train_ids
    self.to_valid_ids = to_valid_ids
    return None

  # final ads creation function to aggregate all the monthly files
  # n = number of random records to be taken from the pool of records with label=0
  # m = number of recent records to be taken from the pool of records of leavers and given label=1
  def read_all(self):
    """Aggregate the monthly files and attach the labels.

    Sets ``self.train`` (label-1 records of training leavers followed by the
    sampled label-0 records) and ``self.valid`` (validation month file with a
    ``label`` column).

    Raises
    ------
    ValueError
        If no monthly file falls inside the training window.
    """
    key_cols = ['employee_personnal_number_pa', 'yearmonth']
    monthly_dir = '/dbfs/mnt/datalake/OUTPUT/MONTHLY_DATASETS/'

    ## reading in the multiple month level files and mapping the labels based on the to files
    files = dbutils.fs.ls('/mnt/datalake/OUTPUT/MONTHLY_DATASETS/')
    # dict.fromkeys de-duplicates while keeping the listing order
    month_keys = list(dict.fromkeys(f.name.replace('.csv', '') for f in files))
    # skip anything that is not a <yearmonth>.csv file instead of crashing in int()
    month_keys = [k for k in month_keys if k.isdigit()]
    train_list = [k for k in month_keys if self.trainstart < int(k) < self.trainthreshold]
    if not train_list:
      raise ValueError('No monthly dataset found with ' + str(self.trainstart) + ' < yearmonth < ' + str(self.trainthreshold))

    # append the (multiple) datasets into one TRAIN master set
    monthly_frames = []
    for key in train_list:
      frame = pd.read_csv(monthly_dir + key + '.csv')
      frame['yearmonth'] = int(key)
      # below commented lines to be used for another context where you have a different naming convention. use as & when & if necessary
      #frame['term_year'] = int(key)
      #frame['term_month_temp'] = frame['month_file'].apply(lambda s:s.split('_')[0])
      #frame['term_month'] = frame['term_month_temp'].map(self.dict_of_months)
      #frame['term_monthid'] = frame['term_month_temp'].map(self.dict_of_months_ids)
      monthly_frames.append(frame)
    df_all = pd.concat(monthly_frames, ignore_index=True)
    df_all['employee_personnal_number_pa'] = df_all['employee_personnal_number_pa'].astype(int)

    # create the VALID dataset: label=1 for employees leaving inside the validation window
    dfvalid = pd.read_csv(monthly_dir + str(self.validmonth) + '.csv')
    dfvalid['yearmonth'] = self.validmonth
    dfvalid['label'] = dfvalid['employee_personnal_number_pa'].isin(self.to_valid_ids).astype(int)

    # the m most recent (employee, yearmonth) keys per employee; those belonging to
    # training leavers become label=1. Keys are de-duplicated so the merge below
    # can never multiply rows.
    recent_keys = (df_all[key_cols]
                   .sort_values(by='yearmonth', ascending=False)
                   .groupby('employee_personnal_number_pa')
                   .head(self.m))
    positive_keys = (recent_keys[recent_keys['employee_personnal_number_pa'].isin(self.to_train_ids)]
                     .drop_duplicates(subset=key_cols)
                     .assign(label=1))

    df_all_new = pd.merge(df_all, positive_keys, how='left', on=key_cols)
    # assign the result back: an in-place fillna on a selected column is a chained
    # assignment and does not reliably update the frame
    df_all_new['label'] = df_all_new['label'].fillna(0).astype(int)
    df_all_new = df_all_new.sort_values(by=key_cols)

    is_positive = df_all_new['label'] == 1
    df_pos_labels = df_all_new[is_positive]
    df_neg_pool = df_all_new[~is_positive]

    # n random label-0 records per employee. Sampling is with replacement so that
    # employees with fewer than n records do not raise. One shared RandomState
    # gives a reproducible draw without reusing the same seed for every group.
    rng = np.random.RandomState(self.random_state)
    sampled = [group.sample(self.n, replace=True, random_state=rng)
               for _, group in df_neg_pool.groupby('employee_personnal_number_pa')]
    df_neg_labels = pd.concat(sampled, ignore_index=True) if sampled else df_neg_pool.iloc[0:0]

    self.train = pd.concat([df_pos_labels, df_neg_labels], axis=0, ignore_index=True)
    self.valid = dfvalid
    return None

In [7]:
# the mother class

class final_ads:
  """Driver class: builds the parameter grid and runs prepare_ads for every kept combination.

  The grid is the cartesian product of the ranges defined in __init__. Only the
  combinations whose validation timeframe is consistent with m are processed
  (see main). The processed combinations are stored in ``self.grid``.
  """

  def __init__(self):
    self.n_range = np.arange(1, 3, dtype=int)
    self.m_range = np.arange(2, 7, dtype=int)
    self.trainthresh_range = [201801]
    self.validmonth_range = np.arange(201801, 201809, dtype=int)
    self.tothresh_range = np.arange(201802, 201812, dtype=int)
    self.trainstart_range = [201601]

    # the order of the axes fixes the column order used in main()
    grid_axes = (self.n_range, self.m_range, self.trainstart_range,
                 self.trainthresh_range, self.validmonth_range, self.tothresh_range)
    self.product = list(itertools.product(*grid_axes))
    self.main()

  def main(self):
    """Filter the grid held in self.product and call prepare_ads once per remaining row."""
    column_names = ['nrange', 'mrange', 'trainstart', 'trainthresh', 'validmonth', 'turnoverthresh']
    full_grid = pd.DataFrame(self.product, columns=column_names)

    # keep a combination only when the turnover threshold lies more than one step
    # after the validation month AND the (plain integer) difference between the
    # two yearmonth codes equals m
    window = full_grid['turnoverthresh'] - full_grid['validmonth']
    is_consistent = (full_grid['turnoverthresh'] > (full_grid['validmonth'] + 1)) & (window == full_grid['mrange'])
    selected = full_grid[is_consistent].reset_index(drop=True)

    # prepare_ads keyword -> grid column that supplies its value
    keyword_to_column = {
      'n': 'nrange',
      'm': 'mrange',
      'train_start': 'trainstart',
      'train_threshold': 'trainthresh',
      'valid_month': 'validmonth',
      'to_threshold': 'turnoverthresh',
    }
    for row_label in selected.index:
      call_kwargs = {keyword: selected.at[row_label, column] for keyword, column in keyword_to_column.items()}
      prepare_ads(**call_kwargs)

    self.grid = selected
    return None

In [8]:
# run the whole grid: instantiating final_ads calls prepare_ads for every kept combination;
# the processed combinations are available afterwards in final_grid.grid
final_grid = final_ads()

## Output

- The ***TRAIN*** and ***VALID*** datasets are written as flat CSV files (`TRAIN_<train_threshold>_<n>_<m>.csv` and `VALID_<valid_month>_<to_threshold>.csv`) to `/dbfs/mnt/datalake/OUTPUT/`

This marks the completion of the **Final_ADS_preparation.ipynb** notebook.