In [1]:
%reset -f
import pandas as pd
import numpy as np
import datetime as dt

#### Individual Variables
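-	Individual’s salary for the reporting year, converted to EUR (`IND_SAL`)
-	Mean salary of the individual’s peers (`IND_peer_mean`)
-	Whether the individual’s salary is at or above the peer mean (`is_IND_SALary_over_peer_mean`)
-	Number of months the individual has been on the same salary (`months_in_same_salary`)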

#### Manager Variables
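-	Manager’s salary for the reporting year, converted to EUR (`MANAGER_IND_SAL`)
-	Mean salary of the manager’s peers (`MANAGER_IND_peer_mean`)
-	Whether the manager’s salary is at or above the peer mean (`MANAGER_is_IND_SALary_over_peer_mean`)
-	Number of months the manager has been on the same salary (`MANAGER_months_in_same_salary`)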

Note: *Peers are defined as employees in the same band (`Pay Scale Group`) who report to the same manager (`MGR Person ID`).*

In [2]:
# main class #

class sal_feats:
    """Builds salary features for employees and their managers.

    The class holds no instance state; every public method is a static
    method, so both ``sal_feats.return_feats(...)`` and
    ``sal_feats().return_feats(...)`` work.

    Expected input columns (as read by the code below):

    * head-count CSV: ``Global ID``, ``MGR Person ID``, ``Pay Scale Group``,
      ``Month_Reported`` (first CSV column is treated as an index and dropped).
    * salary CSV: ``Global ID``, ``Start Date``, ``End Date``,
      ``Annual Salary``, ``Currency``.
    """

    # Multiplier that converts one unit of the given currency into EUR.
    CURRENCY_TO_EUR = {'BEF': 0.024789,
                       'CZK': 0.037008,
                       'EUR': 1,
                       'GBP': 1.167979,
                       'HUF': 0.003228,
                       'JPY': 0.008104,
                       'KZT': 0.002844,
                       'RUB': 0.015552,
                       'UAH': 0.035007}

    # Year substituted for the '9999' sentinel year found in salary dates.
    # NOTE(review): presumably '9999' marks an open-ended record and is capped
    # because it overflows nanosecond-resolution timestamps - confirm.
    OPEN_ENDED_YEAR = 2018

    def __init__(self):
        """Nothing to initialise; the class only groups static helpers."""

    @staticmethod
    def _read_headcount(hc_path):
        """Read the head-count CSV, discarding its first (index) column."""
        return pd.read_csv(hc_path, index_col=[0]).reset_index(drop=True)

    @staticmethod
    def _parse_salary_dates(dates):
        """Convert 'YYYY?MM?DD' values to timestamps, capping the 9999 sentinel.

        The year, month and day are taken from fixed character positions
        (0-4, 5-7, 8-10) of the string form of each value. A year of '9999'
        is replaced by ``OPEN_ENDED_YEAR``.

        Raises:
            ValueError: if a value does not contain integers at those
                positions or does not form a valid calendar date.
        """
        text = dates.astype(str)
        year = text.str[0:4]
        year = year.where(year != '9999', str(sal_feats.OPEN_ENDED_YEAR))
        parts = pd.DataFrame({'year': year.astype(int),
                              'month': text.str[5:7].astype(int),
                              'day': text.str[8:10].astype(int)})
        return pd.to_datetime(parts)

    @staticmethod
    def _load_salary(sal_path):
        """Read the salary CSV and return it with EUR amounts and parsed dates.

        Returns:
            DataFrame with columns ``Global ID``, ``Start Date``, ``End Date``
            and ``Euro Salary`` (annual salary converted to EUR, 2 decimals).

        Raises:
            KeyError: if a row's currency has no entry in ``CURRENCY_TO_EUR``.
        """
        raw = pd.read_csv(sal_path)

        rates = raw['Currency'].map(sal_feats.CURRENCY_TO_EUR)
        unmapped = rates.isna()
        if unmapped.any():
            # Fail loudly (as a plain dict lookup would) instead of silently
            # producing NaN salaries for unknown currencies.
            unknown = sorted(set(raw.loc[unmapped, 'Currency'].astype(str)))
            raise KeyError('No EUR conversion rate for currency code(s): '
                           + ', '.join(unknown))

        return pd.DataFrame({
            'Global ID': raw['Global ID'],
            'Start Date': sal_feats._parse_salary_dates(raw['Start Date']),
            'End Date': sal_feats._parse_salary_dates(raw['End Date']),
            'Euro Salary': (raw['Annual Salary'] * rates).round(2),
        })

    @staticmethod
    def _attach_salary(hc_master, salary):
        """Left-join salary records onto head-count rows, dropping unmatched rows.

        The join uses whatever columns the two frames have in common (pandas'
        default); rows that found no salary record (null ``Start Date``) are
        removed.
        """
        merged = pd.merge(hc_master, salary, how='left')
        return merged[merged['Start Date'].notnull()].copy()

    @staticmethod
    def prepare(hc_path, sal_path):
        """Load head-count and salary data and join them per employee.

        Args:
            hc_path: path of the head-count CSV.
            sal_path: path of the salary CSV.

        Returns:
            One row per (head-count row, salary record) pair for employees
            that have at least one salary record.
        """
        hc_master = sal_feats._read_headcount(hc_path)
        hc_master['Global ID'] = hc_master['Global ID'].astype(int)
        hc_master['Month_Reported'] = pd.to_datetime(hc_master['Month_Reported'])

        return sal_feats._attach_salary(hc_master, sal_feats._load_salary(sal_path))

    @staticmethod
    def IND_feats(hc_sal):
        """Compute salary features from a joined head-count/salary frame.

        Keeps only rows whose ``Month_Reported`` falls inside the salary
        record's ``[Start Date, End Date]`` window (and whose salary is
        non-null and non-zero), then adds:

        * ``IND_SAL`` - the EUR salary valid in the reported month;
        * ``IND_peer_mean/min/max/std/count`` - statistics of ``IND_SAL`` over
          rows sharing ``Pay Scale Group`` and ``MGR Person ID``;
        * ``is_IND_SALary_over_peer_mean`` - 1 if salary >= peer mean, else 0;
        * ``months_in_same_salary`` - number of retained rows with the same
          ``Global ID`` and ``Euro Salary``.

        The input frame is not modified.
        """
        # Work on a copy so the caller's frame does not gain an IND_SAL column.
        hc_sal = hc_sal.copy()

        in_window = ((hc_sal['Month_Reported'] >= hc_sal['Start Date']) &
                     (hc_sal['Month_Reported'] <= hc_sal['End Date']))
        # Outside the window the salary becomes 0 (or stays NaN), which the
        # filter below then removes.
        hc_sal['IND_SAL'] = np.where(in_window, hc_sal['Euro Salary'],
                                     hc_sal['Euro Salary'] * 0)

        active = hc_sal[hc_sal['IND_SAL'].notnull() & (hc_sal['IND_SAL'] != 0.00)]

        peer_keys = ['Pay Scale Group', 'MGR Person ID']
        peer_stats = (active.groupby(peer_keys)['IND_SAL']
                      .agg(['mean', 'min', 'max', 'std', 'count'])
                      .reset_index())
        peer_stats.columns = peer_keys + ['IND_peer_mean', 'IND_peer_min', 'IND_peer_max',
                                          'IND_peer_std', 'IND_peer_count']

        with_peers = pd.merge(active, peer_stats, how='left', on=peer_keys)
        with_peers['is_IND_SALary_over_peer_mean'] = np.where(
            with_peers['Euro Salary'] >= with_peers['IND_peer_mean'], 1, 0)

        tenure_keys = ['Global ID', 'Euro Salary']
        tenure = (with_peers.groupby(tenure_keys).size()
                  .to_frame('months_in_same_salary').reset_index())
        return pd.merge(with_peers, tenure, how='left', on=tenure_keys)

    @staticmethod
    def MGR_feats(hc_path, sal_path):
        """Compute the same salary features, restricted to managers.

        A manager is any employee whose ``Global ID`` appears in the
        ``MGR Person ID`` column of the head-count file.

        Args:
            hc_path: path of the head-count CSV.
            sal_path: path of the salary CSV.

        Returns:
            The ``IND_feats`` output computed on manager rows only.
        """
        hc_master = sal_feats._read_headcount(hc_path)

        manager_ids = set(hc_master['MGR Person ID'].unique())
        managers = hc_master.loc[hc_master['Global ID'].isin(manager_ids)].copy()
        managers['Global ID'] = managers['Global ID'].astype(int)
        # NOTE(review): this cast raises if a manager row has no manager of
        # its own (NaN MGR Person ID) - confirm that cannot occur in the data.
        managers['MGR Person ID'] = managers['MGR Person ID'].astype(int)
        managers['Month_Reported'] = pd.to_datetime(managers['Month_Reported'])

        hc_sal = sal_feats._attach_salary(managers, sal_feats._load_salary(sal_path))
        return sal_feats.IND_feats(hc_sal)

    @staticmethod
    def return_feats(hc_path, sal_path):
        """Return individual salary features joined with their manager's features.

        Manager columns are prefixed with ``MANAGER_``; each employee row is
        matched to its manager's row for the same ``Month_Reported``.

        Args:
            hc_path: path of the head-count CSV.
            sal_path: path of the salary CSV.

        Returns:
            The individual feature frame with the manager's features appended
            (left join, so employees without manager features are kept).
        """
        indfeats = sal_feats.IND_feats(sal_feats.prepare(hc_path, sal_path))

        # The manager's own ID becomes the join key against the employee's
        # 'MGR Person ID'; the reporting month must match as well.
        mgrfeats = (sal_feats.MGR_feats(hc_path, sal_path)
                    .add_prefix('MANAGER_')
                    .rename(columns={'MANAGER_Global ID': 'MGR Person ID',
                                     'MANAGER_Month_Reported': 'Month_Reported'}))

        return pd.merge(indfeats, mgrfeats, how='left',
                        on=['MGR Person ID', 'Month_Reported'])

In [None]:
# 2016 reporting year: build the salary feature table, then persist it as CSV.
all_feats_2016 = sal_feats.return_feats('HC_2016_Master.csv', 'Salary.csv')
all_feats_2016.to_csv('2016_sal_feats.csv', index=False)

# 2017 reporting year: same pipeline against the 2017 head-count master.
all_feats_2017 = sal_feats.return_feats('HC_2017_Master.csv', 'Salary.csv')
all_feats_2017.to_csv('2017_sal_feats.csv', index=False)