In [55]:
import pandas as pd
import json
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from sklearn.model_selection import train_test_split
import numpy as np

In [56]:
# Load the two input tables used throughout the notebook.
data = pd.read_csv('../data/output.csv')
employees = pd.read_csv('../data/hrdata_OSF.csv')

# Keep the first `density` value encountered for each sender; any later rows
# for the same sender are ignored.
density = {}
for row in data.to_dict('records'):
    density.setdefault(row['sender'], row['density'])

# One feature record per employee name (a repeated name keeps its last row).
# NOTE(review): the 'branch_id' field actually carries the sender's `density`
# value (0 when the employee never appears as a sender). The key name is kept
# as-is because the regression formulas further down refer to `branch_id`.
employees_dict = {
    emp['name']: {
        'acquired': emp['acquired'],
        'tenure': emp['tenure'],
        'title_status': emp['title_status'],
        'male': emp['male'],
        'rating': emp['rating'],
        'branch_id': density.get(emp['name'], 0),
    }
    for emp in employees.to_dict('records')
}

# Two trimmed copies of `data`: both drop the same four bookkeeping columns,
# `retention` additionally drops `innov` and `reformation` additionally drops `ret`.
retention = data.drop(columns=['Unnamed: 0', 'month', 'diff', 'n_emails', 'innov'])
reformation = data.drop(columns=['Unnamed: 0', 'month', 'diff', 'n_emails', 'ret'])

In [57]:
# Map each sender to its `ret` value. If a sender occurs in several rows, the
# value from the last row wins because later keys overwrite earlier ones.
retentions = {row['sender']: row['ret'] for row in retention.to_dict('records')}

In [58]:
# Map each sender to its `innov` value. If a sender occurs in several rows, the
# value from the last row wins because later keys overwrite earlier ones.
reformations = {row['sender']: row['innov'] for row in reformation.to_dict('records')}

In [59]:
# Collect one record per employee that has both a `ret` and an `innov` value.
# The records are the very dicts stored in `employees_dict`, so the three extra
# keys are added to those dicts in place.
df = []
for emp, record in employees_dict.items():
    if emp not in retentions or emp not in reformations:
        continue  # employee is missing at least one of the two outcomes
    record.update(ret=retentions[emp], innov=reformations[emp], name=emp)
    df.append(record)

In [None]:
# Display the assembled list of per-employee records (at this point `df` is
# still a plain Python list, not a DataFrame).
df

In [60]:
# Turn the list of per-employee records into a frame and replace every missing
# value with 0.
# NOTE(review): confirm that 0 is a sensible stand-in for missing values in
# every column before trusting the fitted coefficients.
df = pd.DataFrame(df).fillna(0)

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
# Both halves are full frames at this point: they still contain the `ret` and
# `innov` outcomes as well as the `name` column.
X_train, X_test = train_test_split(df, test_size=0.33, random_state=42)

In [61]:
# Training frames for the two models. Each one keeps its own outcome column
# (`ret` or `innov`) and drops the other outcome plus the `name` column.
# Only the retention frame gets an explicit `const` column; the formulas fitted
# further down do not reference `const`.
retentions_X_train = sm.add_constant(X_train.drop(columns=['innov', 'name']))
reformations_X_train = X_train.drop(columns=['ret', 'name'])

# The outcomes as standalone Series.
retentions_y_train = X_train['ret']
reformations_y_train = X_train['innov']

In [62]:
# Display the retention training frame (predictors, the added `const` column
# and the `ret` outcome).
retentions_X_train

Unnamed: 0,const,acquired,tenure,title_status,male,rating,branch_id,ret
69,1.0,0,17.645448,2,1,0.000,0.027345,0.77
127,1.0,0,4.098563,0,0,0.000,0.042891,0.73
27,1.0,0,4.599589,0,0,3.460,0.027345,0.86
150,1.0,0,1.752225,0,1,3.964,0.021898,0.79
124,1.0,0,4.599589,1,0,0.000,0.042891,0.67
...,...,...,...,...,...,...,...,...
71,1.0,0,5.796030,0,0,3.292,0.021898,0.68
106,1.0,0,12.199863,2,1,4.040,0.021898,0.69
14,1.0,0,3.457906,0,1,0.000,0.034722,0.90
92,1.0,0,7.460643,1,1,2.720,0.025918,0.74


In [63]:
# Test-side predictors. Both frames drop the two outcomes and the `name`
# column, so they end up with the same columns; no `const` column is added here.
retentions_X_test = X_test.drop(columns=['ret', 'innov', 'name'])
reformations_X_test = X_test.drop(columns=['ret', 'innov', 'name'])

# Held-out outcomes as standalone Series.
retentions_y_test = X_test['ret']
reformations_y_test = X_test['innov']

In [64]:
# Display the held-out `ret` values as a plain Python list.
retentions_y_test.to_list()

[0.68,
 0.77,
 0.76,
 0.71,
 0.67,
 0.94,
 0.78,
 0.5,
 0.72,
 0.71,
 0.92,
 0.79,
 0.92,
 0.88,
 0.95,
 0.92,
 0.71,
 0.83,
 0.72,
 0.9,
 0.74,
 0.97,
 0.9,
 0.85,
 0.88,
 0.82,
 0.74,
 0.47,
 0.74,
 0.77,
 0.73,
 0.72,
 0.74,
 0.73,
 0.86,
 0.78,
 0.94,
 0.89,
 0.73,
 0.79,
 0.97,
 0.71,
 0.78,
 0.77,
 0.74,
 0.89,
 0.76,
 0.95,
 0.72,
 0.79,
 0.74,
 0.89,
 0.78]

In [65]:
# Ordinary least squares for `ret`, specified through the formula interface:
# the outcome and the six predictors are looked up by column name in the frame.
# NOTE(review): the recorded summary for this fit shows a near-zero coefficient
# for `acquired` and a condition number of 3.7e+17, which suggests `acquired`
# carries no variation in the training rows — confirm and consider dropping it.
olsmod = smf.ols(
    formula='ret ~ acquired + tenure + title_status + male + rating + branch_id',
    data=retentions_X_train,
)
olsres = olsmod.fit()

In [66]:
# Ordinary least squares for `innov`, using the same six predictors as the
# `ret` model; columns are looked up by name in the reformation training frame.
olsmod_innov = smf.ols(
    formula='innov ~ acquired + tenure + title_status + male + rating + branch_id',
    data=reformations_X_train,
)
olsres_innov = olsmod_innov.fit()

In [67]:
# Display the fit summary (coefficients, standard errors, fit diagnostics) for
# the `ret` model.
olsres.summary()

0,1,2,3
Dep. Variable:,ret,R-squared:,0.018
Model:,OLS,Adj. R-squared:,-0.031
Method:,Least Squares,F-statistic:,0.3692
Date:,"Wed, 10 Aug 2022",Prob (F-statistic):,0.869
Time:,10:46:03,Log-Likelihood:,87.49
No. Observations:,106,AIC:,-163.0
Df Residuals:,100,BIC:,-147.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7843,0.039,20.291,0.000,0.708,0.861
acquired,4.049e-16,1.62e-15,0.250,0.803,-2.81e-15,3.62e-15
tenure,0.0009,0.003,0.366,0.715,-0.004,0.006
title_status,-0.0034,0.017,-0.195,0.845,-0.037,0.031
male,-0.0103,0.025,-0.418,0.677,-0.059,0.039
rating,-0.0067,0.007,-0.998,0.321,-0.020,0.007
branch_id,0.0335,1.020,0.033,0.974,-1.991,2.058

0,1,2,3
Omnibus:,0.009,Durbin-Watson:,2.281
Prob(Omnibus):,0.996,Jarque-Bera (JB):,0.114
Skew:,-0.014,Prob(JB):,0.945
Kurtosis:,2.842,Cond. No.,3.7e+17


In [68]:
# Predict `ret` for the held-out rows and display the predictions.
# NOTE(review): the predictions are only displayed here; no comparison against
# `retentions_y_test` is visible in this part of the notebook.
ypred = olsres.predict(retentions_X_test)
ypred

78     0.786258
155    0.786305
128    0.757283
55     0.789130
94     0.784363
29     0.786188
147    0.786953
51     0.784987
98     0.792718
141    0.783842
19     0.766504
60     0.754361
15     0.788626
65     0.794898
24     0.776765
30     0.786006
126    0.767051
101    0.798477
96     0.757008
16     0.788501
151    0.775329
18     0.787456
12     0.754814
9      0.788952
31     0.785412
125    0.785624
95     0.783581
56     0.761923
145    0.769955
152    0.786617
135    0.756331
76     0.785005
75     0.759966
138    0.763054
2      0.789784
86     0.787245
45     0.791576
42     0.764206
68     0.788804
118    0.786504
26     0.776880
137    0.790787
146    0.783597
90     0.747468
66     0.762761
36     0.791872
82     0.755388
22     0.790066
85     0.770181
81     0.784139
112    0.749032
11     0.772428
109    0.789784
dtype: float64

In [69]:
# Predict `innov` for the held-out rows and display the predictions.
# NOTE(review): the predictions are only displayed here; no comparison against
# `reformations_y_test` is visible in this part of the notebook.
ypred_innov = olsres_innov.predict(reformations_X_test)
ypred_innov

78     0.101273
155    0.138871
128    0.171922
55     0.173539
94     0.158124
29     0.149331
147    0.147988
51     0.153555
98     0.135751
141    0.141392
19     0.153882
60     0.176924
15     0.145822
65     0.170717
24     0.149774
30     0.175934
126    0.181805
101    0.131641
96     0.174599
16     0.174444
151    0.155603
18     0.220207
12     0.148009
9      0.173220
31     0.153272
125    0.150005
95     0.125857
56     0.113333
145    0.178863
152    0.138422
135    0.170707
76     0.153529
75     0.196531
138    0.163299
2      0.157574
86     0.097115
45     0.112252
42     0.164882
68     0.174514
118    0.148963
26     0.140498
137    0.129590
146    0.155556
90     0.209002
66     0.160311
36     0.140907
82     0.176628
22     0.172697
85     0.150547
81     0.193601
112    0.150800
11     0.189155
109    0.157574
dtype: float64

In [70]:
# Display the held-out predictor frame passed to the `innov` model.
reformations_X_test

Unnamed: 0,acquired,tenure,title_status,male,rating,branch_id
78,0,5.664613,1,0,0.0,0.0
155,0,1.333333,0,0,0.0,0.021898
128,0,2.652977,2,0,3.56,0.027345
55,0,3.605749,0,0,0.0,0.042891
94,0,17.259411,2,1,0.0,0.027345
29,0,1.007529,0,0,0.0,0.027476
147,0,1.831622,0,0,0.0,0.027345
51,0,3.321013,1,0,0.0,0.027345
98,0,8.084873,0,0,0.0,0.02521
141,0,2.362765,1,0,0.0,0.019859
