In [141]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [80]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Day of the Week,Month,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Normal Weight,Overweight,Obese,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,2,7,289,36,33,239.554,0,0,1,0,1,1,1
1,0,0,0,0,2,7,118,13,50,239.554,0,0,1,0,1,0,0
2,0,0,0,1,3,7,179,51,38,239.554,0,0,1,0,0,0,0
3,1,0,0,0,4,7,279,5,39,239.554,1,0,0,0,1,0,1
4,0,0,0,1,4,7,289,36,33,239.554,0,0,1,0,1,1,0


splitting the inputs

In [81]:
unscaled_inputs = data_preprocessed.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Day of the Week,Month,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Normal Weight,Overweight,Obese,Education,Children,Pets
0,0,0,0,1,2,7,289,36,33,239.554,0,0,1,0,1,1
1,0,0,0,0,2,7,118,13,50,239.554,0,0,1,0,1,0
2,0,0,0,1,3,7,179,51,38,239.554,0,0,1,0,0,0
3,1,0,0,0,4,7,279,5,39,239.554,1,0,0,0,1,0
4,0,0,0,1,4,7,289,36,33,239.554,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,3,5,179,22,40,237.656,1,0,0,1,1,0
696,1,0,0,0,3,5,225,26,28,237.656,1,0,0,0,1,1
697,1,0,0,0,4,5,330,16,28,237.656,0,1,0,1,0,0
698,0,0,0,1,4,5,235,16,32,237.656,0,1,0,1,0,0


In [82]:
targets = list(data_preprocessed.iloc[:,-1])

with standardscaler we can standardize the values

In [83]:
scaler = StandardScaler()

In [84]:
scaler.fit(unscaled_inputs)

In [85]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [86]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.82870199,  1.29691647],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.82870199, -0.77105968],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -1.20670641, -0.77105968],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -1.20670641, -0.77105968],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -1.20670641, -0.77105968],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.82870199,  1.29691647]])

In [87]:
scaled_inputs.shape

(700, 16)

we classified most of the variables in the dataset so i'm going to be using a logistic regression for this case in particular as it is the best fit for categorical variables

In [88]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size= 0.8, random_state=20)

In [89]:
logreg = LogisticRegression()

In [90]:
logreg.fit(x_train, y_train)

In [91]:
logreg.score(x_train, y_train)

0.7785714285714286

not great, not terrible, we should look at the variables

In [92]:
names = unscaled_inputs.columns.values
names

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Day of the Week',
       'Month', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Normal Weight', 'Overweight', 'Obese',
       'Education', 'Children', 'Pets'], dtype=object)

we can get the coefficient, or weights, of the variables to look at the data

In [93]:
variables_table = pd.DataFrame(columns=['Name'], data=names)
variables_table['Coefficient'] = np.transpose(logreg.coef_)
variables_table

Unnamed: 0,Name,Coefficient
0,Reason 1,2.154799
1,Reason 2,0.350154
2,Reason 3,1.612093
3,Reason 4,1.414509
4,Day of the Week,-0.019982
5,Month,0.189042
6,Transportation Expense,0.867042
7,Distance to Work,0.023248
8,Age,-0.134078
9,Daily Work Load Average,-0.036191


we are still missing the intercept, or bias

In [94]:
variables_table.index = variables_table.index+1
variables_table

Unnamed: 0,Name,Coefficient
1,Reason 1,2.154799
2,Reason 2,0.350154
3,Reason 3,1.612093
4,Reason 4,1.414509
5,Day of the Week,-0.019982
6,Month,0.189042
7,Transportation Expense,0.867042
8,Distance to Work,0.023248
9,Age,-0.134078
10,Daily Work Load Average,-0.036191


we can add it now, but it will be at the bottom

In [95]:
variables_table.loc[0] = ['Intercept', logreg.intercept_[0]]
variables_table

Unnamed: 0,Name,Coefficient
1,Reason 1,2.154799
2,Reason 2,0.350154
3,Reason 3,1.612093
4,Reason 4,1.414509
5,Day of the Week,-0.019982
6,Month,0.189042
7,Transportation Expense,0.867042
8,Distance to Work,0.023248
9,Age,-0.134078
10,Daily Work Load Average,-0.036191


so we sort the table

In [96]:
variables_table = variables_table.sort_index()
variables_table

Unnamed: 0,Name,Coefficient
0,Intercept,-0.223554
1,Reason 1,2.154799
2,Reason 2,0.350154
3,Reason 3,1.612093
4,Reason 4,1.414509
5,Day of the Week,-0.019982
6,Month,0.189042
7,Transportation Expense,0.867042
8,Distance to Work,0.023248
9,Age,-0.134078


In [98]:
variables_table['Odds_Ratio'] = np.exp(variables_table.Coefficient)
variables_table.sort_values('Odds_Ratio', ascending=False)

Unnamed: 0,Name,Coefficient,Odds_Ratio
1,Reason 1,2.154799,8.626153
3,Reason 3,1.612093,5.013292
4,Reason 4,1.414509,4.114465
7,Transportation Expense,0.867042,2.37986
15,Children,0.422766,1.526178
2,Reason 2,0.350154,1.419286
6,Month,0.189042,1.208091
12,Overweight,0.170191,1.185532
13,Obese,0.052885,1.054309
8,Distance to Work,0.023248,1.02352


Here we can see which variables are more or less relevant, a variables is not important if:
* the coefficient is around 0
* the odds ratio is around 1
###### A weight close to 0 will null the variable and an odds ratio close to 1 means it will remain almost the same

In [107]:
irrelevant_variables = variables_table.loc[(variables_table['Coefficient'] > -0.05) & (variables_table['Coefficient']< 0.05)]
irrelevant_variables

Unnamed: 0,Name,Coefficient,Odds_Ratio
5,Day of the Week,-0.019982,0.980216
8,Distance to Work,0.023248,1.02352
10,Daily Work Load Average,-0.036191,0.964456


that's interesting, woulda thought distance to work and work load would way more considering work load directly affects the employee mentally

knowing this it would be safe to drop this columns and the accuracy of the model would remain the same

In [110]:
irrelevant_columns = list(irrelevant_variables.Name.values)
irrelevant_columns

['Day of the Week', 'Distance to Work', 'Daily Work Load Average']

let's drop em

In [111]:
unscaled_inputs = unscaled_inputs.drop(columns=irrelevant_columns, axis=1)
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Normal Weight,Overweight,Obese,Education,Children,Pets
0,0,0,0,1,7,289,33,0,0,1,0,1,1
1,0,0,0,0,7,118,50,0,0,1,0,1,0
2,0,0,0,1,7,179,38,0,0,1,0,0,0
3,1,0,0,0,7,279,39,1,0,0,0,1,0
4,0,0,0,1,7,289,33,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,1,0,0,1,1,0
696,1,0,0,0,5,225,28,1,0,0,0,1,1
697,1,0,0,0,5,330,28,0,1,0,1,0,0
698,0,0,0,1,5,235,32,0,1,0,1,0,0


for clarity's sake i will go through the scaling process again

In [114]:
scaler = StandardScaler()
scaler.fit(unscaled_inputs)
scaled_inputs = scaler.transform(unscaled_inputs)

In [115]:
logreg = LogisticRegression()
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size= 0.8, random_state=20)
logreg.fit(x_train, y_train)

In [116]:
logreg.score(x_train, y_train)

0.7821428571428571

indeed it is a tiny bit more accurate, it's good to remove redundant variables

Let's test the model with unknown variables

In [117]:
logreg.score(x_test, y_test)

0.7142857142857143

In [118]:
(logreg.score(x_train, y_train) - logreg.score(x_test, y_test)) * 100

6.785714285714284

less than a 7% difference, it's ok, don't think we overfitted

we can get the probabiliy of each variable to land true in excess absenteeism. The first value is for 0 and the second 1 so we want to keep the ones in the second position

In [122]:
probabilities = logreg.predict_proba(x_test)
probabilities

array([[0.70627239, 0.29372761],
       [0.49674211, 0.50325789],
       [0.35617621, 0.64382379],
       [0.78897163, 0.21102837],
       [0.08749558, 0.91250442],
       [0.31894411, 0.68105589],
       [0.31312296, 0.68687704],
       [0.17692731, 0.82307269],
       [0.863386  , 0.136614  ],
       [0.74989324, 0.25010676],
       [0.23040629, 0.76959371],
       [0.0784381 , 0.9215619 ],
       [0.05497183, 0.94502817],
       [0.84128329, 0.15871671],
       [0.37917923, 0.62082077],
       [0.37760783, 0.62239217],
       [0.2828943 , 0.7171057 ],
       [0.27183674, 0.72816326],
       [0.57018949, 0.42981051],
       [0.05560405, 0.94439595],
       [0.78447967, 0.21552033],
       [0.78897163, 0.21102837],
       [0.34686606, 0.65313394],
       [0.34686606, 0.65313394],
       [0.25645135, 0.74354865],
       [0.8274746 , 0.1725254 ],
       [0.38886884, 0.61113116],
       [0.88851845, 0.11148155],
       [0.13408319, 0.86591681],
       [0.78897163, 0.21102837],
       [0.

In [123]:
probabilities = probabilities[:,1]
probabilities

array([0.29372761, 0.50325789, 0.64382379, 0.21102837, 0.91250442,
       0.68105589, 0.68687704, 0.82307269, 0.136614  , 0.25010676,
       0.76959371, 0.9215619 , 0.94502817, 0.15871671, 0.62082077,
       0.62239217, 0.7171057 , 0.72816326, 0.42981051, 0.94439595,
       0.21552033, 0.21102837, 0.65313394, 0.65313394, 0.74354865,
       0.1725254 , 0.61113116, 0.11148155, 0.86591681, 0.21102837,
       0.44827   , 0.58516456, 0.70453256, 0.78857645, 0.21102837,
       0.38648203, 0.1432532 , 0.8949179 , 0.23052113, 0.56641481,
       0.20198862, 0.39180776, 0.15733644, 0.06697518, 0.70307963,
       0.70198415, 0.83019743, 0.29372761, 0.25179773, 0.19324126,
       0.54170067, 0.11759362, 0.68105589, 0.31907956, 0.88996641,
       0.53080572, 0.97385623, 0.36690609, 0.08168305, 0.08591836,
       0.78348348, 0.6689533 , 0.46025977, 0.79254975, 0.17254802,
       0.27136749, 0.01081916, 0.1432532 , 0.83938096, 0.19044632,
       0.1432532 , 0.0996985 , 0.93845923, 0.28302567, 0.46727

let's get the %

In [124]:
probabilities = [round(number*100,2) for number in probabilities]
probabilities

[29.37,
 50.33,
 64.38,
 21.1,
 91.25,
 68.11,
 68.69,
 82.31,
 13.66,
 25.01,
 76.96,
 92.16,
 94.5,
 15.87,
 62.08,
 62.24,
 71.71,
 72.82,
 42.98,
 94.44,
 21.55,
 21.1,
 65.31,
 65.31,
 74.35,
 17.25,
 61.11,
 11.15,
 86.59,
 21.1,
 44.83,
 58.52,
 70.45,
 78.86,
 21.1,
 38.65,
 14.33,
 89.49,
 23.05,
 56.64,
 20.2,
 39.18,
 15.73,
 6.7,
 70.31,
 70.2,
 83.02,
 29.37,
 25.18,
 19.32,
 54.17,
 11.76,
 68.11,
 31.91,
 89.0,
 53.08,
 97.39,
 36.69,
 8.17,
 8.59,
 78.35,
 66.9,
 46.03,
 79.25,
 17.25,
 27.14,
 1.08,
 14.33,
 83.94,
 19.04,
 14.33,
 9.97,
 93.85,
 28.3,
 46.73,
 21.1,
 84.55,
 61.41,
 80.69,
 52.59,
 16.48,
 1.43,
 36.69,
 68.68,
 44.76,
 9.14,
 72.7,
 65.31,
 5.56,
 58.52,
 13.65,
 17.44,
 22.04,
 27.14,
 25.01,
 90.96,
 30.53,
 88.37,
 28.42,
 18.04,
 49.48,
 58.52,
 76.02,
 82.66,
 27.2,
 30.59,
 28.24,
 90.96,
 86.6,
 16.66,
 1.28,
 85.58,
 73.46,
 40.78,
 37.35,
 51.16,
 92.22,
 69.36,
 64.09,
 22.5,
 26.06,
 12.9,
 93.93,
 39.18,
 21.1,
 23.66,
 20.2,
 11.96,
 76.

In [125]:
len(probabilities)

140

to make it fancy we can add those % to each register that was predicted

In [128]:
raw_csv = pd.read_csv('Absenteeism_data.csv')

In [145]:
employee_percentage = raw_csv.iloc[560:,:]
employee_percentage.insert(1,'Excess Absenteeism probability', probabilities)
employee_percentage = employee_percentage.iloc[:,:2]
employee_percentage.sort_values('Excess Absenteeism probability', ascending=False)

Unnamed: 0,ID,Excess Absenteeism probability
616,3,97.39
572,17,94.50
579,14,94.44
682,26,93.93
632,3,93.85
...,...,...
603,12,6.70
648,14,5.56
641,3,1.43
670,14,1.28


And that's pretty much it for predictions

Let's save the model

In [142]:
with open('model', 'wb') as f:
    pickle.dump(logreg, f)

and the scaler too

In [143]:
with open('scaler', 'wb') as f:
    pickle.dump(scaler, f)

next we should make a module that processes data with the same format so we can test the model and the scaler