# Predicting Absenteeism at Work: Pre-Processing
### Brief
Using logistic regression, predict which employees will be excessively absent from work, based on the various inputs in our dataset.

### Load Data

In [66]:
# load libraries
import pandas as pd
import numpy as np
import sklearn
from datetime import datetime

# location of the raw absenteeism csv
absent_in_path = 'S:/Matt/Data Science/Udemy/Python, SQL and Tableau/Data/absent_in_data.csv'

# load the csv; raw_df stays untouched and df is the working copy
raw_df = pd.read_csv(absent_in_path)
df = raw_df.copy()

# preview the first few rows
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [67]:
# check for nulls: info() lists each column's dtype and non-null count
# (every column reports 700 non-null out of 700 entries, so there are none!)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [68]:
# investigate 'Education': list its distinct codes in ascending order
education_codes = df['Education'].unique()
sorted(education_codes)

[1, 2, 3, 4]

In [69]:
# count how many rows fall under each 'Education' code
# 1 is high school: it can remain a separate category due to its high count and distinct meaning
# there aren't many counts in 2, 3 or 4 (i.e. graduate, postgraduate, doctorate etc.), therefore group them together later
df['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

### Pre-Processing
Steps:
* Drop 'ID' column
* Group 'Reason for Absence' variable into 4 binary groups/dummy variables
    * 0 = no reason given
    * 1 - 14 = general sickness absence
    * 15 - 17 = maternity absence
    * 18 - 21 = extreme/unusual injury absence
    * 22+ = light/medical appointment
* Drop 'Reason for Absence' raw column
* Extract month and day-of-the-week values from 'Date', then drop 'Date' column
* Binarize the 'Education' variable

In [70]:
# drop ID column
# rebuild the working frame from the untouched raw_df instead of deleting from df in place,
# so this cell always starts from the same state and can be re-run without a KeyError
df = raw_df.drop(columns=['ID'])

# group 'Reason for Absence' into dummy/binary variables (a reason of 0, i.e. no reason given, is 0 in every group)
# drop 0 reason column (e.g. 'Reason_0') to avoid multicollinearity and redundancy

# long-winded, manual method (kept for reference only)
#df['Reason_1'] = np.where(np.logical_and(df['Reason for Absence'] > 0, df['Reason for Absence'] < 15), 1, 0)
#df['Reason_2'] = np.where(np.logical_and(df['Reason for Absence'] >= 15, df['Reason for Absence'] < 18), 1, 0)
#df['Reason_3'] = np.where(np.logical_and(df['Reason for Absence'] >= 18, df['Reason for Absence'] < 22), 1, 0)
#df['Reason_4'] = np.where(df['Reason for Absence'] >= 22, 1, 0)

# scientific method (preferable)
# create one dummy column per 'Reason for Absence' code; drop_first removes the first (lowest) code,
# which is the 0 / 'no reason given' response in this data
absence_dummies = pd.get_dummies(df['Reason for Absence'], drop_first = True)

# group dummy variables into classes
# .loc slices the integer column labels, and both ends of each range are included
reason_1 = absence_dummies.loc[:, 1:14].max(axis=1) # max gets 1 if any are 1 or 0 if not (essentially creates binary series)
reason_2 = absence_dummies.loc[:, 15:17].max(axis=1)
reason_3 = absence_dummies.loc[:, 18:21].max(axis=1)
reason_4 = absence_dummies.loc[:, 22:].max(axis=1)

# add dummy variables to original dataframe
df = pd.concat([df, reason_1.rename('Reason_1'), reason_2.rename('Reason_2'),
                reason_3.rename('Reason_3'), reason_4.rename('Reason_4')],
                axis=1) # axis specifies that you're concatenating columns

# drop raw 'Reason for Absence' column
del df['Reason for Absence']

# convert 'Date' column into datetime
# the raw values are day-first strings (e.g. '14/07/2015'), so state the format explicitly:
# without it, ambiguous dates such as '01/02/2016' can be read month-first, corrupting the month/weekday features
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# function to extract weekday from datetime column
def date_to_weekday(date):
    """Return the weekday number of `date`, as reported by its ``weekday()`` method.

    For datetime / pandas Timestamp values this is 0 for Monday through 6 for Sunday.
    """
    weekday_number = date.weekday()
    return weekday_number

# derive the month and day-of-week features from the parsed 'Date' column,
# then discard the raw 'Date' column in the same step
parsed_dates = df['Date']
df = df.assign(**{'Month Value': parsed_dates.dt.month,
                  'Day of the Week': parsed_dates.apply(date_to_weekday)}).drop(columns=['Date'])

# binarize 'Education' column: code 1 becomes 0, codes 2, 3 and 4 become 1
education_to_binary = {1:0, 2:1, 3:1, 4:1}
df['Education'] = df['Education'].map(education_to_binary).astype(int)

# re-order columns to match desired output
column_order = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week',
                'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average',
                'Body Mass Index', 'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[column_order]

# check outputs
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Write Out Pre-Processed CSV
This will be the input to our ML stage, so it is worth checkpointing our progress here.

In [71]:
# checkpoint: write the pre-processed data to csv, leaving out the index
absent_out_path = 'S:/Matt/Data Science/Udemy/Python, SQL and Tableau/Data/absent_out_data (my_file).csv'
df.to_csv(absent_out_path, index=False)

# keep an in-memory copy of the pre-processed frame as a checkpoint too
df_prepro = df.copy()

# check dtypes before proceeding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Reason_1                   700 non-null    uint8  
 1   Reason_2                   700 non-null    uint8  
 2   Reason_3                   700 non-null    uint8  
 3   Reason_4                   700 non-null    uint8  
 4   Month Value                700 non-null    int64  
 5   Day of the Week            700 non-null    int64  
 6   Transportation Expense     700 non-null    int64  
 7   Distance to Work           700 non-null    int64  
 8   Age                        700 non-null    int64  
 9   Daily Work Load Average    700 non-null    float64
 10  Body Mass Index            700 non-null    int64  
 11  Education                  700 non-null    int32  
 12  Children                   700 non-null    int64  
 13  Pets                       700 non-null    int64  