## Absenteeism Project

## Abstract
This project was completed as part of the Data Science course by 365 on Udemy. It addresses the issue of workforce absenteeism by developing a predictive model to forecast excessive absenteeism. The project demonstrates my ability to:

1. Comprehend the task and approach it analytically
2. Understand and prepare the data
3. Model using logistic regression
4. Save the model

In [1]:
#Importing relevant libraries 
import pandas as pd
import numpy as np

In [2]:
#Importing data
df = pd.read_csv("Absenteeism_data.csv")

In [3]:
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [4]:
#creating a copy in case we need to get back to "first check point"
df_1 = df.copy()

In [5]:
#examining the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [6]:
df['ID'] = df['ID'].astype('object')
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ID                         700 non-null    object        
 1   Reason for Absence         700 non-null    int64         
 2   Date                       700 non-null    datetime64[ns]
 3   Transportation Expense     700 non-null    int64         
 4   Distance to Work           700 non-null    int64         
 5   Age                        700 non-null    int64         
 6   Daily Work Load Average    700 non-null    float64       
 7   Body Mass Index            700 non-null    int64         
 8   Education                  700 non-null    int64         
 9   Children                   700 non-null    int64         
 10  Pets                       700 non-null    int64         
 11  Absenteeism Time in Hours  700 non-null    int64         
dtypes: datet

In [7]:
df.describe()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,19.411429,2016-12-31 00:45:15.428571392,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
min,0.0,2015-07-06 00:00:00,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,13.0,2016-03-27 06:00:00,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,23.0,2016-12-19 00:00:00,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,27.0,2017-10-26 06:00:00,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,28.0,2018-05-31 00:00:00,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0
std,8.356292,,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082


In [8]:
# Generate dummy variables for 'Reason for Absence'
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)

# Group reasons based on specified ranges
reason_1 = reason_columns.iloc[:, :14].max(axis=1)
reason_2 = reason_columns.iloc[:, 14:17].max(axis=1)
reason_3 = reason_columns.iloc[:, 17:21].max(axis=1)
reason_4 = reason_columns.iloc[:, 21:].max(axis=1)

# Rename reason groups for clarity
reason_1.name = 'Reason_1'
reason_2.name = 'Reason_2'
reason_3.name = 'Reason_3'
reason_4.name = 'Reason_4'

In [9]:
# Concatenate the grouped reasons with the original dataframe
df = pd.concat([df, reason_1, reason_2, reason_3, reason_4], axis=1)

# Reorder columns
column_names_reordered = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    'Date', 'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets', 'Absenteeism Time in Hours'
]

df = df[column_names_reordered]
df.head()


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Reason_1                   700 non-null    bool          
 1   Reason_2                   700 non-null    bool          
 2   Reason_3                   700 non-null    bool          
 3   Reason_4                   700 non-null    bool          
 4   Date                       700 non-null    datetime64[ns]
 5   Transportation Expense     700 non-null    int64         
 6   Distance to Work           700 non-null    int64         
 7   Age                        700 non-null    int64         
 8   Daily Work Load Average    700 non-null    float64       
 9   Body Mass Index            700 non-null    int64         
 10  Education                  700 non-null    int64         
 11  Children                   700 non-null    int64         
 12  Pets    

In [11]:
df_1.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [12]:
#creating a check point 
df_2 = df.copy()
df_2.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2


## Analysis

In [13]:
df['Absenteeism Time in Hours'].median()

3.0

In [14]:
df['Absenteeism Time in Hours'].mean()

6.761428571428572

In [15]:
#creating a traget accroding to the median where any value above 3 hours will be considered excessive Absenteeism
targets = np.where(df["Absenteeism Time in Hours"] > df["Absenteeism Time in Hours"].median() , 1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [16]:
df["excessive Absenteeism"] = targets

In [17]:
data_targets = df.drop(["Absenteeism Time in Hours", "Date"], axis= 1)
data_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,excessive Absenteeism
0,False,False,False,True,289,36,33,239.554,30,1,2,1,1
1,False,False,False,False,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,179,51,38,239.554,31,1,0,0,0
3,True,False,False,False,279,5,39,239.554,24,1,2,0,1
4,False,False,False,True,289,36,33,239.554,30,1,2,1,0


In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
data_targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reason_1                 700 non-null    bool   
 1   Reason_2                 700 non-null    bool   
 2   Reason_3                 700 non-null    bool   
 3   Reason_4                 700 non-null    bool   
 4   Transportation Expense   700 non-null    int64  
 5   Distance to Work         700 non-null    int64  
 6   Age                      700 non-null    int64  
 7   Daily Work Load Average  700 non-null    float64
 8   Body Mass Index          700 non-null    int64  
 9   Education                700 non-null    int64  
 10  Children                 700 non-null    int64  
 11  Pets                     700 non-null    int64  
 12  excessive Absenteeism    700 non-null    int32  
dtypes: bool(4), float64(1), int32(1), int64(7)
memory usage: 49.3 KB


In [20]:
data_targets[["Reason_1", "Reason_2", "Reason_3", "Reason_4"]] = data_targets[["Reason_1", "Reason_2", "Reason_3", "Reason_4"]].astype(int)
data_targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reason_1                 700 non-null    int32  
 1   Reason_2                 700 non-null    int32  
 2   Reason_3                 700 non-null    int32  
 3   Reason_4                 700 non-null    int32  
 4   Transportation Expense   700 non-null    int64  
 5   Distance to Work         700 non-null    int64  
 6   Age                      700 non-null    int64  
 7   Daily Work Load Average  700 non-null    float64
 8   Body Mass Index          700 non-null    int64  
 9   Education                700 non-null    int64  
 10  Children                 700 non-null    int64  
 11  Pets                     700 non-null    int64  
 12  excessive Absenteeism    700 non-null    int32  
dtypes: float64(1), int32(5), int64(7)
memory usage: 57.6 KB


## scaling features

In [21]:
unscaled_inputs = data_targets.iloc[: , :-1]
scaler = StandardScaler()
scaler.fit(unscaled_inputs)

In [22]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs

array([[-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.39626354, ...,  1.07419067,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ...,  2.57206611,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
        -0.01928035,  0.26848661]])

In [23]:
#spliting data
from sklearn.model_selection import train_test_split

In [24]:
train_test_split(scaled_inputs,targets)

[array([[-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
         -0.01928035,  1.12666297],
        [-0.57735027, -0.09298136, -0.39626354, ...,  2.57206611,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.39626354, ..., -0.42368477,
         -0.01928035, -0.58968976],
        ...,
        [-0.57735027, -0.09298136,  2.52357307, ...,  2.57206611,
         -0.01928035,  0.26848661],
        [-0.57735027, -0.09298136, -0.39626354, ..., -0.42368477,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.39626354, ...,  1.07419067,
          0.88046927, -0.58968976]]),
 array([[-0.57735027, -0.09298136, -0.39626354, ...,  2.57206611,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136,  2.52357307, ..., -0.42368477,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.39626354, ..., -0.42368477,
          0.88046927, -0.58968976],
        ...,
        [-0.57735027, -0.09298136, -0.39626354, ..., -

In [25]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=42)

In [26]:
print(f"Training data shape: {x_train.shape}")
print(f"Testing data shape: {x_test.shape}")
print(f"Testing data shape: {y_train.shape}")
print(f"Testing data shape: {y_test.shape}")

Training data shape: (560, 12)
Testing data shape: (140, 12)
Testing data shape: (560,)
Testing data shape: (140,)


## Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
reg = LogisticRegression()
reg.fit(x_train,y_train)

In [29]:
reg.score(x_train,y_train)

0.7946428571428571

## finding intercept and coefficients 

In [30]:
reg.intercept_

array([-0.15700726])

In [31]:
reg.coef_

array([[ 2.06536862,  0.33374291,  1.87501648,  1.24002846,  0.69032747,
        -0.13563992, -0.30801778, -0.08495501,  0.26680131, -0.26757125,
         0.39432708, -0.4967411 ]])

In [32]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [33]:
feature_name = unscaled_inputs.columns.values

In [34]:
summry_table = pd.DataFrame (columns= ["Feature_name"], data = feature_name)
summry_table ["coefficients"] = np.transpose(reg.coef_)
summry_table

Unnamed: 0,Feature_name,coefficients
0,Reason_1,2.065369
1,Reason_2,0.333743
2,Reason_3,1.875016
3,Reason_4,1.240028
4,Transportation Expense,0.690327
5,Distance to Work,-0.13564
6,Age,-0.308018
7,Daily Work Load Average,-0.084955
8,Body Mass Index,0.266801
9,Education,-0.267571


## Testing

In [35]:
reg.score(x_test,y_test)

0.8

## Save model

In [36]:
import pickle

In [37]:
with open("model", "wb") as file:
    pickle.dump(reg, file)

In [38]:
with open("model", "wb") as file:
    pickle.dump(scaled_inputs, file)