In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Read the new observations from disk into the working DataFrame.
new_data_file = 'Absenteeism_new_data.csv'
df = pd.read_csv(new_data_file)

In [3]:
# One indicator column per distinct absence reason; the first category is
# dropped (drop_first=True) so it acts as the baseline.
absence_reason = df['Reason for Absence']
reason_columns = pd.get_dummies(absence_reason, drop_first=True)


In [4]:
def _reason_group(first_code, last_code=None):
    """Flag rows whose absence reason falls in an inclusive range of codes.

    The dummy columns are selected by the numeric value of their label, so the
    result is the same whether the labels are integers or numeric strings, and
    a range with no matching column yields all zeros instead of NaN.

    Parameters
    ----------
    first_code : int
        Lowest reason code in the group.
    last_code : int, optional
        Highest reason code in the group; ``None`` leaves the range open-ended.

    Returns
    -------
    pandas.Series
        0/1 integers aligned with the rows of ``reason_columns``.
    """
    codes = pd.to_numeric(reason_columns.columns)
    in_group = codes >= first_code
    if last_code is not None:
        in_group = in_group & (codes <= last_code)
    return reason_columns.loc[:, in_group].any(axis=1).astype(int)


# Collapse the individual reason dummies into four group indicators.
# Slicing `.loc` with string labels such as '1':'14' fails on integer column
# labels and follows lexicographic order on string labels, so the groups are
# built from numeric code ranges instead.
reason_type_1 = _reason_group(1, 14)
reason_type_2 = _reason_group(15, 17)
reason_type_3 = _reason_group(18, 21)
reason_type_4 = _reason_group(22)

In [5]:
# Swap the raw 'Reason for Absence' column for the four grouped indicators.
# The raw column is removed so it does not sit alongside the indicators
# derived from it (which would introduce multicollinearity).
reason_groups = [reason_type_1, reason_type_2, reason_type_3, reason_type_4]
df = pd.concat([df.drop(columns=['Reason for Absence']), *reason_groups], axis=1)

In [6]:
# Label every column of the frame; the last four labels name the reason-group
# indicators that were just appended.
# NOTE(review): labels are assigned purely by position - confirm that the CSV
# columns arrive in exactly this order and number.
feature_labels = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
reason_labels = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
column_names = feature_labels + reason_labels
df.columns = column_names

In [7]:
# Reorder the columns so the four reason indicators come first.
# Selecting with a list of labels moves each column together with its data;
# assigning the list to `df.columns` would only rename the columns in their
# current positions and leave every label pointing at the wrong values.
column_names_reodered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Date', 'Transportation Expense',
                        'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
                        'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[column_names_reodered]


In [8]:
# Parse each 'Date' entry into a timestamp; unparseable entries become NaT.
# NOTE(review): no explicit format is given, so every value is interpreted on
# its own - confirm the day/month order used in the source file.
df['Date'] = df['Date'].map(lambda raw_date: pd.to_datetime(raw_date, errors='coerce'))

In [9]:
# Derive numeric 'Month' and 'Day' features from the parsed 'Date' column.
# The values are real numbers (not the strings '1', '2', ...) so they can be
# fed directly to numeric steps such as StandardScaler.

# Month: 1 (January) .. 12 (December).
df['Month'] = df['Date'].dt.month

# Day: 1 (Sunday) .. 7 (Saturday). `dt.dayofweek` counts Monday=0 .. Sunday=6,
# so shift by one and wrap around to keep the Sunday-first numbering.
df['Day'] = (df['Date'].dt.dayofweek + 1) % 7 + 1

# Rows whose date is missing (NaT) produce NaN in both new columns.

# dropping the 'Date' column now that its parts have been extracted
df = df.drop(['Date'], axis = 1)

In [10]:
# Move 'Month' and 'Day' to sit right after the reason indicators.
# The frame is indexed with the list of labels so each column travels with its
# data; assigning the list to `df.columns` would merely rename the columns in
# place and attach the wrong label to every value.
date_rearranged = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Day','Transportation Expense', 'Distance to Work', 'Age',
                  'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[date_rearranged]

In [11]:
# Final clean-up, done as a single chain:
#   1. recode 'Education' to a binary flag (1 -> 0; 2, 3 and 4 -> 1); any other
#      value becomes NaN because it has no entry in the mapping,
#   2. replace every NaN in the frame with 0,
#   3. drop the variables we decided we don't need.
education_binary = {1: 0, 2: 1, 3: 1, 4: 1}
unused_columns = ['Day', 'Daily Work Load Average', 'Distance to Work']
df = (
    df.assign(Education=df['Education'].map(education_binary))
      .fillna(value=0)
      .drop(columns=unused_columns)
)