**Explore whether a person with certain characteristics is expected to be away from work at some point in time or not**

In [78]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the absenteeism dataset directly from the GitHub repository (requires network access).
DATA_URL = "https://raw.githubusercontent.com/MarcoCalbucci/DataAnalytics/main/PredictiveModeling/Exercises/Absenteeism_data.csv"
df = pd.read_csv(DATA_URL)
# Preview only the first rows, like the other preview cells, instead of rendering the whole frame.
df.head()

In [None]:
# Summarize the columns (names, non-null counts, dtypes) before transforming anything.
df.info()

#### Categorical variables

'Reason for Absence' is a categorical variable!

In [None]:
# 'Reason for Absence' is a categorical variable --> encode it as dummy (indicator) variables,
# one column per distinct value found in the column
reason_columns = pd.get_dummies(df['Reason for Absence'])
reason_columns.head()

In [None]:
# Sanity check: every row has exactly one active dummy, so each row sum must be 1.
# The sums are kept in their own Series rather than in a 'check' column of
# reason_columns, so re-running this cell never adds an earlier check result
# into the sum and the dummy frame itself stays untouched.
dummy_row_sums = reason_columns.sum(axis=1)
list(dummy_row_sums.unique())

In [None]:
# drop one column to avoid multicollinearity: the dummies are rebuilt from the
# original column and drop_first=True omits the first level
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)
reason_columns.head()

Reasons can be divided into groups:

1-14: disease

15-17: related to pregnancy

18-21: related to poisoning

22-28: light reason (e.g. dental consultation)

In [None]:
# group reasons: a row belongs to a group when any dummy inside the group's
# inclusive label range is set, i.e. the row-wise maximum over that range
REASON_GROUP_BOUNDS = [(1, 14), (15, 17), (18, 21), (22, 28)]
column_disease, column_pregnancy, column_poisoning, column_light_reason = (
    reason_columns.loc[:, first_code:last_code].max(axis=1)
    for first_code, last_code in REASON_GROUP_BOUNDS
)

In [None]:
# add a column for each group (insertion order of the mapping fixes the column order)
grouped_reason_flags = {
    "Disease": column_disease,
    "Pregnancy": column_pregnancy,
    "Poisoning": column_poisoning,
    "Light reason": column_light_reason,
}
for group_name, group_flags in grouped_reason_flags.items():
    df[group_name] = group_flags

In [None]:
# drop the original column: its information now lives in the four group columns
df = df.drop(columns=['Reason for Absence'])
# Preview only the first rows, like the other preview cells, instead of rendering the whole frame.
df.head()

#### Dates

In [66]:
# Inspect the raw 'Date' column as loaded from the CSV.
df['Date']

0      07/07/2015
1      14/07/2015
2      15/07/2015
3      16/07/2015
4      23/07/2015
          ...    
695    23/05/2018
696    23/05/2018
697    24/05/2018
698    24/05/2018
699    31/05/2018
Name: Date, Length: 700, dtype: object

In [65]:
# Selecting a single column yields a pandas Series.
type(df['Date'])

pandas.core.series.Series

In [61]:
# Check the type of the first element: the raw dates are plain strings.
type(df['Date'][0])

str

In [59]:
# dtype 'O' (object) confirms the column has not been parsed as dates yet.
df['Date'].dtypes

dtype('O')

In [67]:
# Dates are written day/month/year (e.g. 23/05/2018), so the format is stated
# explicitly rather than left to inference.
DATE_FORMAT = '%d/%m/%Y'
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)

In [68]:
# Confirm the column now holds datetime64 values.
df['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [72]:
# Extract the calendar month number from each date as a separate feature.
df['Month'] = df['Date'].dt.month
df['Month']

0      7
1      7
2      7
3      7
4      7
      ..
695    5
696    5
697    5
698    5
699    5
Name: Month, Length: 700, dtype: int64

In [None]:
def week_of_day(date):
    """Return the day-of-week index of ``date`` as given by its ``weekday()`` method.

    For ``datetime``-like objects this is Monday=0 through Sunday=6.
    """
    weekday_index = date.weekday()
    return weekday_index

# Element-wise weekday lookup through the helper; the function is passed directly,
# no lambda wrapper is needed.
df['Date'].apply(week_of_day)

In [73]:
# Vectorized weekday via the .dt accessor (Monday=0 ... Sunday=6), stored as a feature.
df['WeekDay'] = df['Date'].dt.weekday
df['WeekDay']

0      1
1      1
2      2
3      3
4      3
      ..
695    2
696    2
697    3
698    3
699    3
Name: WeekDay, Length: 700, dtype: int64

In [81]:
# The raw date is dropped now that Month and WeekDay have been derived from it.
df = df.drop('Date', axis='columns')