## Import NumPy and Pandas

In [1]:
import numpy as np
import pandas as pd

## Get "Absenteeism-data.csv" original file

In [2]:
raw_csv_data = pd.read_csv("Absenteeism-data.csv")

## Display data

In [3]:
#raw_csv_data

## Create a copy from the original file

In [4]:
df = raw_csv_data.copy()

In [5]:
#df

## Display all columns and rows

In [6]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [7]:
#display(df)

## Display informations about the data

In [33]:
#df.info()

## Drop 'ID' column

In [9]:
df = df.drop(['ID'], axis = 1)

In [10]:
#df

## Extract 'Reasons for Absence' column

### This list shows the categories of absenteeism. Each number represents a category

In [11]:
sorted(df['Reason for Absence'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

## Get dummies

In [12]:
reason_columns = pd.get_dummies(df['Reason for Absence'])

In [32]:
#reason_columns

## Check if some rows have missing values

#### 0: missing value || 1: single value

In [25]:
reason_columns['check'] = reason_columns.sum(axis = 1)

In [24]:
#reason_columns

### Check the vertical axis length

In [15]:
reason_columns['check'].sum(axis = 0)

700

### Check if we have different values from the "check" column

In [16]:
reason_columns['check'].unique()

array([1])

### Delete "check" column after the verifications

In [20]:
reason_columns = reason_columns.drop(['check'], axis = 1)

KeyError: "['check'] not found in axis"

In [31]:
#reason_columns

## Delete column 0 from "reason_columns" variable

In [18]:
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first = True)

In [22]:
#reason_columns

## Grouping the "Reason for Absence" columns

### Three steps:
- Add dummy variables on the current state of DateFrame (avoid duplicate information between "Reason for Absence" column and dummy variables)
- Grouping the variables into the DataFrame (create group for each category)

### First step

In [26]:
df.columns.values

array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

In [27]:
reason_columns.columns.values

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       21, 22, 23, 24, 25, 26, 27, 28, 'check'], dtype=object)

In [28]:
df = df.drop(['Reason for Absence'], axis = 1)

In [30]:
#df

### Second step

In [39]:
reason_type_1 = reason_columns.loc[:, 1:14].max(axis = 1)
reason_type_2 = reason_columns.loc[:, 14:17].max(axis = 1)
reason_type_3 = reason_columns.loc[:, 18:21].max(axis = 1)
reason_type_4 = reason_columns.loc[:, 22:28].max(axis = 1)

In [43]:
#reason_type_1

0      0
1      0
2      0
3      1
4      0
5      0
6      0
7      0
8      0
9      0
10     1
11     1
12     1
13     1
14     0
15     1
16     0
17     0
18     1
19     0
20     1
21     1
22     1
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     1
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     0
46     0
47     0
48     0
49     1
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     0
63     0
64     0
65     0
66     0
67     0
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     1
76     0
77     0
78     0
79     0
80     0
81     0
82     1
83     0
84     0
85     1
86     0
87     1
88     0
89     1
90     0
91     0
92     1
93     0
94     0
95     0
96     1
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
1