In [None]:
## This notebook will detail my workflow in preprocessing mock data for training a Logistic Regression model for classification
## Goals: 
##          Dropping irrelevant data 
##          Transforming variables to the right format 
##          Creating new features 
##          Standardizing, splitting, training
##          Validating  the model 

In [126]:
## Transforming the raw CSV 

import pandas as pd 

Raw_data = pd.read_csv("Abs_Raw.csv")
Raw_data.head()                 

## The first step in preprocessing will be the following: 
# Drop the ID column, 
# Group reasons for absence, 
# Split Date column 

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [127]:
## Dropping ID column, Grouping,

Raw_data = Raw_data.drop('ID', axis=1)
Reason_Dummies = pd.get_dummies(Raw_data['Reason for Absence'], dtype=int, drop_first=True)
reason_type_1 = Reason_Dummies.loc[:, 1:14].max(axis=1)
reason_type_2 = Reason_Dummies.loc[:, 15:17].max(axis=1)
reason_type_3 = Reason_Dummies.loc[:, 18:21].max(axis=1)
reason_type_4 = Reason_Dummies.loc[:, 22:].max(axis=1)

Raw_Grouped = pd.concat([Raw_data, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis =1)
Raw_Grouped = Raw_Grouped.drop('Reason for Absence', axis=1)


In [128]:
## Transforming Date column 
## Add two columns: month and day of week 

Raw_Dated_Grouped = Raw_Grouped.copy()

## Adding a month column
Raw_Dated_Grouped['Date'] = pd.to_datetime(Raw_Dated_Grouped['Date'], format = '%d/%m/%Y')
months = []

for i in range(Raw_Dated_Grouped.shape[0]):
    months.append(Raw_Dated_Grouped['Date'][i].month)

Raw_Dated_Grouped['Month Value'] = months

## Adding a weekday column
def to_weekday(date_value):
    return date_value.weekday()

Raw_Dated_Grouped['Day of the Week'] = Raw_Dated_Grouped['Date'].apply(to_weekday)

Raw_Dated_Grouped = Raw_Dated_Grouped.drop('Date', axis=1)

In [129]:
## Step 1 completed! Save please.. 
Preprocessed = Raw_Dated_Grouped.copy()
Preprocessed.head()

Unnamed: 0,ID,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3,Month Value,Day of the Week
0,11,289,36,33,239.554,30,1,2,1,4,0,0,0,1,7,1
1,36,118,13,50,239.554,31,1,1,0,0,0,0,0,0,7,1
2,3,179,51,38,239.554,31,1,0,0,2,0,0,0,1,7,2
3,7,279,5,39,239.554,24,1,2,0,4,1,0,0,0,7,3
4,11,289,36,33,239.554,30,1,2,1,2,0,0,0,1,7,3


In [167]:
## Step 2! Targeting and scaling 
## I use the median to select targets for the regression model and at the same time balance the dataset 
## I split the data set into two: inputs that need scaling and binary or ordinal inputs (these can not really be scaled)
## Then I concat them into the final inputs data
Targets = pd.DataFrame(np.where(Preprocessed['Absenteeism Time in Hours'] > 
                   Preprocessed['Absenteeism Time in Hours'].median(), 1, 0))
Preprocessed.columns

inputs_to_scale = Preprocessed.drop(['ID', 'Education', 'Pets', 0, 'Children', 1,2,3], axis=1)
bin_inputs = Preprocessed.drop(['ID', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
                                'Month Value', 'Day of the Week'], axis=1)
## Scale the targets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Scaled_inputs = scaler.fit_transform(inputs_to_scale)
Inputs = pd.concat([pd.DataFrame(Scaled_inputs), bin_inputs], axis=1)

In [178]:
## Step 3! Splitting the data, training the model 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
x_train, x_test, y_train, y_test = train_test_split(Scaled_inputs, Targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)
print(x_train.shape, y_train.shape) ## see how it looks, looks good! 

## Training 
Reg = LogisticRegression()
Reg.fit(x_train, y_train)
Reg.score(x_train, y_train)

(560, 8) (560, 1)


  y = column_or_1d(y, warn=True)


0.9428571428571428