In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [2]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the others untouched.

    A ``StandardScaler`` is fitted on ``columns`` only. ``transform`` returns a
    DataFrame that has the same column order and the same index as its input,
    with the selected columns replaced by their scaled values.

    Parameters
    ----------
    columns : list
        Labels of the DataFrame columns to scale.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument as an attribute of the same name:
        # BaseEstimator.get_params() / clone() / repr look them up via getattr.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # StandardScaler's constructor arguments are keyword-only in current
        # scikit-learn releases, so they must be passed by name.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise reductions; ddof=0 matches numpy's np.var default.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns scaled.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        # Remember the incoming column order so it can be restored at the end.
        init_col_order = X.columns

        # Scale the selected columns. The result must carry X's index: concat
        # below aligns on index labels, and a fresh RangeIndex would misalign
        # rows (and introduce NaNs) whenever X is not indexed 0..n-1.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling passes through as-is.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Stitch both parts together and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [5]:
class absenteeism_model():
    """Load a pickled classifier and scaler, preprocess a CSV and produce predictions.

    Attributes
    ----------
    reg : object
        Unpickled model; ``predict`` and ``predict_proba`` are called on it.
    scaler : object
        Unpickled scaler; ``transform`` is called on the preprocessed frame.
    data : object or None
        Output of ``scaler.transform``; ``None`` until ``load_andclean_data`` has run.
    df_with_predictions : pandas.DataFrame
        Untouched copy of the CSV as read (set by ``load_andclean_data``).
    preprocessed_data : pandas.DataFrame
        Cleaned, unscaled features (set by ``load_andclean_data``).
    """

    def __init__(self, model_file, scaler_file):
        """Unpickle the model from ``model_file`` and the scaler from ``scaler_file``.

        Both arguments are file-system paths.
        """
        # SECURITY: pickle.load can execute arbitrary code while loading.
        # Only pass files that come from a trusted source.
        with open(model_file, 'rb') as model_fh, open(scaler_file, 'rb') as scaler_fh:
            self.reg = pickle.load(model_fh)
            self.scaler = pickle.load(scaler_fh)
        self.data = None

    def load_andclean_data(self, data_file, date_format='%d%m%Y'):
        """Read ``data_file`` (CSV), preprocess it and store the scaled result.

        Sets ``self.df_with_predictions`` (raw copy), ``self.preprocessed_data``
        (cleaned, unscaled) and ``self.data`` (``self.scaler.transform`` output).

        Parameters
        ----------
        data_file : path or file-like
            Anything ``pandas.read_csv`` accepts. After 'ID' and
            'Reason for Absence' are removed, the remaining columns are renamed
            by position, so the file must have the expected column layout.
        date_format : str, default '%d%m%Y'
            ``strftime`` pattern used to parse the 'Date' column.
            NOTE(review): the default is kept for backward compatibility, but it
            cannot parse dates written with separators (e.g. '07/07/2015');
            pass '%d/%m/%Y' for such files -- confirm against the real data.
        """
        # Load the data and keep an untouched copy.
        df = pd.read_csv(data_file, delimiter=',')
        self.df_with_predictions = df.copy()

        # Drop the ID and add a placeholder target column so that the positional
        # renaming below sees the expected number of columns.
        df = df.drop(['ID'], axis=1)
        df['Absenteeism Time in Hours'] = np.nan

        # Collapse the absence reason into four 0/1 group flags
        # (codes 1-14, 15-17, 18-21 and 22+). The flags are derived directly
        # from the numeric code, so the result does not depend on which codes
        # happen to be present in this particular file. Code 0 and missing
        # values belong to no group.
        reason_code = pd.to_numeric(df['Reason for Absence'], errors='coerce')
        reason_type_1 = reason_code.between(1, 14).astype(int).rename('Reason 1')
        reason_type_2 = reason_code.between(15, 17).astype(int).rename('Reason 2')
        reason_type_3 = reason_code.between(18, 21).astype(int).rename('Reason 3')
        reason_type_4 = (reason_code >= 22).astype(int).rename('Reason 4')
        df = df.drop(['Reason for Absence'], axis=1)

        # Put everything together, fix the column names and their order.
        df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)
        column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                        'Daily Work Load Average', 'Body Mass Index', 'Education',
                        'Children', 'Pets', 'Absenteeism Time in Hours',
                        'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4']
        df.columns = column_names
        column_names_reordered = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
                                  'Date', 'Transportation Expense', 'Distance to Work', 'Age',
                                  'Daily Work Load Average', 'Body Mass Index', 'Education',
                                  'Children', 'Pets', 'Absenteeism Time in Hours']
        df = df[column_names_reordered]

        # Parse the dates, extract month and weekday (Monday == 0), then drop
        # the raw date and restore the column order.
        df['Date'] = pd.to_datetime(df['Date'], format=date_format)
        df['Month Value'] = df['Date'].dt.month
        df['Day of the Week'] = df['Date'].dt.weekday
        df = df.drop(['Date'], axis=1)

        column_names_upd = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
                            'Month Value', 'Day of the Week', 'Transportation Expense',
                            'Distance to Work', 'Age',
                            'Daily Work Load Average', 'Body Mass Index', 'Education',
                            'Children', 'Pets', 'Absenteeism Time in Hours']
        df = df[column_names_upd]

        # Binarise Education: 1 -> 0, 2/3/4 -> 1. Any other value becomes NaN
        # here and is turned into 0 by the fillna below.
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # Replace missing values with 0.
        df = df.fillna(value=0)

        # Drop the target placeholder and the features that are not fed to the model.
        df = df.drop(['Absenteeism Time in Hours', 'Day of the Week',
                      'Daily Work Load Average', 'Distance to Work'], axis=1)

        self.preprocessed_data = df.copy()

        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return column 1 of ``reg.predict_proba`` on the loaded data.

        Returns ``None`` when no data has been loaded yet.
        """
        if (self.data is not None):
            pred = self.reg.predict_proba(self.data)[:, 1]
            return pred

    def predicted_output_category(self):
        """Return ``reg.predict`` on the loaded data, or ``None`` when no data is loaded."""
        if (self.data is not None):
            pred_outputs = self.reg.predict(self.data)
            return pred_outputs

    def predicted_outputs(self):
        """Return the preprocessed frame with 'Probability' and 'Prediction' columns added.

        The two columns are written onto ``self.preprocessed_data`` itself, which
        is also the object returned. Returns ``None`` when no data is loaded.
        """
        if (self.data is not None):
            self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
            self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
            return self.preprocessed_data
    
        