In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv('sap_storing_data_hu_project.csv', low_memory = False)  

# Remove duplicates, date time convert, hersteltijd created, dropped nans

In [3]:
df = df.drop_duplicates(subset=['#stm_sap_meldnr'])
df = df.reset_index(drop=True)

df['stm_aanntpl_tijd']= pd.to_datetime(df['stm_aanntpl_tijd'])
df['stm_fh_ddt']= pd.to_datetime(df['stm_fh_ddt'])

# Calculate the true reparation time
df['stm_hers_tijd'] = (df['stm_fh_ddt'].dt.hour * 60 + df['stm_fh_ddt'].dt.minute) - (df['stm_aanntpl_tijd'].dt.hour * 60 + df['stm_aanntpl_tijd'].dt.minute)

df = df.dropna(subset=['stm_hers_tijd','stm_prioriteit' ]) 
df = df.reset_index(drop=True)

#stm_sap_meldnr        0
stm_hers_tijd          0
stm_oorz_groep     15254
stm_prioriteit         0
stm_oorz_code      15254
dtype: int64

#### Nu de data in juiste types staat en gene duplicates in zitten, kunnen we verder gaan met de volgende functies:
- prep_data, hierin cleanen we de data nog om de mean square error beter te maken
- split_data, om te model te gebruiken, worden er dummies en de data wordt splitst in train en test waardes
- dtr_train_model, hier wordt DecisionTreeRegressor gebruikt om de waardes te voorspellen

In [4]:
def prep_data(df, nan_columns):    
    df = df.dropna(subset=nan_columns)
    df = df.reset_index(drop=True)
    
    # We hebben deze waardes gekozen na overleg met product owner
    df = df[(df['stm_hers_tijd'] >= 4) & (df['stm_hers_tijd'] <= 480)]
    df = df.reset_index(drop=True)
    return df

def split_data (df, dummies_columns):
    DummiesX = pd.get_dummies(data=df[dummies_columns])
    y = df['stm_hers_tijd']
    X_train, X_test, y_train, y_test = train_test_split(DummiesX, y, random_state=2)
    return X_train, X_test, y_train, y_test

def dtr_train_model(X_train, X_test, y_train, y_test, maxDepth):
    DTR = DecisionTreeRegressor(max_depth=7)
    DTR.fit(X_train,y_train)
    y_pred = DTR.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test, y_pred)
    return r2, RMSE

In [5]:
df_DTR = df[['#stm_sap_meldnr', 'stm_hers_tijd', 'stm_oorz_groep','stm_prioriteit', 'stm_oorz_code']]

In [6]:
df_DTR = prep_data(df_DTR, ['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit'])
X_train, X_test, y_train, y_test = split_data(df_DTR, ['stm_oorz_code', 'stm_prioriteit'])

print("Decision Tree Regressor: " + str(dtr_train_model(X_train, X_test, y_train, y_test, 7)))

Decision Tree Regressor: (0.06272165201693758, 75.12609334127268)
