In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Load the raw export. low_memory=False makes pandas read the file in one pass
# so column dtypes are inferred from the whole column rather than per chunk.
df = pd.read_csv("sap_storing_data_hu_project.csv", low_memory=False)

# Remove duplicates, convert the date-time columns, create the repair time (hersteltijd), and drop NaNs

In [None]:
# Keep a single row per '#stm_sap_meldnr' and renumber the index.
df = df.drop_duplicates(subset=['#stm_sap_meldnr']).reset_index(drop=True)

# Parse both time columns so the .dt accessor can be used on them.
df['stm_aanntpl_tijd'] = pd.to_datetime(df['stm_aanntpl_tijd'])
df['stm_fh_ddt'] = pd.to_datetime(df['stm_fh_ddt'])

# Repair time in minutes, computed from the clock hour and minute of each
# timestamp only (the date part is not used).
# NOTE(review): because dates are ignored, a span that crosses midnight yields
# a negative value and multi-day spans lose their day component - confirm this
# is acceptable for the analysis.
df['stm_hers_tijd'] = (
    (df['stm_fh_ddt'].dt.hour * 60 + df['stm_fh_ddt'].dt.minute)
    - (df['stm_aanntpl_tijd'].dt.hour * 60 + df['stm_aanntpl_tijd'].dt.minute)
)

# Rows without a repair time or a priority are dropped; index renumbered again.
df = df.dropna(subset=['stm_hers_tijd', 'stm_prioriteit']).reset_index(drop=True)

#### Nu de data in de juiste types staat en er geen duplicaten meer in zitten, kunnen we verder gaan met de volgende functies:
- prep_data: hierin schonen we de data verder op om de (root) mean squared error te verbeteren
- prep_rfr_model: om het model te kunnen gebruiken worden er dummies gemaakt en wordt de data gesplitst in train- en testwaardes
- rfr_model: hier wordt RandomForestRegressor gebruikt om de waardes te voorspellen
- rfc_model: hier wordt RandomForestClassifier gebruikt om de waardes te voorspellen

In [None]:
def prep_data(df, min_minutes=4, max_minutes=480):
    """Return a cleaned copy of ``df`` restricted to plausible repair times.

    Steps, in order:
      1. keep one row per '#stm_sap_meldnr';
      2. drop rows missing 'stm_hers_tijd', 'stm_oorz_code' or 'stm_prioriteit';
      3. keep rows whose 'stm_hers_tijd' lies in [min_minutes, max_minutes]
         (both bounds inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns named above. It is not modified.
    min_minutes, max_minutes : int or float, optional
        Inclusive bounds on 'stm_hers_tijd'. The defaults (4 and 480)
        reproduce the previously hard-coded thresholds.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    required = ['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit']
    cleaned = (
        df.drop_duplicates(subset=['#stm_sap_meldnr'])
          .dropna(subset=required)
    )
    in_range = (
        (cleaned['stm_hers_tijd'] >= min_minutes)
        & (cleaned['stm_hers_tijd'] <= max_minutes)
    )
    # A single reset at the end yields the same index as resetting after every step.
    return cleaned[in_range].reset_index(drop=True)

def prep_rfr_model(df, random_state=2):
    """Build the feature matrix and target, then split them into train and test sets.

    Features are ``pd.get_dummies`` applied to the 'stm_oorz_code' and
    'stm_prioriteit' columns; the target is 'stm_hers_tijd'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stm_oorz_code', 'stm_prioriteit' and 'stm_hers_tijd'.
    random_state : int, optional
        Seed passed to ``train_test_split``. The default (2) reproduces the
        previously hard-coded split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # NOTE(review): get_dummies only one-hot encodes object/string/category
    # columns by default. If these two columns are numeric they pass through
    # unchanged and are treated as ordinal values - confirm the dtypes, or pass
    # columns=[...] to force encoding.
    features = pd.get_dummies(data=df[['stm_oorz_code', 'stm_prioriteit']])
    target = df['stm_hers_tijd']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
    
def rfr_model(X_train, X_test, y_train, y_test):
    """Fit a random-forest regressor on the training split and score it on the test split.

    The forest uses ``max_depth=15`` and ``random_state=0``.

    Returns
    -------
    tuple
        ``(r2, rmse)``: the R^2 score and the root mean squared error of the
        test-set predictions.
    """
    forest = RandomForestRegressor(max_depth=15, random_state=0)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Square root of the MSE, i.e. the RMSE in the same unit as the target.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
    score = r2_score(y_test, predictions)
    return score, rmse

def rfc_model(X_train, X_test, y_train, y_test):
    """Fit a random-forest classifier on the training split and score it on the test split.

    The forest uses ``max_depth=15`` and ``random_state=0``. The predicted
    labels are scored with the same regression metrics as ``rfr_model``.

    NOTE(review): a classifier treats every distinct target value as a separate
    class, while R^2 / RMSE treat the labels as numbers - confirm this
    comparison is intended.

    Returns
    -------
    tuple
        ``(r2, rmse)``: the R^2 score and the root mean squared error of the
        test-set predictions.
    """
    forest = RandomForestClassifier(max_depth=15, random_state=0)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Square root of the MSE, i.e. the RMSE in the same unit as the target.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
    score = r2_score(y_test, predictions)
    return score, rmse


In [None]:
# Narrow the frame to the identifier, the target and the candidate feature columns.
df_Kpog = df[[
    '#stm_sap_meldnr',
    'stm_hers_tijd',
    'stm_oorz_groep',
    'stm_prioriteit',
    'stm_oorz_code',
]]

In [None]:
# Clean the selected columns, build the train/test split, then report
# (R^2, RMSE) for the classifier and the regressor on the same split.
df_Kpog = prep_data(df_Kpog)
X_train, X_test, y_train, y_test = prep_rfr_model(df_Kpog)
print(f"Random Forest Classifier: {rfc_model(X_train, X_test, y_train, y_test)}")
print(f"Random Forest Regressor: {rfr_model(X_train, X_test, y_train, y_test)}")

Random Forest Classifier: (-0.2877596227585655, 88.05903363398777)
Random Forest Regressor: (0.07181898604536596, 74.76061297516087)
