In [20]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [22]:
def Model_trainer_RandomForest(df, dummies_columns, min_minutes=4, max_minutes=480):
    """Fit a random-forest regressor that predicts 'stm_hers_tijd'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit' and
        every column named in `dummies_columns`.
    dummies_columns : str or list of str
        Feature column(s). They are passed through `pd.get_dummies`, so
        object/categorical columns are one-hot encoded.
    min_minutes, max_minutes : number, optional
        Inclusive bounds on 'stm_hers_tijd'; rows outside the range are
        dropped before fitting. The defaults (4 and 480) are the values that
        used to be hard-coded.

    Returns
    -------
    (X_train, y_train, regr)
        The encoded feature frame, the target series and the fitted
        `RandomForestRegressor`.

    Raises
    ------
    KeyError
        If a required column is missing from `df`.
    ValueError
        If no rows are left after the NaN and range filters.
    """
    target = 'stm_hers_tijd'
    feature_names = [dummies_columns] if isinstance(dummies_columns, str) else list(dummies_columns)

    # Rows lacking a target, cause code or priority have always been excluded;
    # that rule is kept so existing callers get the same training set, and it
    # is extended to whatever feature columns the caller asks for.
    # dict.fromkeys de-duplicates while preserving order.
    required = list(dict.fromkeys([target, 'stm_oorz_code', 'stm_prioriteit', *feature_names]))

    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError("Model_trainer_RandomForest: columns missing from df: {}".format(missing))

    def prep_data(df, nan_columns):
        # Drop incomplete rows, then keep only targets inside the inclusive range.
        df = df.dropna(subset=nan_columns)
        in_range = df[target].between(min_minutes, max_minutes)
        return df[in_range].reset_index(drop=True)

    def rfr_train_model(df, dummies_columns):
        X_train = pd.get_dummies(data=df[dummies_columns])
        y_train = df[target]
        # Fixed random_state keeps the fitted forest reproducible between runs.
        regr = RandomForestRegressor(max_depth=12, random_state=0)
        regr.fit(X_train, y_train)
        return X_train, y_train, regr

    df_RFR = prep_data(df[required], required)
    if df_RFR.empty:
        raise ValueError(
            "Model_trainer_RandomForest: no rows left after dropping NaNs and keeping "
            "{} <= stm_hers_tijd <= {}".format(min_minutes, max_minutes)
        )
    return rfr_train_model(df_RFR, dummies_columns)
    
def RandomForestTester (oorzaakcode, prioriteit, geschatte_tijd, X_train, y_train, regr):
    """Predict the repair time for one cause-code/priority pair and describe the result.

    Parameters
    ----------
    oorzaakcode : value for the 'stm_oorz_code' feature.
    prioriteit : value for the 'stm_prioriteit' feature.
    geschatte_tijd : number, NaN or None
        Reference repair time to compare the prediction against (the message
        labels it "werkelijke tijd"). 0, NaN and None all mean "not known".
    X_train, y_train
        Accepted for interface compatibility; not used here.
    regr
        Fitted estimator exposing `predict(DataFrame)`.

    Returns
    -------
    str
        A Dutch-language sentence with the rounded prediction and, when a
        reference time was given, the rounded absolute error.
    """
    # NOTE(review): the frame below carries the raw column names. This only
    # matches the training features if `pd.get_dummies` left those columns
    # untouched during training (i.e. they were numeric) -- confirm.
    X_test = pd.DataFrame({'stm_oorz_code': [oorzaakcode], 'stm_prioriteit': [prioriteit]})
    y_pred = regr.predict(X_test)

    # pd.isna covers both None and NaN. Checking it here means callers no
    # longer have to catch a ValueError and retry with 0 themselves.
    tijd_bekend = (not pd.isna(geschatte_tijd)) and geschatte_tijd != 0
    if not tijd_bekend:
        return "Bij oorzaakcode {}, prioriteit {} met onbekende werkelijke tijd gokt het: {}".format(oorzaakcode, prioriteit, round(y_pred[0]))

    # The RMSE over a single observation is simply the absolute error.
    afwijking = np.abs(y_pred[0] - geschatte_tijd)
    return "Bij oorzaakcode {}, prioriteit {} met werkelijke tijd {} gokt het: {}, dus Hij zit er {} naast".format(oorzaakcode, prioriteit, geschatte_tijd, round(y_pred[0]), round(afwijking))

def onetimerun(csv_path='sap_storing_data_hu_project.csv'):
    """Load the incident CSV and derive the repair time in minutes.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the file name that used to be
        hard-coded, so `onetimerun()` behaves as before.

    Returns
    -------
    pandas.DataFrame
        One row per unique '#stm_sap_meldnr', with 'stm_aanntpl_tijd' and
        'stm_fh_ddt' converted to datetimes and a new 'stm_hers_tijd' column.
        Rows where 'stm_hers_tijd' could not be computed are dropped and the
        index is renumbered from 0.
    """
    def minute_of_day(stamps):
        # Minutes since midnight; the date part of the timestamp is ignored.
        return stamps.dt.hour * 60 + stamps.dt.minute

    df = pd.read_csv(csv_path, low_memory=False)
    df = df.drop_duplicates(subset=['#stm_sap_meldnr']).reset_index(drop=True)

    for column in ('stm_aanntpl_tijd', 'stm_fh_ddt'):
        df[column] = pd.to_datetime(df[column])

    # Repair time = time-of-day of 'stm_fh_ddt' minus time-of-day of 'stm_aanntpl_tijd'.
    # NOTE(review): because only hour and minute are used, a repair that
    # crosses midnight comes out negative and anything longer than a day
    # wraps around -- confirm this is acceptable for the source data.
    df['stm_hers_tijd'] = minute_of_day(df['stm_fh_ddt']) - minute_of_day(df['stm_aanntpl_tijd'])

    return df.dropna(subset=['stm_hers_tijd']).reset_index(drop=True)

In [None]:
# Run once per kernel session: reads the CSV and adds the 'stm_hers_tijd' column.
df = onetimerun()

In [None]:
# Run once per kernel session: fits the forest on the cause-code and priority columns.
X_train, y_train, regr = Model_trainer_RandomForest(
    df=df,
    dummies_columns=['stm_oorz_code', 'stm_prioriteit'],
)

In [None]:
# Query for a single prediction: cause code and priority of the incident.
oorzaakcode, prioriteit = 218, 2
# Reference repair time to compare the prediction with; NaN means it is not known.
geschatte_tijd = np.nan

In [None]:
# RandomForestTester reports "unknown actual time" when it is given 0, so a
# missing (NaN/None) reference time is mapped to 0 up front. This replaces a
# try/except ValueError retry, which would also have swallowed unrelated
# ValueErrors raised during the first attempt.
werkelijke_tijd = 0 if pd.isna(geschatte_tijd) else geschatte_tijd
print(RandomForestTester (oorzaakcode, prioriteit, werkelijke_tijd, X_train, y_train, regr))