In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('sap_storing_data_hu_project.csv', low_memory = False)  

# Remove duplicates, date time convert, hersteltijd created, dropped nans

In [3]:
df = df.drop_duplicates(subset=['#stm_sap_meldnr'])
df = df.reset_index(drop=True)

df['stm_aanntpl_tijd']= pd.to_datetime(df['stm_aanntpl_tijd'])
df['stm_fh_ddt']= pd.to_datetime(df['stm_fh_ddt'])

# Calculate the true reparation time
df['stm_hers_tijd'] = (df['stm_fh_ddt'].dt.hour * 60 + df['stm_fh_ddt'].dt.minute) - (df['stm_aanntpl_tijd'].dt.hour * 60 + df['stm_aanntpl_tijd'].dt.minute)

df = df.dropna(subset=['stm_hers_tijd']) 
df = df.reset_index(drop=True)

# Linear regression

In [4]:
df_LR = df[['#stm_sap_meldnr', 'stm_hers_tijd', 'stm_oorz_groep','stm_prioriteit', 'stm_oorz_code']]

In [5]:
def prep_data(df, nan_columns):    
    df = df.dropna(subset=nan_columns) #['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit']
    df = df.reset_index(drop=True)
    
    # TODO: waarom zoveel weggegooid, explain
    df = df[(df['stm_hers_tijd'] >= 4) & (df['stm_hers_tijd'] <= 480)]
    df = df.reset_index(drop=True)
    return df

In [6]:
def split_data (df, dummies_columns):
    DummiesX = pd.get_dummies(data=df[dummies_columns]) # ['stm_oorz_code', 'stm_prioriteit']
    y = df['stm_hers_tijd']
    
    X_train, X_test, y_train, y_test = train_test_split(DummiesX, y, random_state=2)
    
    return X_train, X_test, y_train, y_test

In [7]:
def lr_train_model(X_train, X_test, y_train, y_test):
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    y_pred = linreg.predict(X_test)

    RMSE = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

In [8]:
df_LR = prep_data(df_LR, ['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit'])

In [9]:
X_train, X_test, y_train, y_test = split_data(df_LR, ['stm_oorz_code', 'stm_prioriteit'])

In [10]:
lr_train_model(X_train, X_test, y_train, y_test)

In [11]:
# def best_max_depth_calculator():
#     depthlist = []
#     depth_r2 = []
#     for depth in range(1,15):
#         regr = RandomForestRegressor(max_depth=depth, random_state=0)
#         regr.fit(X_train, y_train)
#         y_pred = regr.predict(X_test)
#         depthlist.append(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
#         depth_r2.append(r2_score(y_test, y_pred))
#     return depthlist, depth_r2

In [12]:
# best_max_depth_calculator()

In [13]:
# met 12 voor max_depth is het beste, daarna gaat de RMSE score iets omhoog.

In [14]:
# met 12 voor max_depth is het beste, daarna gaat de RMSE score iets omhoog.

# Decision Tree Regressor

In [50]:
from sklearn.tree import DecisionTreeRegressor

In [60]:
df_DTR = df[['#stm_sap_meldnr', 'stm_hers_tijd', 'stm_oorz_groep','stm_prioriteit', 'stm_oorz_code']]

In [61]:
def dtr_train_model(X_train, X_test, y_train, y_test, maxDepth):
    DTR = DecisionTreeRegressor(max_depth=maxDepth)
    DTR.fit(X_train,y_train)
    y_pred = DTR.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test, y_pred)
    return r2, RMSE

In [62]:
df_DTR = prep_data(df_DTR, ['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit'])

In [63]:
X_train, X_test, y_train, y_test = split_data(df_DTR, ['stm_oorz_code', 'stm_prioriteit'])

In [64]:
dtr_train_model(X_train, X_test, y_train, y_test, 2)

(0.016480960876911754, 76.9569591837372)

In [65]:
from sklearn.tree import DecisionTreeRegressor

def decision_tree_regressor_predict_proba(X_train, y_train, X_test, **kwargs):
    """Trains DecisionTreeRegressor model and predicts probabilities of each y.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: New data to predict on.
        **kwargs: Other arguments passed to DecisionTreeRegressor.

    Returns:
        DataFrame with columns for record_id (row of X_test), y 
        (predicted value), and prob (of that y value).
        The sum of prob equals 1 for each record_id.
    """
    # Train model.
    m = DecisionTreeRegressor(**kwargs).fit(X_train, y_train)
    # Get y values corresponding to each node.
    node_ys = pd.DataFrame({'node_id': m.apply(X_train), 'y': y_train})
    print(node_ys)
    # Calculate probability as 1 / number of y values per node.
    node_ys['prob'] = 1 / node_ys.groupby(node_ys.node_id).transform('count')
    # Aggregate per node-y, in case of multiple training records with the same y.
    node_ys_dedup = node_ys.groupby(['node_id', 'y']).prob.sum().to_frame()\
        .reset_index()
    # Extract predicted leaf node for each new observation.
    leaf = pd.DataFrame(m.decision_path(X_test).toarray()).apply(
        lambda x:x.to_numpy().nonzero()[0].max(), axis=1).to_frame(
            name='node_id')
    leaf['record_id'] = leaf.index
    # Merge with y values and drop node_id.
    return leaf.merge(node_ys_dedup, on='node_id').drop(
        'node_id', axis=1).sort_values(['record_id', 'y'])

In [66]:
df_prob = decision_tree_regressor_predict_proba(X_train, y_train, X_test, random_state=0)

        node_id      y
242455      181   50.0
49341       536  150.0
104996      181   15.0
119100      626   66.0
92276       510   60.0
...         ...    ...
33867       100   35.0
84434       520   47.0
95816       154    6.0
203245      467   41.0
100879      520   42.0

[258359 rows x 2 columns]


In [67]:
df_prob.columns

Index(['record_id', 'y', 'prob'], dtype='object')

In [68]:
df_prob.sort_values(by='prob', ascending=False)

Unnamed: 0,record_id,y,prob
27435716,15849,464.0,1.000000
25485463,72887,270.0,1.000000
27437896,48470,422.0,1.000000
27438105,79422,96.0,1.000000
27433752,13229,66.0,1.000000
...,...,...,...
4738783,57293,377.0,0.000066
4890614,63111,219.0,0.000066
4890661,63111,266.0,0.000066
4357988,42361,433.0,0.000066


# Random Forest Regressor

In [23]:
df_RFR = df[['#stm_sap_meldnr', 'stm_hers_tijd', 'stm_oorz_groep','stm_prioriteit', 'stm_oorz_code']]
    
def rfr_train_model (X_train, X_test, y_train, y_test, maxDepth):
    regr = RandomForestRegressor(max_depth=maxDepth, random_state=0)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    meansquare = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test, y_pred)
    return r2, meansquare

def rfc_train_model (X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(max_depth=15, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    meansquare = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test, y_pred)
    return r2, meansquare

In [24]:
df_RFR = prep_data(df_RFR, ['stm_hers_tijd', 'stm_oorz_code', 'stm_prioriteit'])

In [25]:
X_train, X_test, y_train, y_test = split_data(df_RFR, ['stm_oorz_code', 'stm_prioriteit'])

In [26]:
rfr_train_model(X_train, X_test, y_train, y_test, 12)

(0.0720348860973693, 74.75191760286457)