In [147]:
import pandas as pd
import numpy as np
from scipy import stats
from cleaning import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [148]:
# Define the columns to drop
columns_to_drop = ['stm_sap_mon_meld_ddt', 'stm_mon_begin_ddt', 'stm_mon_toelichting_trdl', 'stm_oh_pg_mld',
                   'stm_scenario_mon', 'stm_mon_nr_status_omschr', 'stm_mon_nr__statuscode', 'stm_mon_nr_status_wijzdd',
                   'stm_aanntpl_ddt', 'stm_objectdl_code_gst', 'stm_objectdl_groep_gst', 'stm_progfh_in_ddt',
                   'stm_progfh_in_invoer_ddt', 'stm_progfh_gw_ddt', 'stm_progfh_gw_lwd_ddt', 'stm_progfh_hz',
                   'stm_veroorz_groep', 'stm_veroorz_code', 'stm_veroorz_tekst_kort', 'stm_effect', 'stm_afspr_aanvangddt',
                   'stm_mon_eind_ddt', 'stm_mon_vhdsincident', 'stm_dir_betrok_tr', 'stm_aangelegd_dd', 'stm_aangelegd_tijd',
                   'stm_mon_begindatum', 'stm_mon_begintijd', 'stm_progfh_gw_datum', 'stm_mon_eind_datum', 'stm_mon_eind_tijd',
                   'stm_controle_dd', 'stm_akkoord_mon_toewijz', 'stm_status_sapnaarmon', 'stm_fact_jn', 'stm_akkoord_melding_jn',
                   'stm_afsluit_ddt', 'stm_afsluit_dd', 'stm_afsluit_tijd', 'stm_rec_toegev_ddt', 'stm_hinderwaarde',
                   'stm_actie', 'stm_standplaats', 'stm_status_gebr', 'stm_wbi_nummer', 'stm_projnr', 'stm_historie_toelichting',
                   'stm_schade_verhaalb_jn', 'stm_schadenr', 'stm_schade_status_ga', 'stm_schade_statusdatum', 'stm_relatiervo_vorig',
                   'stm_relatiervo_volgend', 'stm_relatiervo', 'stm_afspr_func_hersteldd', 'stm_afspr_func_hersteltijd',
                   'stm_sorteerveld', 'stm_rapportage_maand', 'stm_rapportage_jaar', 'stm_x_bron_publ_dt', 'stm_x_bron_bestandsnaam',
                   'stm_x_bron_arch_dt', 'stm_x_actueel_ind', 'stm_x_run_id', 'stm_x_bk', 'stm_x_start_sessie_dt', 'stm_x_vervallen_ind']

# Load the data
df = load_data('sap_storing_data_hu_subset.csv')

# Drop the unnecessary columns
df = drop_columns(df, columns_to_drop)

# Clean the data (handle missing values, remove columns with excessive NaNs)
#df, avg_list, mode_list = clean_data(df)

# Filter de data
df = filter_data(df)

# Save the cleaned data to a new CSV (optional)
save_data(df, 'final_db_cleaned.csv')

# Display results (optional)
#print("Data cleaning complete.")
#print("Columns filled with averages: ", avg_list)
#print("Columns filled with mode: ", mode_list)
#print("Cleaned dataframe shape: ", df.shape)


In [149]:
# Tijdelijk
df[['stm_sap_meld_ddt', 'stm_fh_ddt', 'stm_progfh_in_invoer_tijd', 'stm_progfh_in_invoer_dat', 'totale_functiehersteltijd']].sample(10)
df['stm_progfh_in_invoer_tijd'].dtype

dtype('O')

In [150]:
df.describe

<bound method NDFrame.describe of     Unnamed: 0  #stm_sap_meldnr  stm_mon_nr stm_vl_post    stm_sap_meld_ddt  \
1            2         50053213        48.0          GN 2006-01-02 12:35:00   
2            3         50053214        72.0          ZL 2006-01-02 16:40:00   
3            4         50053215        96.0          ZL 2006-01-02 22:30:00   
4            5         50053218        38.0         EHV 2006-01-02 11:23:00   
5            6         50053219        99.0         EHV 2006-01-02 23:25:00   
6            7         50053220         0.0         NaN 2006-01-02 16:45:00   
7            8         50053222         0.0         NaN 2006-01-02 14:18:00   
10          11         50053225         0.0         NaN 2006-01-02 10:45:00   
12          13         50053227         0.0         NaN 2006-01-03 19:18:00   
14          15         50053233        45.0          GN 2006-01-03 10:32:00   
15         200         50053574         3.0          AH 2006-01-08 04:10:00   
16         201    

In [151]:
# Updated function to handle both scalar and array return types
def mode_per_group(group):
    mode_result = stats.mode(group, keepdims=True)  # Ensure it returns in an array-like format
    return mode_result.mode[0]  # Safely access the mode value

# Calculate baseline (mode) for each oorzaak code
baseline_modes = df.groupby('stm_oorz_code')['stm_fh_duur'].apply(mode_per_group).reset_index()
baseline_modes.columns = ['stm_oorz_code', 'baseline_fh_duur']

# Merge baseline values with the original DataFrame
df = df.merge(baseline_modes, on='stm_oorz_code')

# Calculate the difference between actual time and baseline (mode)
df['difference'] = df['stm_fh_duur'] - df['baseline_fh_duur']

In [152]:
# Mean Absolute Error (MAE)
mae = df['difference'].abs().mean()
print(f'Mean Absolute Error (MAE): {mae}')


Mean Absolute Error (MAE): 52.52542372881356


In [153]:
# Check if the baseline prediction matches the actual time
df['correct_prediction'] = df['stm_fh_duur'] == df['baseline_fh_duur']

# Calculate the accuracy in percentage
accuracy = df['correct_prediction'].mean() * 100

# Print the accuracy
print(f'Accuracy of the baseline model: {accuracy:.2f}%')


Accuracy of the baseline model: 38.98%


In [154]:
# Onafhankelijke variabelen (X) en afhankelijke variabele (y)
X = df[['stm_progfh_in_duur', 'stm_oorz_code']]
y = df['stm_fh_duur']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Lineair regressiemodel aanmaken
model = LinearRegression()

# Train het model
model.fit(X_train, y_train)

# Maak voorspellingen op de testset
y_pred = model.predict(X_test)

# Bereken de evaluatiestatistieken
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Zet R2 om in een percentage
r2_percentage = r2 * 100

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_percentage:.2f}%')


Mean Squared Error: 4430.315235328863
R-squared: -142.58%
