# First look at ProRail data

In [None]:
# imported libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import random
import warnings
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the raw export; the first CSV column becomes the row index.
csv_path = "data/sap_storing_data_hu_project.csv"
storing_csv = pd.read_csv(csv_path, index_col=0)
df = pd.DataFrame(data=storing_csv)
# Preview the first rows.
df.head()

In [None]:
# Columns kept for the analysis, in the order they should appear in df.
selected_columns = [
    "#stm_sap_meldnr",
    "stm_mon_nr",
    "stm_vl_post",
    "stm_sap_meld_ddt",
    "stm_sap_meldtekst_lang",
    "stm_sap_meldtekst",
    "stm_geo_mld",
    "stm_geo_mld_uit_functiepl",
    "stm_equipm_nr_mld",
    "stm_equipm_soort_mld",
    "stm_equipm_omschr_mld",
    "stm_km_van_mld",
    "stm_km_tot_mld",
    "stm_prioriteit",
    "stm_status_melding_sap",
    "stm_aanngeb_ddt",
    "stm_oh_pg_gst",
    "stm_geo_gst",
    "stm_geo_gst_uit_functiepl",
    "stm_equipm_nr_gst",
    "stm_equipm_soort_gst",
    "stm_equipm_omschr_gst",
    "stm_km_van_gst",
    "stm_km_tot_gst",
    "stm_oorz_groep",
    "stm_oorz_code",
    "stm_oorz_tkst",
    "stm_fh_ddt",
    "stm_fh_status",
    "stm_sap_storeind_ddt",
    "stm_tao_indicator",
    "stm_tao_indicator_vorige",
    "stm_tao_soort_mutatie",
    "stm_tao_telling_mutatie",
    "stm_tao_beinvloedbaar_indicator",
    "stm_evb",
    "stm_sap_melddatum",
    "stm_sap_meldtijd",
    "stm_contractgeb_mld",
    "stm_functiepl_mld",
    "stm_techn_mld",
    "stm_contractgeb_gst",
    "stm_functiepl_gst",
    "stm_techn_gst",
    "stm_aanngeb_dd",
    "stm_aanngeb_tijd",
    "stm_aanntpl_dd",
    "stm_aanntpl_tijd",
    "stm_arbeid",
    "stm_progfh_in_datum",
    "stm_progfh_in_tijd",
    "stm_progfh_in_invoer_dat",
    "stm_progfh_in_invoer_tijd",
    "stm_progfh_in_duur",
    "stm_progfh_gw_tijd",
    "stm_progfh_gw_lwd_datum",
    "stm_progfh_gw_lwd_tijd",
    "stm_progfh_gw_duur",
    "stm_progfh_gw_teller",
    "stm_afspr_aanvangdd",
    "stm_afspr_aanvangtijd",
    "stm_fh_dd",
    "stm_fh_tijd",
    "stm_fh_duur",
    "stm_reactie_duur",
    "stm_sap_storeinddatum",
    "stm_sap_storeindtijd",
    "stm_oorz_tekst_kort",
    "stm_pplg_van",
    "stm_pplg_naar",
    "stm_dstrglp_van",
    "stm_dstrglp_naar",
]

# Narrow df down to exactly those columns.
df = df[selected_columns]

In [None]:
# Display 20 randomly sampled rows of df.
df.sample(20)

In [None]:
# Display the summary statistics that DataFrame.describe() produces for df.
df.describe()

In [None]:
# Unique values of each column (the first column is skipped).
# Columns are looked up with bracket indexing rather than by eval()-ing an
# attribute-access string: attribute access fails for names that are not
# valid Python identifiers (e.g. '#stm_sap_meldnr') and returns the DataFrame
# attribute instead of the column when a name collides with one.
for column in df.columns[1:]:
    print(f"{column}: ")
    print(df[column].unique())

In [None]:
# Report, per column, how many entries are missing (NaN).
missing_per_column = df.isna().sum()
for column, count in missing_per_column.items():
    print(f"{column}: {count}")

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The trailing empty subscript (`df.corr()[]`) was a syntax error and has been
# removed. numeric_only=True keeps the string columns (timestamps, free text)
# out of the computation, which pandas >= 2.0 would otherwise reject.
df.corr(numeric_only=True)

In [None]:
# Number of columns currently in df.
len(df.columns)

In [None]:
# Box plot of stm_fh_duur; missing values are dropped first.
plt.boxplot(df['stm_fh_duur'].dropna())

In [None]:
# Display 20 random rows of the stm_fh_ddt and stm_sap_storeind_ddt columns side by side.
df[['stm_fh_ddt', 'stm_sap_storeind_ddt']].sample(20)

In [None]:
# Seconds elapsed between stm_sap_meld_ddt and stm_fh_ddt for every row of df.
#
# The two columns are walked in lockstep instead of being looked up one row at
# a time through `.loc[df.index[i]]`; that lookup is slow and returns a Series
# rather than a scalar when index labels are not unique.
dt_format = '%d/%m/%Y %H:%M:%S'
duration = []
for ddtb, ddte in zip(df['stm_sap_meld_ddt'], df['stm_fh_ddt']):
    # Missing timestamps are read as NaN (a float), so anything that is not a
    # string cannot be parsed. Such rows get -1 as a placeholder so that
    # `duration` keeps one entry per row of df.
    if not (isinstance(ddtb, str) and isinstance(ddte, str)):
        duration.append(-1)
        continue
    timedif = datetime.strptime(ddte, dt_format) - datetime.strptime(ddtb, dt_format)
    # Whole seconds of the difference; negative when the end precedes the start.
    duration.append(timedif.days * (24 * 60 * 60) + timedif.seconds)


In [None]:
# Compare the stm_fh_duur column with the durations computed from the
# timestamps. `duration` holds seconds, so dividing by 60 gives minutes.
# NOTE(review): this presumes stm_fh_duur is expressed in minutes - confirm.
axis_max = 400000
duration2 = np.asarray(duration) / 60
x = np.arange(axis_max)
# plt.plot(x, x, color='red')
plt.scatter(df['stm_fh_duur'], duration2, s=3, alpha=0.02)
plt.xlim(-1, axis_max)
plt.ylim(-1, axis_max)
plt.show()

In [None]:
# Largest computed duration, in seconds.
max(duration)

In [None]:
# Box plot of the computed durations (seconds).
# Note: the -1 placeholders for rows with a missing timestamp are included.
plt.boxplot(duration)

In [None]:
# First quartile, median and third quartile of the computed durations (seconds).
# Note: the -1 placeholders for rows with a missing timestamp are included.
np.quantile(duration, [0.25,0.5,0.75])

In [None]:
# Python type of the stm_sap_meld_ddt value for the index label at position 2.
type(df['stm_sap_meld_ddt'].loc[df.index[2]])

In [None]:
# Sanity check of the seconds-between-timestamps arithmetic on two fixed,
# hand-picked timestamps (day/month/year order).
dt_format = '%d/%m/%Y %H:%M:%S'
test_dt1 = '02/01/2006 09:00:00'
test_dt2 = '02/02/2010 08:00:10'
dt_o1, dt_o2 = (
    datetime.strptime(stamp, dt_format) for stamp in (test_dt1, test_dt2)
)
timedif = dt_o2 - dt_o1
# Whole days converted to seconds, plus the remaining seconds of the last day.
seconds_per_day = 24 * 60 * 60
timedifsec = timedif.days * seconds_per_day + timedif.seconds
timedifsec

In [None]:
# Python type of the first stm_sap_meld_ddt value.
# .iloc[0] selects by position. The plain `[0]` subscript is a label lookup on
# an index that comes from the CSV's first column, so it only works if a row
# happens to be labelled 0 (or through a deprecated positional fallback).
type(df['stm_sap_meld_ddt'].iloc[0])

In [None]:
# Missing-value (NaN) count for every column, one line per column.
nan_counts = df.isna().sum()
for column, count in nan_counts.items():
    print(f"{column}: {count}")

In [None]:
# df[(df['stm_fh_duur'] == 0)].count()
# Number of rows whose stm_fh_duur is either exactly 0 or missing.
fh_duur = df['stm_fh_duur']
zero_or_missing = (fh_duur == 0) | fh_duur.isna()
int(zero_or_missing.sum())

In [None]:
# Parse one start and one end timestamp from df and show their difference,
# once via print() and once as the cell's displayed value.
# NOTE(review): the start is taken from position 150000 and the end from
# position 200000, i.e. from two different rows - confirm this is intended.
dt_format = '%d/%m/%Y %H:%M:%S'
ddtb = df.loc[df.index[150000], 'stm_sap_meld_ddt']
ddte = df.loc[df.index[200000], 'stm_fh_ddt']
dt_o1 = datetime.strptime(ddtb, dt_format)
dt_o2 = datetime.strptime(ddte, dt_format)
elapsed = dt_o2 - dt_o1
print(elapsed)
elapsed

In [None]:
# Seconds elapsed between stm_fh_ddt and stm_sap_storeind_ddt, for the rows
# where both timestamps are present.
#
# The two columns are walked in lockstep instead of being looked up one row at
# a time through `.loc[df.index[i]]`; that lookup is slow and returns a Series
# rather than a scalar when index labels are not unique.
dt_format = '%d/%m/%Y %H:%M:%S'
fh_to_eindst = []
for ddtb, ddte in zip(df['stm_fh_ddt'], df['stm_sap_storeind_ddt']):
    # Missing timestamps are read as NaN (a float), so anything that is not a
    # string cannot be parsed. Such rows are skipped, so this list is shorter
    # than df and is not aligned with its rows.
    if not (isinstance(ddtb, str) and isinstance(ddte, str)):
        continue
    timedif = datetime.strptime(ddte, dt_format) - datetime.strptime(ddtb, dt_format)
    # Whole seconds of the difference; negative when the end precedes the start.
    fh_to_eindst.append(timedif.days * (24 * 60 * 60) + timedif.seconds)

In [None]:
# Box plot of the stm_fh_ddt -> stm_sap_storeind_ddt gaps, in seconds.
plt.boxplot(fh_to_eindst)

In [None]:
# First quartile, median and third quartile of the
# stm_fh_ddt -> stm_sap_storeind_ddt gaps, in seconds.
np.quantile(fh_to_eindst, [0.25,0.5,0.75])

In [None]:
# Largest stm_fh_ddt -> stm_sap_storeind_ddt gap, in seconds.
max(fh_to_eindst)

In [None]:
# Correlation of every numeric column with stm_fh_duur.
# numeric_only=True keeps the string columns (timestamps, free text) out of
# the computation, which pandas >= 2.0 would otherwise reject.
df.corr(numeric_only=True)['stm_fh_duur']