# Pipeline Data understanding and preparation

## Libraries importeren

In [113]:
# imported libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import random
import warnings
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
warnings.simplefilter(action='ignore', category=FutureWarning)

## DataFrame inladen

In [114]:
# Names of the CSV columns to load; this list is passed as `usecols` to
# `pd.read_csv` when the DataFrame is read. '#stm_sap_meldnr' is listed first
# and becomes the index of the resulting DataFrame.
cols_to_use = [
    '#stm_sap_meldnr',
    'stm_mon_nr',
    'stm_vl_post',
    'stm_sap_meld_ddt',
    'stm_sap_meldtekst_lang',
    'stm_sap_meldtekst',
    'stm_geo_mld',
    'stm_geo_mld_uit_functiepl',
    'stm_equipm_nr_mld',
    'stm_equipm_soort_mld',
    'stm_equipm_omschr_mld',
    'stm_km_van_mld',
    'stm_km_tot_mld',
    'stm_prioriteit',
    'stm_status_melding_sap',
    'stm_aanngeb_ddt',
    'stm_oh_pg_gst',
    'stm_geo_gst',
    'stm_geo_gst_uit_functiepl',
    'stm_equipm_nr_gst',
    'stm_equipm_soort_gst',
    'stm_equipm_omschr_gst',
    'stm_km_van_gst',
    'stm_km_tot_gst',
    'stm_oorz_groep',
    'stm_oorz_code',
    'stm_oorz_tkst',
    'stm_fh_ddt',
    'stm_fh_status',
    'stm_sap_storeind_ddt',
    'stm_tao_indicator',
    'stm_tao_indicator_vorige',
    'stm_tao_soort_mutatie',
    'stm_tao_telling_mutatie',
    'stm_tao_beinvloedbaar_indicator',
    'stm_evb',
    'stm_sap_melddatum',
    'stm_sap_meldtijd',
    'stm_contractgeb_mld',
    'stm_functiepl_mld',
    'stm_techn_mld',
    'stm_contractgeb_gst',
    'stm_functiepl_gst',
    'stm_techn_gst',
    'stm_aanngeb_dd',
    'stm_aanngeb_tijd',
    'stm_aanntpl_dd',
    'stm_aanntpl_tijd',
    'stm_arbeid',
    'stm_progfh_in_datum',
    'stm_progfh_in_tijd',
    'stm_progfh_in_invoer_dat',
    'stm_progfh_in_invoer_tijd',
    'stm_progfh_in_duur',
    'stm_progfh_gw_tijd',
    'stm_progfh_gw_lwd_datum',
    'stm_progfh_gw_lwd_tijd',
    'stm_progfh_gw_duur',
    'stm_progfh_gw_teller',
    'stm_afspr_aanvangdd',
    'stm_afspr_aanvangtijd',
    'stm_fh_dd',
    'stm_fh_tijd',
    'stm_fh_duur',
    'stm_reactie_duur',
    'stm_sap_storeinddatum',
    'stm_sap_storeindtijd',
    'stm_oorz_tekst_kort',
    'stm_pplg_van',
    'stm_pplg_naar',
    'stm_dstrglp_van',
    'stm_dstrglp_naar'
]

In [115]:
# Load the CSV file, reading only the columns listed in `cols_to_use`.
# The index column is selected by name rather than by position (index_col=0),
# so the result does not depend on the order in which the parser returns the
# selected columns.
df = pd.read_csv(
    "data/sap_storing_data_hu_project.csv",
    index_col='#stm_sap_meldnr',
    usecols=cols_to_use,
    engine='pyarrow',
)
df.head()

Unnamed: 0_level_0,stm_mon_nr,stm_vl_post,stm_sap_meld_ddt,stm_sap_meldtekst_lang,stm_sap_meldtekst,stm_geo_mld,stm_geo_mld_uit_functiepl,stm_equipm_nr_mld,stm_equipm_soort_mld,stm_equipm_omschr_mld,...,stm_fh_tijd,stm_fh_duur,stm_reactie_duur,stm_sap_storeinddatum,stm_sap_storeindtijd,stm_oorz_tekst_kort,stm_pplg_van,stm_pplg_naar,stm_dstrglp_van,stm_dstrglp_naar
#stm_sap_meldnr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
50053211,0.0,,02/01/2006 09:00:00,Logboeknr Geeltje : 49 Tijd: 0900 VL-Po...,Logboeknr Geeltje : 49 Tijd: 0900,624.0,624.0,,,,...,09:00:00,0.0,99999999.0,02/01/2006,09:00:00,,,,,
50053213,48.0,GN,02/01/2006 12:35:00,Logboeknr RBV : 48 Tijd: 1235 VL-Post: ...,Logboeknr RBV : 48 Tijd: 1235 VL-P,201.0,201.0,,,,...,13:26:00,51.0,99999999.0,02/01/2006,13:26:00,schapen op de spoorbaan!,,,Lp,Apg
50053214,72.0,ZL,02/01/2006 16:40:00,Logboeknr RBV : 72 Tijd: 1640 VL-Post: ...,Logboeknr RBV : 72 Tijd: 1640 VL-P,25.0,25.0,,,,...,17:20:00,40.0,99999999.0,02/01/2006,17:20:00,Persoon langs de baan,,,Hgl,
50053215,96.0,ZL,02/01/2006 22:30:00,Logboeknr RBV : 96 Tijd: 2230 VL-Post: ...,Logboeknr RBV : 96 Tijd: 2230 VL-P,12.0,12.0,,,,...,22:36:00,6.0,99999999.0,02/01/2006,22:36:00,Bijna aanrijding met persoon,,,Hgv,


## Prepareren Target variabele

De gekozen targetvariabele is de tijd (in minuten) vanaf het moment dat de aannemer ter plaatse is tot het functieherstel.

In [116]:
# Convert all relevant columns to datetimes. Each column has its own text
# format; values that do not match the format are coerced to NaT.
datetime_formats = {
    'stm_aanntpl_tijd': '%H:%M:%S',
    'stm_aanntpl_dd': '%d/%m/%Y',
    'stm_fh_ddt': '%d/%m/%Y %H:%M:%S',
}
for column, fmt in datetime_formats.items():
    df[column] = pd.to_datetime(df[column], format=fmt, errors='coerce')

# Show dtypes and non-null counts of the converted columns.
df[list(datetime_formats)].info()

<class 'pandas.core.frame.DataFrame'>
Index: 898526 entries, 0 to 99999999
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   stm_aanntpl_tijd  898488 non-null  datetime64[ns]
 1   stm_aanntpl_dd    689915 non-null  datetime64[ns]
 2   stm_fh_ddt        732891 non-null  datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 27.4 MB


In [117]:
# Combine the two columns into a single datetime feature 'aanntpl_ddt'.
#
# 'stm_aanntpl_tijd' was parsed from a time-only format, so each value carries a
# placeholder date. Subtracting the value's own midnight leaves just the time of
# day as a timedelta, which is added to the date in 'stm_aanntpl_dd'. If either
# part is NaT the result is NaT.
#
# This replaces the earlier round trip through strings (astype('str') -> split ->
# concatenate -> to_datetime), which relied on how datetimes happen to be
# rendered as text and ran a Python lambda per row. The source columns are no
# longer overwritten with strings, so they keep their datetime dtype and the
# cell can safely be re-run.
time_of_day = df['stm_aanntpl_tijd'] - df['stm_aanntpl_tijd'].dt.normalize()
df['aanntpl_ddt'] = df['stm_aanntpl_dd'] + time_of_day

# Report how many rows have no usable combined timestamp, then drop those rows.
print(df['aanntpl_ddt'].isna().sum())
df = df.dropna(subset=['aanntpl_ddt'])
df['aanntpl_ddt'].sample(30)

208611


#stm_sap_meldnr
80700775   2013-01-15 10:09:00
80493478   2009-02-16 00:57:00
80685807   2012-09-30 16:30:00
80752571   2014-02-05 01:44:00
80748551   2014-01-03 12:50:00
80797531   2015-03-02 08:51:00
80544885   2010-02-09 10:00:00
80770483   2014-07-02 08:30:00
80713337   2013-04-04 06:06:00
80891818   2017-07-11 23:30:01
80620576   2011-06-20 08:30:00
80701031   2013-01-15 14:20:00
80849896   2016-05-31 16:52:32
80810064   2015-06-24 08:30:00
80727542   2013-07-21 13:00:00
80821885   2015-10-03 14:15:00
80765617   2014-05-21 20:54:00
80452646   2008-04-22 07:17:00
80810979   2015-07-01 15:55:00
80844236   2016-04-15 01:30:00
80411287   2007-06-08 21:21:00
80756035   2014-03-07 15:16:00
80720674   2013-06-03 08:05:00
80765299   2014-05-20 09:09:00
80635440   2011-10-06 01:00:00
80538041   2009-12-29 12:11:00
70367000   2018-05-02 16:45:00
80618943   2011-06-07 10:20:00
80718309   2013-05-14 08:59:00
80695984   2012-12-10 04:51:00
Name: aanntpl_ddt, dtype: datetime64[ns]

In [118]:
# Compute the duration, in minutes, from 'aanntpl_ddt' (contractor on site)
# to 'stm_fh_ddt' (function recovery).
# `.dt.total_seconds()` is vectorised and handles negative durations and NaT
# directly, replacing the per-row lambda (x.seconds/60 + x.days * 24*60). Both
# timestamps are parsed with whole-second formats, so the values are the same.
df['anm_tot_fh'] = (df['stm_fh_ddt'] - df['aanntpl_ddt']).dt.total_seconds() / 60

# Drop rows for which no duration could be computed (result is NaN).
df = df.dropna(subset=['anm_tot_fh'])
df['anm_tot_fh'].sample(30)


#stm_sap_meldnr
80378902        90.000000
80520781        19.000000
80697628        35.000000
80705684        45.000000
80414160       110.000000
80747023         0.000000
80535476        35.000000
80694410        99.000000
80412715        15.000000
70058727        45.000000
80842138        31.716667
70435868       249.033333
80842693         5.000000
80708688        92.000000
80894673         0.816667
80852894        22.000000
80900123       283.983333
80851537         0.000000
80910851         1.000000
80544992         3.000000
80777520        16.000000
80652056        58.000000
80750644         9.000000
80854292         0.000000
80798872     45047.000000
80616194         3.000000
80346297        39.383333
80409474    234008.000000
80928627        30.016667
70441973       120.000000
Name: anm_tot_fh, dtype: float64

In [120]:
# Remove all rows with a negative duration, since there are not that many of them.
non_negative = df['anm_tot_fh'].ge(0)
df = df.loc[non_negative]