# Pipeline Data understanding and preparation

## Libraries importeren

In [96]:
# imported libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import random
import warnings
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
warnings.simplefilter(action='ignore', category=FutureWarning)

## DataFrame inladen

In [97]:
# Names of the CSV columns to load; this list is handed to `pd.read_csv(usecols=...)`.
# Written as one whitespace-separated block and split into a list of strings,
# so the order below is the order of the resulting list.
cols_to_use = """
    #stm_sap_meldnr stm_mon_nr stm_vl_post stm_sap_meld_ddt
    stm_sap_meldtekst_lang stm_sap_meldtekst
    stm_geo_mld stm_geo_mld_uit_functiepl
    stm_equipm_nr_mld stm_equipm_soort_mld stm_equipm_omschr_mld
    stm_km_van_mld stm_km_tot_mld
    stm_prioriteit stm_status_melding_sap stm_aanngeb_ddt stm_oh_pg_gst
    stm_geo_gst stm_geo_gst_uit_functiepl
    stm_equipm_nr_gst stm_equipm_soort_gst stm_equipm_omschr_gst
    stm_km_van_gst stm_km_tot_gst
    stm_oorz_groep stm_oorz_code stm_oorz_tkst
    stm_fh_ddt stm_fh_status stm_sap_storeind_ddt
    stm_tao_indicator stm_tao_indicator_vorige stm_tao_soort_mutatie
    stm_tao_telling_mutatie stm_tao_beinvloedbaar_indicator
    stm_evb stm_sap_melddatum stm_sap_meldtijd
    stm_contractgeb_mld stm_functiepl_mld stm_techn_mld
    stm_contractgeb_gst stm_functiepl_gst stm_techn_gst
    stm_aanngeb_dd stm_aanngeb_tijd stm_aanntpl_dd stm_aanntpl_tijd
    stm_arbeid
    stm_progfh_in_datum stm_progfh_in_tijd
    stm_progfh_in_invoer_dat stm_progfh_in_invoer_tijd stm_progfh_in_duur
    stm_progfh_gw_tijd stm_progfh_gw_lwd_datum stm_progfh_gw_lwd_tijd
    stm_progfh_gw_duur stm_progfh_gw_teller
    stm_afspr_aanvangdd stm_afspr_aanvangtijd
    stm_fh_dd stm_fh_tijd stm_fh_duur stm_reactie_duur
    stm_sap_storeinddatum stm_sap_storeindtijd
    stm_oorz_tekst_kort
    stm_pplg_van stm_pplg_naar stm_dstrglp_van stm_dstrglp_naar
""".split()

In [98]:
# Read the CSV export, restricted to the columns listed in `cols_to_use`.
# `index_col=0` promotes the first loaded column to the index, and the
# pyarrow engine is used for parsing.
df = pd.read_csv(
    "data/sap_storing_data_hu_project.csv",
    usecols=cols_to_use,
    index_col=0,
    engine="pyarrow",
)
# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0_level_0,stm_mon_nr,stm_vl_post,stm_sap_meld_ddt,stm_sap_meldtekst_lang,stm_sap_meldtekst,stm_geo_mld,stm_geo_mld_uit_functiepl,stm_equipm_nr_mld,stm_equipm_soort_mld,stm_equipm_omschr_mld,...,stm_fh_tijd,stm_fh_duur,stm_reactie_duur,stm_sap_storeinddatum,stm_sap_storeindtijd,stm_oorz_tekst_kort,stm_pplg_van,stm_pplg_naar,stm_dstrglp_van,stm_dstrglp_naar
#stm_sap_meldnr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
50053211,0.0,,02/01/2006 09:00:00,Logboeknr Geeltje : 49 Tijd: 0900 VL-Po...,Logboeknr Geeltje : 49 Tijd: 0900,624.0,624.0,,,,...,09:00:00,0.0,99999999.0,02/01/2006,09:00:00,,,,,
50053213,48.0,GN,02/01/2006 12:35:00,Logboeknr RBV : 48 Tijd: 1235 VL-Post: ...,Logboeknr RBV : 48 Tijd: 1235 VL-P,201.0,201.0,,,,...,13:26:00,51.0,99999999.0,02/01/2006,13:26:00,schapen op de spoorbaan!,,,Lp,Apg
50053214,72.0,ZL,02/01/2006 16:40:00,Logboeknr RBV : 72 Tijd: 1640 VL-Post: ...,Logboeknr RBV : 72 Tijd: 1640 VL-P,25.0,25.0,,,,...,17:20:00,40.0,99999999.0,02/01/2006,17:20:00,Persoon langs de baan,,,Hgl,
50053215,96.0,ZL,02/01/2006 22:30:00,Logboeknr RBV : 96 Tijd: 2230 VL-Post: ...,Logboeknr RBV : 96 Tijd: 2230 VL-P,12.0,12.0,,,,...,22:36:00,6.0,99999999.0,02/01/2006,22:36:00,Bijna aanrijding met persoon,,,Hgv,


## Prepareren Target variabele

De gekozen targetvariabele is de tijd (in minuten) vanaf het moment dat de aannemer ter plaatse is tot aan het functieherstel.

In [99]:
# Convert the columns needed for the target variable to datetimes.
# Each column has its own explicit format; values that do not match it
# are turned into NaT because of errors='coerce'.
df['stm_aanntpl_tijd'] = df['stm_aanntpl_tijd'].pipe(
    pd.to_datetime, format='%H:%M:%S', errors='coerce'
)
df['stm_aanntpl_dd'] = df['stm_aanntpl_dd'].pipe(
    pd.to_datetime, format='%d/%m/%Y', errors='coerce'
)
df['stm_fh_ddt'] = df['stm_fh_ddt'].pipe(
    pd.to_datetime, format='%d/%m/%Y %H:%M:%S', errors='coerce'
)

# Report dtypes and non-null counts of the three converted columns.
df[['stm_aanntpl_tijd', 'stm_aanntpl_dd', 'stm_fh_ddt']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 898526 entries, 0 to 99999999
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   stm_aanntpl_tijd  898488 non-null  datetime64[ns]
 1   stm_aanntpl_dd    689915 non-null  datetime64[ns]
 2   stm_fh_ddt        732891 non-null  datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 27.4 MB


In [102]:
# Turn the separate date and time columns into text so they can be concatenated.
df[['stm_aanntpl_tijd', 'stm_aanntpl_dd']] = (
    df[['stm_aanntpl_tijd', 'stm_aanntpl_dd']].astype('str')
)
# Merge both parts into a single datetime feature: take the date text, append the
# last space-separated token of the time text (the clock part), and parse the result.
# Combinations that do not match the format become NaT (errors='coerce').
df['aanntpl_ddt'] = pd.to_datetime(
    df['stm_aanntpl_dd'] + ' ' + df['stm_aanntpl_tijd'].str.split(' ').str[-1],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)
# Number of rows for which no combined datetime could be built.
print(df['aanntpl_ddt'].isna().sum())
# Keep only the rows that have a valid combined datetime.
df = df.dropna(subset=['aanntpl_ddt'])
# Random spot check of the new column.
df['aanntpl_ddt'].sample(30)

0


#stm_sap_meldnr
80830497   2015-12-12 16:30:00
70080215   2015-07-27 07:31:00
80434524   2007-11-29 16:21:00
80738512   2013-10-16 14:20:00
80751773   2014-01-29 11:49:00
70336822   2018-01-14 03:48:01
80826323   2015-11-12 15:29:00
80531805   2009-11-25 19:31:00
80913523   2018-06-06 01:42:00
80711613   2013-03-22 02:29:00
70380284   2018-06-19 01:25:00
80790026   2014-12-18 09:11:00
80733498   2013-09-04 07:34:00
80474451   2008-09-29 09:24:00
80748828   2014-01-04 22:33:00
80682319   2012-09-03 16:25:00
80672621   2012-06-27 10:00:00
80921469   2018-10-06 06:50:00
80346229   2006-01-26 14:32:00
80829606   2015-12-06 04:35:00
80547050   2010-02-22 15:36:00
80772691   2014-07-18 20:07:00
80800311   2015-03-23 03:25:00
80721142   2013-06-06 09:00:00
80775804   2014-08-12 11:35:00
80758051   2014-03-25 22:55:00
80361203   2006-05-30 11:35:00
80909443   2018-03-29 14:26:00
80448694   2008-03-25 12:48:00
80411702   2007-06-11 18:22:00
Name: aanntpl_ddt, dtype: datetime64[ns]

In [103]:
# Compute the target: the duration in minutes from the moment the contractor is on
# site (aanntpl_ddt) until function recovery (stm_fh_ddt).
# The timedelta is converted with the vectorised `.dt.total_seconds()` instead of a
# row-wise Python lambda: it is much faster on a frame of this size and converts the
# whole duration in one step. Rows where either timestamp is NaT yield NaN.
# NOTE(review): nothing here prevents negative durations (recovery logged before
# arrival) — confirm whether those rows should be filtered out.
df['anm_tot_fh'] = (df['stm_fh_ddt'] - df['aanntpl_ddt']).dt.total_seconds() / 60
# Drop the rows for which no duration could be computed.
df = df.dropna(subset=['anm_tot_fh'])
# Random spot check of the computed durations.
df['anm_tot_fh'].sample(30)
