# Data Analyse ProRail Storingen

## Importeren benodigde libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
pd.set_option('display.max_columns', None)

## Data inlezen en eerste selectie

In [None]:
df = pd.read_csv('Data/sap_storing_data_hu_subset.csv', index_col=0, low_memory=False)

df = df[[
    "stm_mon_nr", "stm_vl_post", "stm_sap_meld_ddt", "stm_sap_meldtekst_lang",
    "stm_sap_meldtekst", "stm_geo_mld", "stm_equipm_nr_mld", "stm_equipm_soort_mld",
    "stm_equipm_omschr_mld", "stm_km_van_mld", "stm_km_tot_mld", "stm_prioriteit",
    "stm_aanngeb_ddt", "stm_oh_pg_gst", "stm_geo_gst", "stm_equipm_nr_gst",
    "stm_equipm_soort_gst", "stm_equipm_omschr_gst", "stm_km_van_gst", "stm_km_tot_gst",
    "stm_oorz_groep", "stm_oorz_code", "stm_oorz_tkst", "stm_fh_ddt",
    "stm_fh_status", "stm_sap_storeind_ddt", "stm_tao_indicator", "stm_tao_indicator_vorige",
    "stm_tao_soort_mutatie", "stm_tao_telling_mutatie", "stm_tao_beinvloedbaar_indicator", "stm_sap_melddatum",
    "stm_sap_meldtijd", "stm_contractgeb_mld", "stm_techn_mld", "stm_contractgeb_gst",
    "stm_techn_gst", "stm_aanngeb_dd", "stm_aanngeb_tijd", "stm_aanntpl_dd",
    "stm_aanntpl_tijd", "stm_progfh_in_datum", "stm_progfh_in_tijd", "stm_progfh_in_invoer_dat",
    "stm_progfh_in_invoer_tijd", "stm_progfh_in_duur", "stm_progfh_gw_tijd", "stm_progfh_gw_teller",
    "stm_fh_dd", "stm_fh_tijd", "stm_fh_duur", "stm_sap_storeinddatum",
    "stm_sap_storeindtijd", "stm_oorz_tekst_kort", "stm_pplg_van", "stm_pplg_naar",
    "stm_dstrglp_van", "stm_dstrglp_naar"
]]

df = df.drop_duplicates()
df.head()

## Selectie van relevante features voor analyse

In [3]:
data = df[[
    'stm_oorz_code',
    'stm_sap_melddatum',
    'stm_sap_meldtijd',
    'stm_geo_mld',
    'stm_aanntpl_tijd',
    'stm_fh_tijd',
    'stm_techn_mld',
    'stm_prioriteit',
    'stm_contractgeb_mld',
    'stm_fh_duur',
    'stm_progfh_in_duur',
    'stm_progfh_in_tijd'
]]

### Betekenis van de belangrijkste kolommen:
| Kolom | Betekenis | Type |
|-------|-----------|------|
| stm_oorz_code | Oorzaak code | Nominaal |
| stm_sap_melddatum | Datum melding | Ordinaal |
| stm_sap_meldtijd | Tijdstip melding | Ordinaal |
| stm_geo_mld | Geo code melding | Nominaal |
| stm_aanntpl_tijd | Tijdstip aannemer ter plaatse | Ordinaal |
| stm_fh_tijd | Tijdstip functieherstel | Ordinaal |
| stm_techn_mld | Techniekveld melding | Nominaal |
| stm_prioriteit | Prioriteitsindicatie | Ordinaal |
| stm_contractgeb_mld | Contract gebied melding | Nominaal |
| stm_fh_duur | Duur functieherstel | Continue |
| stm_progfh_in_duur | Prognose duur functieherstel | Continue |
| stm_progfh_in_tijd | Prognose tijd functieherstel | Ordinaal |

## Tijd kolommen converteren naar minuten van de dag

In [4]:
tijd_kolommen = ['stm_sap_meldtijd', 'stm_aanntpl_tijd', 'stm_fh_tijd', 'stm_progfh_in_tijd']

def convert_time_to_minutes(df, columns):
    for col in columns:
        df.loc[:, col] = pd.to_datetime(df[col], format='%H:%M:%S', errors='coerce')
        df.loc[:, col] = df[col].apply(lambda x: x.hour * 60 + x.minute if pd.notnull(x) else None)
    return df

data = convert_time_to_minutes(data, tijd_kolommen)
data[['stm_sap_meldtijd', 'stm_aanntpl_tijd', 'stm_fh_tijd', 'stm_fh_duur']]

Unnamed: 0,stm_sap_meldtijd,stm_aanntpl_tijd,stm_fh_tijd,stm_fh_duur
1,540,0,540,0.0
2,755,0,806,51.0
3,1000,0,1040,40.0
4,1350,0,1356,6.0
5,683,0,690,7.0
...,...,...,...,...
908625,486,545,569,83.0
908626,561,608,644,83.0
908627,561,608,644,83.0
908628,855,885,914,19.0


## Analyse functiehersteltijd (FHT)

In [5]:
print(len(data[data['stm_fh_duur'] == 0]))
print(len(data.loc[(data["stm_progfh_in_tijd"] == data["stm_fh_tijd"]) & (data["stm_fh_tijd"] != 0)]))
print(len(data[data['stm_fh_duur'] == (data['stm_progfh_in_tijd']-data['stm_aanntpl_tijd'])]))

8
7
8


### Data analyse bevindingen:
- 160.000 records zonder storing
- 140.000 records met prognose FHT gelijk aan werkelijke FHT
- 150.000 records met werkelijke FHT gelijk aan tijd aannemer tot prognose FHT

## Omzetten prognose duur naar numeriek

In [6]:
data['stm_progfh_in_duur'] = pd.to_numeric(data['stm_progfh_in_duur'], errors='coerce')
data['stm_progfh_in_duur'] = data['stm_progfh_in_duur'].fillna(0).astype(int)
print(len(data[data['stm_progfh_in_duur'] == (data['stm_progfh_in_tijd']-data['stm_aanntpl_tijd'])]))
print(len(data[data['stm_progfh_in_duur'] == data['stm_fh_duur']]))

35
3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stm_progfh_in_duur'] = pd.to_numeric(data['stm_progfh_in_duur'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stm_progfh_in_duur'] = data['stm_progfh_in_duur'].fillna(0).astype(int)


## Analyse prognose duur versus werkelijke duur

- 120.000 records met prognose FHT gelijk aan aannemer tijd + prognose tijd
- 9.000 records met prognose duur gelijk aan werkelijke FHT duur

## Target variabele maken

In [7]:
data['targetherstel'] = np.where(
    data['stm_fh_tijd'] - data['stm_aanntpl_tijd'] >= 0,
    data['stm_fh_tijd'] - data['stm_aanntpl_tijd'],
    data['stm_fh_tijd'] - data['stm_aanntpl_tijd'] + 1440
)

ranges = {
    "null": (0, 0),
    "tot 15m": (0, 15),
    "15m tot 30m": (15, 30),
    "30m tot 1h": (30, 60),
    "1h tot 2h": (60, 120),
    "2h tot 3h": (120, 180),
    "3h tot 6h": (180, 360),
    "6h tot 8h": (360, 540),
    "8h+": (540, float('inf'))
}

total = len(data)
for label, (low, high) in ranges.items():
    if label == "null":
        count = len(data[data['targetherstel'] == 0])
    else:
        count = len(data[(data['targetherstel'] > low) & (data['targetherstel'] <= high)])
    print(f'score {label}: {count/total:.2%}')

score null: 4.05%
score tot 15m: 8.11%
score 15m tot 30m: 16.22%
score 30m tot 1h: 16.22%
score 1h tot 2h: 8.11%
score 2h tot 3h: 4.05%
score 3h tot 6h: 1.35%
score 6h tot 8h: 4.05%
score 8h+: 37.84%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['targetherstel'] = np.where(


## Interpretatie verdeling targetherstel
- 20% meldingen zonder spoorstilstand
- 21% herstel binnen 15 minuten
- 13% herstel tussen 15-30 minuten
- 7% spoorstilstand boven 8 uur

## Filteren op relevante herstelduur (15 min tot 8 uur)

In [8]:
data = data.loc[(data['targetherstel'] > 15) & (data['targetherstel'] <= 540)]
data = data.loc[(data['stm_progfh_in_duur'] > 15) & (data['stm_progfh_in_duur'] <= 540)]
data = data.loc[(data["stm_progfh_in_tijd"] <= data["stm_fh_tijd"])]
data = data[data['stm_progfh_in_duur'] != (data['stm_progfh_in_tijd'] - data['stm_aanntpl_tijd'])]
data = data[data['stm_progfh_in_duur'] != data['stm_fh_duur']]

data = data.copy()
data.loc[:, 'stm_sap_melddatum_dag_van_het_jaar'] = pd.to_datetime(data['stm_sap_melddatum'], format='%d/%m/%Y').dt.dayofyear

data = data.loc[(data['stm_prioriteit'] != 8) & (data['stm_prioriteit'] != 9)]
data

Unnamed: 0,stm_oorz_code,stm_sap_melddatum,stm_sap_meldtijd,stm_geo_mld,stm_aanntpl_tijd,stm_fh_tijd,stm_techn_mld,stm_prioriteit,stm_contractgeb_mld,stm_fh_duur,stm_progfh_in_duur,stm_progfh_in_tijd,targetherstel,stm_sap_melddatum_dag_van_het_jaar


## Model voorbereiden en train-test split

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import joblib