In [None]:
# imported libraries
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import TargetEncoder
import numpy as np
import json
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Only these columns are read from the CSV; all others are skipped at load time.
cols_to_use = [
    '#stm_sap_meldnr',
    'stm_geo_mld',
    'stm_prioriteit',
    'stm_geo_gst',
    'stm_oorz_groep',
    'stm_oorz_code',
    'stm_fh_ddt',
    'stm_contractgeb_mld',
    'stm_techn_mld',
    'stm_progfh_in_duur',
    'stm_progfh_in_invoer_dat',
    'stm_progfh_in_invoer_tijd',
]

# NOTE(review): index_col=0 is positional; presumably it resolves to
# '#stm_sap_meldnr' — confirm against the CSV header.
df = pd.read_csv("data/sap_storing_data_hu_project.csv", index_col=0, usecols=cols_to_use, engine='pyarrow')
df = df[~df.index.duplicated(keep='first')] # remove duplicate indices
df.head()

# Target variable prep

Prepareren van 'stm_progfh_in_invoer_dat': hier staan namelijk waardes in die soms geen '/' hebben. Dat ziet er als volgt uit: YYYYMMDD, in plaats van YYYY/MM/DD.

In [None]:
# Peek at 15 random values of the date column before it is cleaned.
df['stm_progfh_in_invoer_dat'].sample(15)

In [None]:
# Drop rows that have no value in the date column.
df = df.dropna(subset=['stm_progfh_in_invoer_dat'])

def fix_date_format(val):
    """Return a date value in 'YYYY/MM/DD' form when it has no '/' separators.

    Parameters:
    val: A date value such as '20200131' or '2020/01/31'. Non-string values
        (for example the integer 20200131) are converted to text first.

    Returns:
    The value unchanged when its text form already contains '/'; otherwise a
    string built from the first eight characters as 'YYYY/MM/DD'.
    """
    # Slice the text form, not the raw value: the raw value may be a number,
    # which cannot be sliced.
    text = str(val)
    if '/' in text:
        return val
    return f"{text[:4]}/{text[4:6]}/{text[6:8]}"

# Rewrite values without '/' via fix_date_format, then parse the column to datetime.
df['stm_progfh_in_invoer_dat'] = df['stm_progfh_in_invoer_dat'].apply(fix_date_format)
# errors='raise': a value that cannot be parsed stops the notebook instead of becoming NaT.
df['stm_progfh_in_invoer_dat'] = pd.to_datetime(df['stm_progfh_in_invoer_dat'], format='mixed', errors='raise')
# Peek at 15 random parsed values.
df['stm_progfh_in_invoer_dat'].sample(15)

In [None]:
# Drop rows where 'stm_progfh_in_invoer_tijd' is NaN
df = df.dropna(subset=['stm_progfh_in_invoer_tijd'])
df['stm_progfh_in_invoer_tijd'].info()

In [None]:
# Combine date and time into a single variable (joined as text with a space)
df['progfh_inv_ddt'] = df['stm_progfh_in_invoer_dat'].astype(str) + ' ' + df['stm_progfh_in_invoer_tijd'].astype(str)
# Use the ISO8601 format to parse the combined text into a datetime
df['progfh_inv_ddt'] = pd.to_datetime(df['progfh_inv_ddt'], format='ISO8601', errors='raise')
df['progfh_inv_ddt'].info()

In [None]:
# Peek at 20 random values of the combined date-time column.
df['progfh_inv_ddt'].sample(20)

In [None]:
# Use the 'mixed' format to parse the datetime, because some of the data is not in ISO8601 format.
# Sometimes the format is YYYY-MM-DD HH:MM:SS, and sometimes MM/DD/YYYY HH:MM:SS.
df['stm_fh_ddt'] = pd.to_datetime(df['stm_fh_ddt'], format='mixed', errors='raise')
# Drop rows that have no value in 'stm_fh_ddt'.
df = df.dropna(subset=['stm_fh_ddt'])
df['stm_fh_ddt'].info() 

In [None]:
# Peek at 20 random values of the parsed 'stm_fh_ddt' column.
df['stm_fh_ddt'].sample(20)

Maak een nieuwe kolom 'progfh_inv_tot_fh'; dit wordt de target variabele. Dit is de tijd vanaf de invoer van de prognose tot functieherstel.

Verwijder alle waardes kleiner dan 5 min of groter dan 480 min (8 uur).

In [None]:
# Target column: difference between 'stm_fh_ddt' and 'progfh_inv_ddt',
# converted from seconds to minutes in one step.
df['progfh_inv_tot_fh'] = (
    pd.to_timedelta(df['stm_fh_ddt'] - df['progfh_inv_ddt']).dt.total_seconds() / 60
)
# Keep only rows whose target lies in [5, 480]; both bounds are inclusive.
within_bounds = df['progfh_inv_tot_fh'].between(5, 480)
df = df[within_bounds]
df['progfh_inv_tot_fh'].info()

In [None]:
# Histogram of the target column with 100 bins.
df['progfh_inv_tot_fh'].plot.hist(bins=100, figsize=(20, 10))
plt.ylabel('Frequency')
plt.xlabel('Invoer prognose tot functie herstel in minuten')
plt.show()

---

# Feature variables prep

In [None]:

def rare_category(series: pd.Series, other_size=0.01, new_category='Other'):
    """
    Takes a categorical pandas Series object and returns another Series object
    in which all values that occur very rarely are merged into one category.

    Values are ranked from most to least frequent; every value at which the
    cumulative share of rows exceeds ``1 - other_size`` is treated as rare.

    Parameters:
    series (pandas.Series): The categorical pandas Series object
    other_size (float): Threshold for the rare tail as a fraction of the total number of values. Default is 0.01.
    new_category (str): The name of the new category for rare values. Default is 'Other'.

    Returns:
    pandas.Series: The new pandas Series object with rare values replaced by new_category.
    """
    counts = series.value_counts(normalize=True)
    # A set gives constant-time membership tests in the per-row lookup below.
    rare_values = set(counts[counts.cumsum() > 1 - other_size].index)
    return series.apply(lambda x: new_category if x in rare_values else x)

In [None]:
df = df.dropna(subset=['stm_progfh_in_duur'])

# ==============================================================================
# *** WARNING ***
# About 1000 values in stm_progfh_in_duur have a '-' sign appended to them.
# For now the rows holding these values are removed.
# ==============================================================================

def _is_int_convertible(val):
    """Return True when int(val) succeeds and False when it raises ValueError."""
    try:
        int(val)
    except ValueError:
        return False
    return True

# Build one validity mask in a single pass instead of re-filtering the whole
# frame once per invalid value.
valid_duur = df['stm_progfh_in_duur'].map(_is_int_convertible).astype(bool)
count = int((~valid_duur).sum())
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame.
df = df[valid_duur].copy()
print(f"Removed {count} values from stm_progfh_in_duur column")

df['stm_progfh_in_duur'] = df['stm_progfh_in_duur'].astype(int)
# Keep only values below 48*60.
# NOTE(review): presumably the column is in minutes, making this a 48-hour cap — confirm.
df = df[df['stm_progfh_in_duur'] < (48*60)]
df['stm_progfh_in_duur'].info()

In [None]:
# Histogram of the cleaned 'stm_progfh_in_duur' column with 100 bins.
df['stm_progfh_in_duur'].plot(kind='hist', bins=100, figsize=(20, 10))
plt.show()

## NaN values

In [None]:
# PRIORITY: rows without a priority are dropped.
df = df.dropna(subset=['stm_prioriteit'])

# GEO CODE: drop empty and missing codes, then normalise the remaining values
# through float -> int -> str.
df = df[df.stm_geo_mld != '']
# .copy() makes the filtered frame independent before columns are assigned.
df = df.dropna(subset=['stm_geo_mld']).copy()
df['stm_geo_mld'] = df['stm_geo_mld'].astype(float).astype(int).astype(str)

# CAUSE CODE: missing codes are filled with 221.
# NOTE(review): 221 is presumably the code for an unknown cause — confirm.
df['stm_oorz_code'] = df['stm_oorz_code'].fillna(221).astype(int).astype(str)

# TECHNICAL FIELD: empty and missing values become 'X'.
df['stm_techn_mld'] = df['stm_techn_mld'].replace('', 'X').fillna('X')

# CONTRACT AREA: missing values become 999, then float -> int -> str.
df['stm_contractgeb_mld'] = (
    df['stm_contractgeb_mld'].fillna(999).astype(float).astype(int).astype(str)
)

# CAUSE GROUP: empty and missing values become 'ONBK'. This is a literal
# whole-value replacement, as for the technical field above; an empty pattern
# must not be used as a regular expression because it matches at every position.
df['stm_oorz_groep'] = df['stm_oorz_groep'].replace('', 'ONBK').fillna('ONBK')

df.info()


## Dummies

In [None]:
# Dummy (one-hot) columns for the technical field, prefixed 'techn_veld'
techn_veld_dummies = pd.get_dummies(df['stm_techn_mld'], prefix='techn_veld')
df = df.join(techn_veld_dummies)

# Dummy (one-hot) columns for the cause group, prefixed 'oorzgr'
oorz_groep_dummies = pd.get_dummies(df['stm_oorz_groep'], prefix='oorzgr')
df = df.join(oorz_groep_dummies)

## Target encoding

In [None]:
# Merge the rare tail of each high-cardinality column into one category
# before target encoding.
for rare_col in ('stm_oorz_code', 'stm_geo_mld', 'stm_contractgeb_mld'):
    df[rare_col] = rare_category(df[rare_col], other_size=0.05)

In [None]:
# For random state 33/37 all unique values for which we use target encoding are represented.
# df_train is 80% of the dataset
df_train = df.sample(frac = 0.8, random_state=33)
# df_test is the remaining 20%
df_test = df.drop(df_train.index)

In [None]:
# Set up the TargetEncoder
# Our target is a duration in minutes, so continuous
tEnc = TargetEncoder(target_type="continuous", random_state=42)
# Training target used by the encoder fits in the cells below.
y = df_train['progfh_inv_tot_fh']

### stm_oorz_code

In [None]:
# Fit the encoder on the cause-code feature of the training data and store the
# encoded values in a new training column.
X = df_train['stm_oorz_code'].to_numpy().reshape(-1, 1)
df_train['oorz_code_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Category -> encoding lookup for the cause code (for the dashboard).
oorzc_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

In [None]:
# Create the same column in the test data using the encodings from the training data.
# The direct dictionary lookup raises KeyError for a category that is not in oorzc_dict.
df_test['oorz_code_enc'] = df_test['stm_oorz_code'].apply(lambda x : oorzc_dict[x])

### stm_geo_mld

In [None]:
# Fit the encoder on the geo-code feature of the training data and store the
# encoded values in a new training column.
X = df_train['stm_geo_mld'].to_numpy().reshape(-1, 1)
df_train['geo_code_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Category -> encoding lookup for the geo code (for the dashboard).
geo_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

In [None]:
# Create the same column in the test data using the encodings from the training data.
# The direct dictionary lookup raises KeyError for a category that is not in geo_dict.
df_test['geo_code_enc'] = df_test['stm_geo_mld'].apply(lambda x : geo_dict[x])

### stm_contractgeb_mld

In [None]:
# Fit the encoder on the contract-area feature of the training data and store
# the encoded values in a new training column.
X = df_train['stm_contractgeb_mld'].to_numpy().reshape(-1, 1)
df_train['contractgb_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Category -> encoding lookup for the contract area (for the dashboard).
contrgb_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

In [None]:
# Create the same column in the test data using the encodings from the training data.
# The direct dictionary lookup raises KeyError for a category that is not in contrgb_dict.
df_test['contractgb_enc'] = df_test['stm_contractgeb_mld'].apply(lambda x : contrgb_dict[x])

### stm_techn_mld

In [None]:
# Fit the encoder on the technical-field feature of the training data and
# store the encoded values in a new training column.
X = df_train['stm_techn_mld'].to_numpy().reshape(-1, 1)
df_train['techn_veld_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Category -> encoding lookup for the technical field (for the dashboard).
techn_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

In [None]:
# Create the same column in the test data using the encodings from the training data.
# The direct dictionary lookup raises KeyError for a category that is not in techn_dict.
df_test['techn_veld_enc'] = df_test['stm_techn_mld'].apply(lambda x : techn_dict[x])

In [None]:
# Summary of the training frame after the encoded columns were added.
df_train.info()

In [None]:
# Renumber the training rows from 0; every selection below inherits this index,
# so the pieces line up row-for-row when concatenated.
df_train = df_train.reset_index(drop=True)

# Single-column features.
fh_prog = df_train['stm_progfh_in_duur']
oorz_code_enc = df_train['oorz_code_enc']
geo_code_enc = df_train['geo_code_enc']
contractgb_enc = df_train['contractgb_enc']
techn_enc = df_train['techn_veld_enc']
prioriteit = df_train['stm_prioriteit']

# Dummy-column groups, selected by their explicit column names.
techn_veld_cols = [
    'techn_veld_A', 'techn_veld_B', 'techn_veld_E', 'techn_veld_G',
    'techn_veld_I', 'techn_veld_K', 'techn_veld_M', 'techn_veld_O',
    'techn_veld_P', 'techn_veld_S', 'techn_veld_T', 'techn_veld_X',
]
oorz_groep_cols = [
    'oorzgr_ONBK', 'oorzgr_ONR-DERD', 'oorzgr_ONR-RIB',
    'oorzgr_TECHONV', 'oorzgr_WEER',
]
techn_veld_dummies = df_train[techn_veld_cols]
oorz_groep_dummies = df_train[oorz_groep_cols]

# The order of this list is the column order of the output frame (after the target).
features_to_use = [
    fh_prog,
    oorz_code_enc,
    geo_code_enc,
    contractgb_enc,
    techn_enc,
    prioriteit,
    techn_veld_dummies,
    oorz_groep_dummies,
]

# The target goes first, followed by the features.
train_df = pd.concat([df_train['progfh_inv_tot_fh'], *features_to_use], axis=1)
train_df.head()

In [None]:
# Persist the training frame as a pickle file.
train_df.to_pickle("data/ole_train_df.pkl")

In [None]:
# Renumber the test rows from 0; every selection below inherits this index,
# so the pieces line up row-for-row when concatenated.
df_test = df_test.reset_index(drop=True)

# Single-column features.
fh_prog = df_test['stm_progfh_in_duur']
oorz_code_enc = df_test['oorz_code_enc']
geo_code_enc = df_test['geo_code_enc']
contractgb_enc = df_test['contractgb_enc']
techn_enc = df_test['techn_veld_enc']
prioriteit = df_test['stm_prioriteit']

# Dummy-column groups, selected by their explicit column names.
techn_veld_cols = [
    'techn_veld_A', 'techn_veld_B', 'techn_veld_E', 'techn_veld_G',
    'techn_veld_I', 'techn_veld_K', 'techn_veld_M', 'techn_veld_O',
    'techn_veld_P', 'techn_veld_S', 'techn_veld_T', 'techn_veld_X',
]
oorz_groep_cols = [
    'oorzgr_ONBK', 'oorzgr_ONR-DERD', 'oorzgr_ONR-RIB',
    'oorzgr_TECHONV', 'oorzgr_WEER',
]
techn_veld_dummies = df_test[techn_veld_cols]
oorz_groep_dummies = df_test[oorz_groep_cols]

# The order of this list is the column order of the output frame (after the target).
features_to_use = [
    fh_prog,
    oorz_code_enc,
    geo_code_enc,
    contractgb_enc,
    techn_enc,
    prioriteit,
    techn_veld_dummies,
    oorz_groep_dummies,
]

# The target goes first, followed by the features.
test_df = pd.concat([df_test['progfh_inv_tot_fh'], *features_to_use], axis=1)
test_df.head()

In [None]:
# Persist the test frame as a pickle file.
test_df.to_pickle("data/ole_test_df.pkl")

In [None]:
# Stack the train and test rows into one modelling frame. Both inputs were
# re-indexed from 0, so a plain concat would leave duplicated row labels;
# ignore_index=True renumbers the combined rows 0..n-1.
model_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
model_df.to_pickle("data/ole_model_df.pkl")
model_df.info()

In [None]:
# Dictionary of the per-feature encoding dictionaries
encodings = {
    'oorz_code': oorzc_dict,
    'geo_code': geo_dict,
    'contractgb': contrgb_dict,
    'techn_veld': techn_dict
}
# Write all encodings to a single JSON file.
with open('data/feature_encodings.json', 'w') as outfile:
    json.dump(encodings, outfile)