In [None]:
%%capture
%run "main.ipynb"
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

In [None]:
def cumulatively_categorise(column: pd.Series,threshold: float = 0.75) -> pd.Series:
    """
    Fold the less frequent classes of a column into a single "Other" class.

    Classes are visited from most to least frequent and kept until their
    cumulative row count reaches ``int(threshold * len(column))``; every value
    that was not kept is replaced by the string 'Other'.

    Parameters:
    - column (pd.Series): The column whose number of classes must be reduced
    - threshold (float, optional): Fraction of the rows that the kept (most
      frequent) classes have to cover; roughly 1 - threshold of the rows end up
      in the "Other" class

    Returns:
    - pd.Series: A new column, with the same index, with a reduced number of classes
    """
    threshold_value = int(threshold * len(column))
    # 'Other' is always kept so values that already are 'Other' pass through unchanged.
    kept_categories = {'Other'}
    running_total = 0
    # most_common() already yields (value, count) pairs in descending count order,
    # so the count is used directly instead of rebuilding a dict on every iteration.
    for category, count in Counter(column).most_common():
        running_total += count
        kept_categories.add(category)
        if running_total >= threshold_value:
            break
    return column.apply(lambda value: value if value in kept_categories else 'Other')

## stm_prioriteit

In [None]:
# Drop the rows without a priority; per the original note there are only a few of them
df = df.dropna(subset=['stm_prioriteit'])

# (disabled) change from float to the category dtype
# df['stm_prioriteit'] = df['stm_prioriteit'].astype('category')

# df['stm_prioriteit'].value_counts()

De gemiddelde duur verschilt wel degelijk tussen de prioriteitsklassen, zoals te zien is in de output van de cel hieronder.

In [None]:
# Average anm_tot_fh for every priority level
mean_per_prio_lvl = df['anm_tot_fh'].groupby(df['stm_prioriteit']).mean()
mean_per_prio_lvl

In [None]:
# Distribution of anm_tot_fh per priority level, in the same order as the means table
fig, ax = plt.subplots()
sns.boxplot(data=df, x='stm_prioriteit', y='anm_tot_fh', order=mean_per_prio_lvl.index, ax=ax)

ax.set_xlabel('Priority Level')
ax.set_ylabel('anm_tot_fh in minutes')

plt.show()

In [None]:
# Inspect the priority column, then one-hot encode it with the prefix 'prio'
prioriteit_column = df['stm_prioriteit']
prioriteit_column.info()
prioriteit_dummies = pd.get_dummies(prioriteit_column, prefix='prio')
prioriteit_dummies.info()

## stm_oorz_code

In [None]:
# Drop rows without a cause code and store the column as a categorical
df = (
    df
    .dropna(subset=['stm_oorz_code'])
    .astype({'stm_oorz_code': 'category'})
)

In [None]:
# Show how often each stm_oorz_code value occurs
df['stm_oorz_code'].value_counts()

In [None]:
# Keep the most frequent cause codes (covering ~80% of the rows); the rest becomes 'Other'
df['stm_oorz_code'] = cumulatively_categorise(df['stm_oorz_code'], threshold=0.8)
# Render the codes as text and strip a trailing ".0" (e.g. '225.0' -> '225').
# The pattern is a raw string (a plain '\.' is an invalid escape sequence) and is
# anchored with '$' so that a ".0" in the middle of a value is left untouched.
df['stm_oorz_code'] = df['stm_oorz_code'].astype(str).str.replace(r'\.0$', '', regex=True)
df['stm_oorz_code'].value_counts()

In [None]:
# One-hot encode the cause codes; the new columns get the prefix 'oorz_code'
oorz_code_dummies = pd.get_dummies(df['stm_oorz_code'], prefix='oorz_code')
oorz_code_dummies.info()

In [None]:
# Median anm_tot_fh per cause code, keyed by the code
oorzcd_dict = {
    code: df.loc[df['stm_oorz_code'] == code, 'anm_tot_fh'].median()
    for code in df['stm_oorz_code'].unique()
}
# Give every row the median of its own cause code; a code that prints as 'nan' gets NaN
df['oorz_code_median'] = df['stm_oorz_code'].map(
    lambda code: np.nan if str(code) == 'nan' else oorzcd_dict[code]
)

## stm_techn_mld

In [None]:
# Drop rows without a technique field
df = df.dropna(subset=['stm_techn_mld'])
# The former bare `df['stm_techn_mld'].astype('category')` statement was removed:
# its result was never assigned, so it did not change the column's dtype.
df['stm_techn_mld'].value_counts()

In [None]:
# Give values that are exactly the empty string the explicit label 'X'
df['stm_techn_mld'] = df['stm_techn_mld'].replace('', 'X')

Het aantal categorieën wordt gereduceerd voordat er dummies gemaakt worden, omdat het aantal kolommen anders te groot wordt.

In [None]:
# Keep the most frequent technique fields (threshold 0.9); the rest becomes 'Other'
techn_mld_binned = cumulatively_categorise(df['stm_techn_mld'], threshold=0.9)
df['stm_techn_mld'] = techn_mld_binned
techn_mld_binned.value_counts()

In [None]:
# Average anm_tot_fh for every technique field
mean_per_tech_field = df['anm_tot_fh'].groupby(df['stm_techn_mld']).mean()
mean_per_tech_field

In [None]:
# Distribution of anm_tot_fh per technique field, in the same order as the means table
fig, ax = plt.subplots()
sns.boxplot(data=df, x='stm_techn_mld', y='anm_tot_fh', order=mean_per_tech_field.index, ax=ax)

ax.set_xlabel('Techniek veld')
ax.set_ylabel('anm_tot_fh in minuten')

plt.show()

In [None]:
# One-hot encode the technique field; the new columns get the prefix 'techn_veld'
techn_veld_dummies = pd.get_dummies(df['stm_techn_mld'], prefix='techn_veld')
techn_veld_dummies.info()

## stm_geo_mld

In [None]:
# Drop rows without a geo code
df = df.dropna(subset=['stm_geo_mld'])
# The former bare `df['stm_geo_mld'].astype('category')` statement was removed:
# its result was never assigned, so it did not change the column's dtype.
df['stm_geo_mld'].value_counts()

In [None]:
# Keep the most frequent geo codes (threshold 0.7); the rest becomes 'Other'
geo_mld_binned = cumulatively_categorise(df['stm_geo_mld'], threshold=0.7)
df['stm_geo_mld'] = geo_mld_binned
geo_mld_binned.value_counts()

In [None]:
# One-hot encode the geo codes; the new columns get the prefix 'geo_code'
geo_code_dummies = pd.get_dummies(df['stm_geo_mld'], prefix='geo_code')
geo_code_dummies.info()

In [None]:
# geocode to score dictionary: median anm_tot_fh of the rows with cause code 225, per geo code.
# stm_oorz_code was converted with astype(str) earlier in this notebook, so the code has to be
# compared as the string '225'; comparing against the integer 225 never matches and leaves
# every median (and therefore every score) NaN.
# NOTE(review): if code 225 was folded into 'Other' by the 0.8 threshold, nothing matches here
# either - verify against the value_counts output of stm_oorz_code.
is_reference_cause = df['stm_oorz_code'].astype(str) == '225'
geo_dict = {}
for geocode in df['stm_geo_mld'].unique():
    geo_dict[geocode] = df.loc[is_reference_cause & (df['stm_geo_mld'] == geocode), 'anm_tot_fh'].median()
# make column with the score for each geo code: its median divided by the largest median.
# Series.max() skips NaN medians, whereas the builtin max() gives an order-dependent result
# as soon as one geo code has no matching rows.
max_geo_median = pd.Series(geo_dict, dtype='float64').max()
df['geo_score'] = df['stm_geo_mld'].map(geo_dict) / max_geo_median

## stm_contractgeb_mld

In [None]:
# Drop rows without a contract area
df = df.dropna(subset=['stm_contractgeb_mld'])
# The former bare `df['stm_contractgeb_mld'].astype('category')` statement was removed:
# its result was never assigned, so it did not change the column's dtype.
df['stm_contractgeb_mld'].value_counts()

In [None]:
# Keep the most frequent contract areas (threshold 0.9); the rest becomes 'Other'
contractgeb_binned = cumulatively_categorise(df['stm_contractgeb_mld'], threshold=0.9)
df['stm_contractgeb_mld'] = contractgeb_binned
contractgeb_binned.unique()

In [None]:
# One-hot encode the contract areas; the new columns get the prefix 'contractgebied'
contractgebied_dummies = pd.get_dummies(df['stm_contractgeb_mld'], prefix='contractgebied')
contractgebied_dummies.info()

In [None]:
# contractgebied to score dictionary
# contrgb_dict = {}
# for contr_geb in df['stm_contractgeb_mld'].unique():
#     contrgb_dict[contr_geb] = (df[(df.stm_contractgeb_mld == contr_geb) & (df.stm_oorz_code == 225)]['anm_tot_fh'].median())
# # make column with the score for each contractgebied
# max_ctr = max([x for x in sorted(contrgb_dict.values()) if str(x) != 'nan'])
# df['contractgeb_score'] = df['stm_contractgeb_mld'].apply(lambda x: contrgb_dict[x]/max_ctr if str(x) != 'nan' else np.nan)

## stm_oorz_groep

In [None]:
# orz_dict = {'TECHONV':0, '':0, 'ONR-RIB':0, 'ONR-DERD':1, 'WEER':0}
# df['oorz_groep_cluster'] = df['stm_oorz_groep'].apply(lambda x: orz_dict[x])

## stm_techn_mld

In [None]:
# techncl_dict = {'':1, 'S':1,'B':1,'K':1,'E':1,'T':1,'X':1,'I':1, 
#                 'P':0,'O':0,'G':0,
#                 'M':2,'A':2, 
#                 'H':3}
# df['techn_cluster'] = df['stm_techn_mld'].apply(lambda x: techncl_dict[x])
# df['techn_cluster0'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['','S','B','K','E','T','X','I'] else 0)
# df['techn_cluster1'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['P','O','G'] else 0)
# df['techn_cluster2'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['M','A'] else 0)
# df['techn_cluster3'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['H'] else 0)

In [None]:
# List the columns currently present in df
df.columns

In [None]:
# The dummy frames were built at different points in this notebook, before later
# dropna() calls removed more rows from df, so most of them still hold rows that df
# no longer has. Select the surviving rows by their original index label BEFORE the
# indices are reset; resetting first pairs rows purely by position, which silently
# combines features of different records and pads the shorter frames with NaN rows.
final_rows = df.index
techn_veld_dummies = techn_veld_dummies.loc[final_rows].reset_index(drop=True)
oorz_code_dummies = oorz_code_dummies.loc[final_rows].reset_index(drop=True)
prioriteit_dummies = prioriteit_dummies.loc[final_rows].reset_index(drop=True)
geo_code_dummies = geo_code_dummies.loc[final_rows].reset_index(drop=True)
contractgebied_dummies = contractgebied_dummies.loc[final_rows].reset_index(drop=True)
df = df.reset_index(drop=True)
# oorz_code_median = df['oorz_code_median'].reset_index(drop=True)
# geo_score = df['geo_score'].reset_index(drop=True)
# contractgeb_score = df['contractgeb_score'].reset_index(drop=True)
# oorz_groep_cluster = df['oorz_groep_cluster'].reset_index(drop=True)
# techn_cluster0 = df['techn_cluster0'].reset_index(drop=True)
# techn_cluster1 = df['techn_cluster1'].reset_index(drop=True)
# techn_cluster2 = df['techn_cluster2'].reset_index(drop=True)
# techn_cluster3 = df['techn_cluster3'].reset_index(drop=True)

# Feature blocks that go into the modelling table (commented entries are disabled)
features_to_use = [
    techn_veld_dummies,
    oorz_code_dummies, 
    df['stm_prioriteit'],
    prioriteit_dummies,
    geo_code_dummies,
    contractgebied_dummies
    # oorz_code_median,
    # geo_score,
    # contractgeb_score,
    # oorz_groep_cluster,
    # techn_cluster0,
    # techn_cluster1,
    # techn_cluster2,
    # techn_cluster3
    ]

# Target column first, followed by all feature blocks, joined column-wise on the shared index
model_df = pd.concat([df['anm_tot_fh'], *features_to_use], axis=1)
# Every block must contribute exactly the rows of df; a mismatch (e.g. caused by
# duplicate index labels) would mean the rows are not aligned one-to-one.
assert len(model_df) == len(df), "feature frames are not aligned with df"
# model_df = model_df.dropna()
model_df.sample(20)

In [None]:
# Row count of the modelling table before and after removing rows that contain NaN
len(model_df), len(model_df.dropna())

In [None]:
# Column dtypes and non-null counts of the modelling table
model_df.info()

In [None]:
# Write only the rows without any NaN to disk; with to_csv's defaults the index is written as the first column
model_df.dropna().to_csv("data/model_df.csv")