In [None]:
%%capture
%run "main.ipynb"
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import TargetEncoder
import numpy as np

In [None]:
def cumulatively_categorise(column,threshold=0.75):
    """Keep the most frequent values of ``column`` and replace the rest by 'Other'.

    Values are taken in descending order of frequency until their cumulative
    count reaches ``int(threshold * len(column))``. Every value outside that
    set is replaced by the string ``'Other'``.

    Parameters
    ----------
    column : pandas.Series
        Values to categorise; they must be hashable because they are counted
        with ``collections.Counter``.
    threshold : float, default 0.75
        Fraction of the rows that the kept values have to cover.

    Returns
    -------
    pandas.Series
        Series with the same index as ``column`` in which values that were
        not kept are replaced by ``'Other'``.
    """
    threshold_value = int(threshold * len(column))
    kept_values = set()  # set: O(1) membership test in the apply() below
    covered_rows = 0
    # most_common() already yields (value, count) pairs, so the count is used
    # directly instead of rebuilding a dict from the Counter on every iteration.
    for value, count in Counter(column).most_common():
        covered_rows += count
        kept_values.add(value)
        # The check runs after adding, so the most frequent value is always kept.
        if covered_rows >= threshold_value:
            break
    new_column = column.apply(lambda x: x if x in kept_values else 'Other')
    return pd.Series(new_column)

# Data cleaning

In [None]:
# Drop the rows where stm_prioriteit is missing; there are only a few of them.
df = df.dropna(axis='index', subset=['stm_prioriteit'])

In [None]:
# stm_geo_mld: drop the rows whose value is the empty string, then convert the
# column to int (going through float first).
# The explicit .copy() makes df an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning on a filtered slice.
df = df[df['stm_geo_mld'] != ''].copy()
df['stm_geo_mld'] = df['stm_geo_mld'].astype(float).astype(int)

# stm_oorz_code: missing codes are filled with 221, the most 'other'-like
# category, before converting the column to int.
df['stm_oorz_code'] = df['stm_oorz_code'].fillna(221).astype(int)

# Data preparation

In [None]:
# Target column that every target encoding below is fitted against.
y = df['anm_tot_fh']

# One shared encoder instance with default settings; it is re-fitted for each column.
# NOTE(review): the cells below call fit(X, y) followed by transform(X) on the
# same rows. Per the scikit-learn docs this differs from fit_transform(X, y),
# which cross-fits to limit target leakage - confirm this is intended.
tEnc = TargetEncoder()

## stm_prioriteit

In [None]:
# Average anm_tot_fh per priority level; the bare name on the last line
# displays the result as the cell output.
mean_per_prio_lvl = df.groupby('stm_prioriteit')['anm_tot_fh'].agg('mean')
mean_per_prio_lvl

In [None]:
# Box plot of anm_tot_fh per priority level; the boxes follow the order of the
# index of mean_per_prio_lvl.
sns.boxplot(data=df, x='stm_prioriteit', y='anm_tot_fh', order=mean_per_prio_lvl.index)

# Label the axes of the plot that was just drawn.
plt.gca().set(xlabel='Priority Level', ylabel='anm_tot_fh in minutes')

plt.show()

In [None]:
# One-hot encode stm_prioriteit into indicator columns prefixed with 'prio';
# .info() is printed for the source column and for the resulting dummy frame.
df['stm_prioriteit'].info()
prioriteit_dummies = df['stm_prioriteit'].pipe(pd.get_dummies, prefix='prio')
prioriteit_dummies.info()

## stm_oorz_code

In [None]:
# Target-encode the cause code against y and store the result as oorz_code_enc.
# The input is stm_oorz_code (filled and cast to int during cleaning), which
# matches this section's heading and the output column name; stm_oorz_groep has
# its own section further down and must not be encoded under this name.
X = np.array(df['stm_oorz_code']).reshape(-1, 1)  # encoder expects a 2D (n_rows, 1) array
tEnc.fit(X, y)
df['oorz_code_enc'] = tEnc.transform(X)

In [None]:
# Target-encode stm_geo_mld against y; the column is reshaped to (n_rows, 1)
# because the encoder works on 2D input.
X = df['stm_geo_mld'].to_numpy().reshape(-1, 1)
df['geo_code_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Target-encode stm_contractgeb_mld against y; the column is reshaped to
# (n_rows, 1) because the encoder works on 2D input.
X = df['stm_contractgeb_mld'].to_numpy().reshape(-1, 1)
df['contractgb_enc'] = tEnc.fit(X, y).transform(X)

In [None]:
# Target-encode stm_techn_mld against y; the column is reshaped to (n_rows, 1)
# because the encoder works on 2D input.
X = df['stm_techn_mld'].to_numpy().reshape(-1, 1)
df['techn_enc'] = tEnc.fit(X, y).transform(X)

## stm_oorz_groep

In [None]:
# orz_dict = {'TECHONV':0, '':0, 'ONR-RIB':0, 'ONR-DERD':1, 'WEER':0}
# df['oorz_groep_cluster'] = df['stm_oorz_groep'].apply(lambda x: orz_dict[x])

In [None]:
# X = np.array(df['stm_oorz_groep']).reshape(-1, 1)
# tEnc.fit(X,y)
# df['oorzgr_enc'] = tEnc.transform(X)

## stm_techn_mld

In [None]:
# # techncl_dict = {'':1, 'S':1,'B':1,'K':1,'E':1,'T':1,'X':1,'I':1, 
# #                 'P':0,'O':0,'G':0,
# #                 'M':2,'A':2, 
# #                 'H':3}
# # df['techn_cluster'] = df['stm_techn_mld'].apply(lambda x: techncl_dict[x])
# df['techn_cluster0'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['','S','B','K','E','T','X','I'] else 0)
# df['techn_cluster1'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['P','O','G'] else 0)
# df['techn_cluster2'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['M','A'] else 0)
# df['techn_cluster3'] = df['stm_techn_mld'].apply(lambda x: 1 if x in ['H'] else 0)

In [None]:
# Give df and the priority dummies the same fresh 0..n-1 index, so that the
# column-wise concat below aligns their rows.
df = df.reset_index(drop=True)
prioriteit_dummies = prioriteit_dummies.reset_index(drop=True)

# Target-encoded columns, pulled out of df as stand-alone Series.
oorz_code_enc, geo_code_enc, contractgb_enc, techn_enc = (
    df[encoded_name].reset_index(drop=True)
    for encoded_name in ('oorz_code_enc', 'geo_code_enc', 'contractgb_enc', 'techn_enc')
)

# Feature blocks that end up in the modelling table. Earlier experiments
# (dummy frames for the other columns, median/score features, the cluster
# columns, prio_enc and oorzgr_enc) are not part of the current feature set.
features_to_use = [prioriteit_dummies, oorz_code_enc, geo_code_enc, contractgb_enc, techn_enc]

# Target column first, followed by the feature columns.
model_df = pd.concat([df['anm_tot_fh']] + features_to_use, axis=1)

# Show 20 random rows as a sanity check.
model_df.sample(20)

In [None]:
# Print the column names, dtypes and non-null counts of the modelling table.
model_df.info()

In [None]:
# Write the modelling table to a CSV file under the relative data/ directory.
# NOTE(review): to_csv also writes the index by default, so the file gets an
# extra unnamed first column - confirm downstream readers expect it (index_col=0).
model_df.to_csv("data/model_df.csv")