In [1]:
%%capture
%run "main.ipynb"

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import TargetEncoder
import numpy as np
import json

In [3]:
def cumulatively_categorise(column, threshold=0.75):
    """Keep the most frequent categories of ``column`` and lump the rest into 'Other'.

    Categories are taken in descending order of frequency until their
    cumulative count reaches ``int(threshold * len(column))``. Every value
    outside that set is replaced by the string 'Other'. For a non-empty
    column the most frequent category is always kept, because the threshold
    is only checked after a category has been added.

    Parameters
    ----------
    column : pandas.Series
        Categorical values; they must be hashable.
    threshold : float, default 0.75
        Fraction of the rows that the kept categories have to cover.

    Returns
    -------
    pandas.Series
        New series with the infrequent values replaced by 'Other'.
    """
    threshold_value = int(threshold * len(column))
    # 'Other' is always part of the kept set, so an existing 'Other' label
    # passes through unchanged.
    kept = {'Other'}
    covered = 0
    # most_common() already yields (value, count) pairs, so the count is used
    # directly instead of rebuilding a dict from the Counter on every pass.
    for value, count in Counter(column).most_common():
        covered += count
        kept.add(value)
        if covered >= threshold_value:
            break
    # A set keeps the per-row membership test constant-time no matter how
    # many categories were kept.
    new_column = column.apply(lambda x: x if x in kept else 'Other')
    return pd.Series(new_column)

# Data cleaning

Er zijn 2 rijen die een prioriteit missen, deze twee rijen halen we uit de database.

In [4]:
# Drop the rows without a priority; there are too few of them to matter.
has_priority = df['stm_prioriteit'].notna()
df = df[has_priority]

Er zijn 5 rijen die een geocode missen. Ook deze halen wij uit de database. \
Verder vullen wij voor missende oorzaakcodes, de code 999. 
Deze wordt in de database al gebruikt, en is niet beschreven in de data dictionary die wij hebben gekregen.
Het is een default/missende waarde. \
Voor het contractgebied vervangen wij missende waardes met 999. Deze waarde komt in de dataset niet voor en is, vergeleken met de andere waardes in deze kolom, overduidelijk een default/overige waarde.

In [5]:
# stm_geo_mld
# A field without a value is read as NaN on some machines and as an empty
# string on others, so rows in either state are discarded.
geo_code = df['stm_geo_mld']
df = df[geo_code.notna() & geo_code.ne('')]
# df['stm_geo_mld'] = df['stm_geo_mld'].astype(float).astype(int)

In [6]:
# stm_oorz_code
# Missing cause codes are replaced by a fixed placeholder code.
# NOTE(review): the accompanying text says missing cause codes are filled with
# 999, while this cell fills them with 221 — confirm which value is intended.
OORZ_CODE_FILL = 221
df['stm_oorz_code'] = df['stm_oorz_code'].fillna(value=OORZ_CODE_FILL) #.astype(int)

In [7]:
# stm_techn_mld
# Rows without a technique field get the placeholder category 'X'.
# A field without a value is loaded as '' on some machines and as NaN on
# others, so both representations are mapped to 'X'; otherwise the two loads
# would yield different categories in the encoding and dummy steps.
df['stm_techn_mld'] = df['stm_techn_mld'].replace('', 'X').fillna('X')

In [8]:
# stm_contractgeb_mld
# Missing contract areas get the placeholder 999, which the accompanying text
# describes as a value that does not otherwise occur in this column.
CONTRACTGEB_FILL = 999
df['stm_contractgeb_mld'] = df['stm_contractgeb_mld'].fillna(value=CONTRACTGEB_FILL)

In [9]:
# stm_oorz_groep
# Rows without a cause group are labelled 'ONBK'.
# The former `regex=True` flag was dropped: with an empty pattern pandas falls
# back to a literal match anyway, so the flag only obscured the intent.
# NaN is mapped as well, because a field without a value is loaded as '' on
# some machines and as NaN on others.
df['stm_oorz_groep'] = df['stm_oorz_groep'].replace('', 'ONBK').fillna('ONBK')

In [10]:
# stm_progfh_in_duur
# Remove '*', '-' and space characters from both ends of each value (this also
# removes a leading minus sign), then convert via float to int so that any
# fractional part is truncated.
# NOTE(review): the sample output further down shows 99999999 in this column,
# which looks like a placeholder and not a real duration — confirm before modelling.
progfh_text = df['stm_progfh_in_duur'].str.strip('*- ')
df['stm_progfh_in_duur'] = progfh_text.astype(float).astype(int)

# Data preparation

In [11]:
# Target column and the encoder that is shared by all target-encoded features.
# The encoder is re-fitted for every feature, so its fitted attributes always
# describe the most recently encoded column.
# NOTE(review): the cells below call fit() and then transform() on the same
# rows. scikit-learn documents that this differs from fit_transform(), which
# cross-fits to limit target leakage on training data — confirm that the plain
# per-category encodings are what the model should be trained on.
y = df['anm_tot_fh']
tEnc = TargetEncoder(random_state=42, target_type="continuous")

## stm_prioriteit

Deze variabele gebruiken wij als een ordinale meetwaarde en hoeft dus niet veranderd te worden.

df['stm_prioriteit'].info()
prioriteit_dummies = pd.get_dummies(df['stm_prioriteit'], prefix='prio')
prioriteit_dummies.info()

## stm_oorz_code

In [12]:
# Fit the shared encoder on the cause codes (as a single-column 2-D array)
# and store the encoded value of every row.
X = df['stm_oorz_code'].to_numpy().reshape(-1, 1)
df['oorz_code_enc'] = tEnc.fit(X, y).transform(X)

In [13]:
# Category -> encoding lookup for the cause codes. This has to run directly
# after the cause-code fit, because the encoder is re-fitted for each feature.
oorzc_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

## stm_geo_mld

In [14]:
# Fit the shared encoder on the geo codes (as a single-column 2-D array)
# and store the encoded value of every row.
X = df['stm_geo_mld'].to_numpy().reshape(-1, 1)
df['geo_code_enc'] = tEnc.fit(X, y).transform(X)

In [15]:
# Category -> encoding lookup for the geo codes. This has to run directly
# after the geo-code fit, because the encoder is re-fitted for each feature.
geo_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

## stm_contractgeb_mld

In [16]:
# Fit the shared encoder on the contract areas (as a single-column 2-D array)
# and store the encoded value of every row.
X = df['stm_contractgeb_mld'].to_numpy().reshape(-1, 1)
df['contractgb_enc'] = tEnc.fit(X, y).transform(X)

In [17]:
# Category -> encoding lookup for the contract areas. This has to run directly
# after the contract-area fit, because the encoder is re-fitted for each feature.
contrgb_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))

## stm_techn_mld

In [18]:
# Fit the shared encoder on the technique fields (as a single-column 2-D array)
# and store the encoded value of every row.
X = df['stm_techn_mld'].to_numpy().reshape(-1, 1)
df['techn_enc'] = tEnc.fit(X, y).transform(X)

In [19]:
# Category -> encoding lookup for the technique fields. This has to run directly
# after the technique-field fit, because the encoder is re-fitted for each feature.
techn_dict = dict(zip(tEnc.categories_[0], tEnc.encodings_[0]))
techn_dict

{'A': 112.0866503553229,
 'B': 75.73536484026081,
 'E': 77.89566696486747,
 'G': 63.251424360323824,
 'H': 360.0,
 'I': 87.79935701633958,
 'K': 77.06976811538303,
 'M': 117.11576429172453,
 'O': 46.59620581108975,
 'P': 54.72145306636579,
 'S': 67.02737156548827,
 'T': 77.28224044981404,
 'X': 67.96894017496277}

In [20]:
# One indicator column per technique field value, named techn_veld_<value>.
techn_veld_dummies = pd.get_dummies(data=df['stm_techn_mld'], prefix='techn_veld')

## stm_oorz_groep

In [21]:
# One indicator column per cause group, named oorzgr_<value>.
oorz_groep_dummies = pd.get_dummies(data=df['stm_oorz_groep'], prefix='oorzgr')

## stm_progfh_in_duur

Deze kolom is boven omgezet naar integer waardes, en heeft verder geen preparatie nodig.

# Geprepareerde data

In [22]:
# Put the frame on a clean 0..n-1 index. Every column taken from `df` below
# inherits that index, so only the dummy frames (built before this reset)
# still need a reset of their own to line up row by row in the concat.
df = df.reset_index(drop=True)
prioriteit = df['stm_prioriteit']
oorz_code_enc = df['oorz_code_enc']
geo_code_enc = df['geo_code_enc']
contractgb_enc = df['contractgb_enc']
techn_enc = df['techn_enc']
techn_veld_dummies = techn_veld_dummies.reset_index(drop=True)
oorz_groep_dummies = oorz_groep_dummies.reset_index(drop=True)
fh_prog = df['stm_progfh_in_duur']

# concat(axis=1) aligns on the index and would silently pad with NaN if the
# dummy frames were built from a different set of rows than the current `df`.
assert len(techn_veld_dummies) == len(df) and len(oorz_groep_dummies) == len(df), \
    "dummy frames are out of sync with df; re-run the encoding cells"

features_to_use = [
    prioriteit,
    oorz_code_enc,
    geo_code_enc,
    contractgb_enc,
    techn_enc,
    techn_veld_dummies,
    oorz_groep_dummies,
    fh_prog
]

# Target column first, followed by the features, joined column-wise.
model_df = pd.concat([df['anm_tot_fh'], *features_to_use], axis=1)
# Fixed seed so the preview shows the same rows on every run.
model_df.sample(20, random_state=42)

Unnamed: 0,anm_tot_fh,stm_prioriteit,oorz_code_enc,geo_code_enc,contractgb_enc,techn_enc,techn_veld_A,techn_veld_B,techn_veld_E,techn_veld_G,...,techn_veld_P,techn_veld_S,techn_veld_T,techn_veld_X,oorzgr_ONBK,oorzgr_ONR-DERD,oorzgr_ONR-RIB,oorzgr_TECHONV,oorzgr_WEER,stm_progfh_in_duur
101515,53.0,2.0,57.569839,77.998775,77.403777,75.735365,0,1,0,0,...,0,0,0,0,0,0,0,1,0,60
94097,61.0,5.0,64.240392,59.546635,68.04751,67.96894,0,0,0,0,...,0,0,0,1,0,1,0,0,0,61
169583,52.0,5.0,74.845369,77.341128,77.403777,67.027372,0,0,0,0,...,0,1,0,0,0,0,0,1,0,54
497137,15.0,2.0,74.845369,55.208092,64.429121,75.735365,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
195873,11.0,5.0,57.569839,65.595733,55.898554,54.721453,0,0,0,0,...,1,0,0,0,0,0,0,1,0,99999999
275398,78.0,5.0,57.569839,77.341128,77.403777,67.027372,0,0,0,0,...,0,1,0,0,0,0,0,1,0,79
525655,59.983333,5.0,72.691048,51.594854,77.241686,67.027372,0,0,0,0,...,0,1,0,0,0,1,0,0,0,131
130013,10.0,5.0,88.07629,52.630646,58.970903,67.027372,0,0,0,0,...,0,1,0,0,0,0,0,1,0,10
39435,68.733333,5.0,57.569839,76.902227,72.56851,75.735365,0,1,0,0,...,0,0,0,0,0,0,0,1,0,153
373704,37.0,2.0,53.288598,66.758165,66.167393,54.721453,0,0,0,0,...,1,0,0,0,0,0,0,1,0,99999999


In [23]:
# Overview of the assembled frame: column dtypes and non-null counts.
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544576 entries, 0 to 544575
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   anm_tot_fh          544576 non-null  float64
 1   stm_prioriteit      544576 non-null  float64
 2   oorz_code_enc       544576 non-null  float64
 3   geo_code_enc        544576 non-null  float64
 4   contractgb_enc      544576 non-null  float64
 5   techn_enc           544576 non-null  float64
 6   techn_veld_A        544576 non-null  uint8  
 7   techn_veld_B        544576 non-null  uint8  
 8   techn_veld_E        544576 non-null  uint8  
 9   techn_veld_G        544576 non-null  uint8  
 10  techn_veld_H        544576 non-null  uint8  
 11  techn_veld_I        544576 non-null  uint8  
 12  techn_veld_K        544576 non-null  uint8  
 13  techn_veld_M        544576 non-null  uint8  
 14  techn_veld_O        544576 non-null  uint8  
 15  techn_veld_P        544576 non-nul

In [24]:
# Persist the model table. The row index is written as the first, unnamed CSV
# column (pandas' default, spelled out here); readers of the file have to
# account for that column, e.g. with index_col=0.
model_df.to_csv("data/model_df.csv", index=True)

In [26]:
def _to_builtin(value):
    """Return the plain-Python equivalent of a NumPy scalar; other values pass through."""
    return value.item() if isinstance(value, np.generic) else value


# Category -> encoding lookup per feature, keyed by the name used in the export.
encodings = {
    'oorzaak_code': oorzc_dict,
    'geo_code': geo_dict,
    'contractgebied': contrgb_dict,
    'techniekveld': techn_dict,
}

# json only accepts str/int/float/bool/None as keys. NumPy floats and strings
# pass because they subclass the Python types, but NumPy integers do not, so a
# column that ends up with an integer dtype would make json.dump raise
# TypeError. Converting to builtins first keeps the export independent of the
# column dtypes; the written text is the same for the types that already worked.
serialisable = {
    feature: {_to_builtin(cat): _to_builtin(enc) for cat, enc in mapping.items()}
    for feature, mapping in encodings.items()
}

# JSON object keys are always strings, so numeric category codes are written
# as text and have to be converted back by whoever loads the file.
with open('data/feature_dictionaries.json', 'w') as outfile:
    json.dump(serialisable, outfile)