In this notebook we will look into creating multiple models and fine tuning them to get the best possible results.

I will be using the following models:
1. BaggingClassifier
2. RandomForestClassifier
3. ExtraTreesClassifier
4. VotingClassifier (with TBD models)
5. GaussianNB
6. KNeighborsClassifier
7. MLPClassifier
8. LinearTreeClassifier
9. LinearForestClassifier
10. LinearBoostClassifier

The choise of the models are not based on any specific reason, but rather to try out different models and see how they perform. There is although one condition, the model needs to be able to give a probability output, as this will be used in order to give a confidence score.

In [2]:
from sklearn.ensemble import BaggingClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
from sklearn.ensemble import RandomForestClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier
from sklearn.naive_bayes import GaussianNB #https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
from sklearn.neighbors import KNeighborsClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
from sklearn.neural_network import MLPClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
from lineartree import LinearTreeClassifier, LinearForestClassifier, LinearBoostClassifier #https://github.com/cerlymarco/linear-tree
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.multiclass import OneVsRestClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from joblib import dump, load
import pandas as pd
import os.path

%run data-cleaning.ipynb
%run model.py

In [4]:
df = pd.read_csv(r"dataset-prorail-clean-3.csv")
df = clean_data(df)

df['duur_prog_fh_seconds'] = df['duur_prog_fh'].dt.total_seconds()
num_bins = [0.0, 0.15, 0.3, 0.45, 0.55, 0.65, 0.75, 0.85, 0.904, 0.945, 0.97, 0.99, 1.0]
df, bin_edges = create_bins(df, 'duur_prog_fh_seconds', num_bins)
label_encoder = LabelEncoder()
df['duur_prog_fh_seconds_bins_enc'] = label_encoder.fit_transform(df['duur_prog_fh_seconds_bins'])

In [5]:
features = ['stm_geo_mld', 'stm_prioriteit', 'stm_oorz_code', 'stm_contractgeb_gst', 'stm_techn_mld']
target = 'duur_prog_fh_seconds_bins_enc'
df_model = df[features]
df_target = df[target]

x_train, x_test, y_train, y_test = train_test_split(df_model, df_target, test_size=0.2, random_state=42)
print(os.path.isfile('pipe.joblib'))
if not os.path.isfile('pipe.joblib'):
    pipe = Pipeline([('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform="pandas")), ('bc', BaggingClassifier())])
    pipe.fit(x_train, y_train)
    dump(pipe, 'pipe.joblib')
else:
    pipe = load('pipe.joblib')


print(df_model.iloc[0])

pipe.score(x_test, y_test)
print(pipe.predict_proba([[1, 2, 215, 5, 13]]))

train_models(models, x_train, x_test, y_train, y_test)

True
stm_geo_mld             63.0
stm_prioriteit           4.0
stm_oorz_code          215.0
stm_contractgeb_gst     26.0
stm_techn_mld              S
Name: 129628, dtype: object
[[0.05904762 0.03357143 0.1210989  0.00571429 0.05357143 0.36153846
  0.28212454 0.         0.05       0.         0.03333333 0.        ]]
Training models...
BaggingClassifier - 0.2727261801749832 - 6.902838001491143
RandomForestClassifier - 0.27500961446014804 - 5.058873432769105
ExtraTreesClassifier - 0.27589895202384385 - 7.095764210620535
GaussianNB - 0.01770262474762042 - 35.301451668487765
NearestNeighborsClassifier - 0.2240169214498606 - 15.542592400182311
MLPClassifier - 0.21966637823286222 - 2.1922241176376365
LinearTreeClassifier - 0.21094125564849533 - 28.44055988505763
LinearForestClassifier - 0.27462503605422556 - 2.6752959082498577
LinearBoostClassifier - 0.20216806076338814 - 28.756777880611608
VotingClassifier - 0.2746490722045957 - 2.421253213789961
DecisionTreeClassifier - 0.2751297952119989 - 

In [10]:
# RFC = RandomForestClassifier()
# RFC.fit(x_train, y_train)
print(x_test.shape)

(98905, 504)


So just looking at the base models without hyper tuning we can see some models definatlly are performing better than others. Because of this we have decided to continue with the following few to see if we can improve the performance:
1. RandomForestClassifier
2. LinearForestClassifier
3. ExtraTreesClassifier
4. BaggingClassifier
5. DesicionTreeClassifier

In [17]:
import warnings
warnings.simplefilter('ignore')
BC = BaggingClassifier()
BC_Parameters = {'estimator': [LinearTreeClassifier(base_estimator=RidgeClassifier()), OneVsRestClassifier(LinearForestClassifier(Ridge(), max_features="sqrt")), RandomForestClassifier(), ExtraTreesClassifier(), MLPClassifier(), KNeighborsClassifier(), GaussianNB()], 'n_estimators': [10, 50, 100], 'max_samples': [0.5, 1.0], 'max_features': [0.5, 1.0], 'bootstrap': [True, False], 'bootstrap_features': [True, False], 'random_state': [42]}
RFC_Parameters = {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [None, 10, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['sqrt', 'log2'], 'random_state': [42]}
ETC_Parameters = {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [None, 10, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['sqrt', 'log2'], 'random_state': [42]}
LFC_Parameters = {'n_estimators': [10, 50, 100], 'base_estimator': [Ridge()],'max_depth': [None, 10, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['sqrt', 'log2'], 'random_state': [42]}
DTC_Parameters = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random'], 'max_depth': [None, 10, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['sqrt', 'log2'], 'random_state': [42]}
GSCV = GridSearchCV(BC, BC_Parameters, n_jobs=-1)
GSCV.fit(x_train, y_train)
print(GSCV.cv_results_)
print(GSCV.best_params_)



KeyboardInterrupt: 