In [1]:
import pandas as pd

# Read the cleaned dataset from the notebook's working directory. This is the same
# relative path every later cell uses, so the notebook no longer depends on one
# user's absolute home-directory layout.
df = pd.read_csv("df_cleaned.csv")

In [2]:
# Lift pandas' row-display limit so the per-column report below is printed in full.
pd.set_option("display.max_rows", None)

# Number of missing entries in each column of df.
missing_values = df.isna().sum()
print(missing_values)

COMPANY                              0
OFFICE                               0
OFFICE_MAIN_BRAND                    0
MANUFACTURER_SHORT                   0
VEHICLE_GROUP                        0
VEHICLE_TYPE                         0
MILEAGE                              0
MILAGE_SALES                         0
COLOR                                0
MAX_TRAILOR_LOAD                     0
YEAR_CONSTRUCTION                    0
HORSEPOWER                           0
NUMBER_OWNERS                        0
FINANCING_TYPE                       0
FUEL_TYPE_NAME                       0
LAID_UP_TIME                         0
SCALED_CURRENT_VALUE                 0
SCALED_GUIDE_PRICE                   0
SCALED_TOTAL_SALE_PRICE              0
ACCIDENT_VEHICLE                     0
COMMISSION_TYPE                      0
VEHICLE_MODEL_ID_NAME                0
days_between_leasing_and_purchase    0
PURCHASE_YEAR                        0
PURCHASE_MONTH                       0
PURCHASE_WEEKDAY         

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score

# Separate the target (LAID_UP_TIME) from the predictor columns.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# One random-forest configuration is reused for every evaluation below; the fixed
# random_state keeps repeated runs comparable.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Reference performance: 2-fold cross-validation with every feature present.
baseline_scores = cross_validate(
    rf, X, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
baseline_r2 = baseline_scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
baseline_rmse = -baseline_scores['test_neg_root_mean_squared_error'].mean()

# Leave-one-feature-out ablation: drop each column in turn, repeat the same
# cross-validation, and record how far the scores move away from the baseline.
results = []
for feature in X.columns:
    X_reduced = X.drop(feature, axis=1)

    scores = cross_validate(
        rf, X_reduced, y,
        scoring=['r2', 'neg_root_mean_squared_error'],
        cv=2,
    )
    r2 = scores['test_r2'].mean()
    rmse = -scores['test_neg_root_mean_squared_error'].mean()

    row = {
        'Feature_removed': feature,
        'R2': r2,
        'RMSE': rmse,
        'R2_change': r2 - baseline_r2,
        'RMSE_change': rmse - baseline_rmse
    }
    results.append(row)

# One row per removed feature, ordered by ascending R2 change (largest drop first).
results_df = pd.DataFrame(results).sort_values(by='R2_change')

print(results_df)

# Save the table as CSV in the working directory.
results_df.to_csv("feature_importance_results.csv", index=False)


| Feature_removed                     | R2       | RMSE      | R2_change  | RMSE_change |
|--------------------------------------|----------|-----------|------------|-------------|
| PURCHASE_YEAR                        | 0.655194 | 56.345360 | -0.066241  | 5.704402    |
| days_between_leasing_and_purchase    | 0.697913 | 52.730111 | -0.023522  | 2.089152    |
| COMMISSION_TYPE                      | 0.704037 | 52.194793 | -0.017398  | 1.553834    |
| PURCHASE_MONTH                        | 0.705100 | 52.102509 | -0.016335  | 1.461551    |
| SCALED_TOTAL_SALE_PRICE              | 0.711922 | 51.495779 | -0.009514  | 0.854820    |
| SCALED_CURRENT_VALUE                 | 0.712123 | 51.475349 | -0.009312  | 0.834391    |
| OFFICE                               | 0.716249 | 51.109029 | -0.005186  | 0.468070    |
| MILAGE_SALES                         | 0.716494 | 51.089407 | -0.004941  | 0.448448    |
| SCALED_GUIDE_PRICE                   | 0.717365 | 51.010329 | -0.004070  | 0.369370    |
| PC-1                                 | 0.718224 | 50.910424 | -0.003211  | 0.269465    |
| YEAR_CONSTRUCTION                    | 0.718617 | 50.891292 | -0.002818  | 0.250333    |
| COMPANY                              | 0.718694 | 50.887581 | -0.002741  | 0.246623    |
| NUMBER_OWNERS                        | 0.719638 | 50.803309 | -0.001797  | 0.162350    |
| MANUFACTURER_SHORT                   | 0.719807 | 50.786271 | -0.001628  | 0.145312    |
| HORSEPOWER                           | 0.719920 | 50.779651 | -0.001516  | 0.138693    |
| MILEAGE                              | 0.720129 | 50.756372 | -0.001306  | 0.115413    |
| VEHICLE_GROUP                        | 0.720388 | 50.733585 | -0.001047  | 0.092627    |
| FUEL_TYPE_NAME                       | 0.720400 | 50.735295 | -0.001035  | 0.094336    |
| FINANCING_TYPE                       | 0.720580 | 50.719085 | -0.000855  | 0.078127    |
| OFFICE_MAIN_BRAND                    | 0.720765 | 50.700208 | -0.000670  | 0.059249    |
| VEHICLE_MODEL_ID_NAME                | 0.720802 | 50.697136 | -0.000633  | 0.056178    |
| MAX_TRAILOR_LOAD                     | 0.721174 | 50.661969 | -0.000261  | 0.021010    |
| PURCHASE_WEEKDAY                     | 0.721327 | 50.648424 | -0.000108  | 0.007465    |
| ACCIDENT_VEHICLE                     | 0.721462 | 50.637791 | 0.000027   | -0.003167   |
| VEHICLE_TYPE                         | 0.721692 | 50.616633 | 0.000257   | -0.024325   |
| PC-4                                 | 0.721798 | 50.606949 | 0.000363   | -0.034010   |
| PC-2                                 | 0.722178 | 50.569220 | 0.000743   | -0.071739   |
| COLOR                                | 0.722358 | 50.553928 | 0.000923   | -0.087031   |
| PC-3                                 | 0.723322 | 50.461349 | 0.001887   | -0.179610   |


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Columns regarded as contributing little to the model; they are left out below.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
]
X_reduced = X.drop(columns=features_to_remove)

# Random forest with a fixed seed.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# 2-fold cross-validation on the reduced feature set.
scores = cross_validate(
    rf, X_reduced, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
final_r2 = scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
final_rmse = -scores['test_neg_root_mean_squared_error'].mean()

# Report the averaged scores, one item per line.
print(
    "Modelprestaties na verwijderen van zwakke features:",
    f"R²-score: {final_r2:.6f}",
    f"RMSE: {final_rmse:.6f}",
    sep="\n",
)


R²-score: 0.728697
RMSE: 49.971820

In [None]:
import pandas as pd

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Remove the excluded columns; errors='ignore' skips any name that is not present.
dropped_columns = ['PC-3', 'PC-2', 'PC-4', 'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE']
df = df.drop(columns=dropped_columns, errors='ignore')

# Lift pandas' row-display limit so the report below is printed in full.
pd.set_option("display.max_rows", None)

# Number of missing entries in each remaining column.
missing_values = df.isna().sum()
print(missing_values)

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score

# Separate the target (LAID_UP_TIME) from the predictor columns.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# One random-forest configuration is reused for every evaluation below; the fixed
# random_state keeps repeated runs comparable.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Reference performance: 2-fold cross-validation with every remaining feature present.
baseline_scores = cross_validate(
    rf, X, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
baseline_r2 = baseline_scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
baseline_rmse = -baseline_scores['test_neg_root_mean_squared_error'].mean()

# Leave-one-feature-out ablation: drop each column in turn, repeat the same
# cross-validation, and record how far the scores move away from the baseline.
results = []
for feature in X.columns:
    X_reduced = X.drop(feature, axis=1)

    scores = cross_validate(
        rf, X_reduced, y,
        scoring=['r2', 'neg_root_mean_squared_error'],
        cv=2,
    )
    r2 = scores['test_r2'].mean()
    rmse = -scores['test_neg_root_mean_squared_error'].mean()

    row = {
        'Feature_removed': feature,
        'R2': r2,
        'RMSE': rmse,
        'R2_change': r2 - baseline_r2,
        'RMSE_change': rmse - baseline_rmse
    }
    results.append(row)

# One row per removed feature, ordered by ascending R2 change (largest drop first).
results_df = pd.DataFrame(results).sort_values(by='R2_change')

# Show the table and save it as CSV in the working directory.
print(results_df)
results_df.to_csv("feature_importance_results_2.csv", index=False)


| Feature_removed                     | R2       | RMSE      | R2_change  | RMSE_change |
|--------------------------------------|----------|-----------|------------|-------------|
| PURCHASE_YEAR                        | 0.640090 | 57.580533 | -0.088607  | 7.608713    |
| days_between_leasing_and_purchase    | 0.703913 | 52.199388 | -0.024784  | 2.227568    |
| PURCHASE_MONTH                        | 0.705876 | 52.032657 | -0.022821  | 2.060837    |
| COMMISSION_TYPE                      | 0.711344 | 51.547646 | -0.017353  | 1.575827    |
| SCALED_TOTAL_SALE_PRICE              | 0.717963 | 50.950245 | -0.010734  | 0.978425    |
| SCALED_CURRENT_VALUE                 | 0.719347 | 50.820815 | -0.009350  | 0.848995    |
| OFFICE                               | 0.724776 | 50.334980 | -0.003921  | 0.363160    |
| MILAGE_SALES                         | 0.725382 | 50.281723 | -0.003315  | 0.309903    |
| COMPANY                              | 0.725594 | 50.254977 | -0.003103  | 0.283157    |
| YEAR_CONSTRUCTION                    | 0.726969 | 50.129520 | -0.001728  | 0.157701    |
| SCALED_GUIDE_PRICE                   | 0.727481 | 50.082410 | -0.001216  | 0.110591    |
| MANUFACTURER_SHORT                   | 0.727769 | 50.058013 | -0.000928  | 0.086194    |
| PC-1                                 | 0.727825 | 50.033224 | -0.000872  | 0.061405    |
| VEHICLE_GROUP                        | 0.727932 | 50.045369 | -0.000765  | 0.073550    |
| NUMBER_OWNERS                        | 0.727965 | 50.041047 | -0.000732  | 0.069227    |
| FUEL_TYPE_NAME                       | 0.728149 | 50.026209 | -0.000548  | 0.054390    |
| HORSEPOWER                           | 0.728669 | 49.981198 | -0.000028  | 0.009379    |
| MILEAGE                              | 0.728694 | 49.974815 | -0.000003  | 0.002995    |
| OFFICE_MAIN_BRAND                    | 0.728866 | 49.957111 | 0.000169   | -0.014709   |
| FINANCING_TYPE                       | 0.728993 | 49.946817 | 0.000296   | -0.025003   |
| VEHICLE_MODEL_ID_NAME                | 0.729069 | 49.939528 | 0.000372   | -0.032292   |
| MAX_TRAILOR_LOAD                     | 0.729627 | 49.886554 | 0.000930   | -0.085266   |
| PURCHASE_WEEKDAY                     | 0.729703 | 49.879259 | 0.001006   | -0.092560   |

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Columns regarded as contributing little to the model; they are left out below.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X_reduced = X.drop(columns=features_to_remove)

# Random forest with a fixed seed.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# 2-fold cross-validation on the reduced feature set.
scores = cross_validate(
    rf, X_reduced, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
final_r2 = scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
final_rmse = -scores['test_neg_root_mean_squared_error'].mean()

# Report the averaged scores, one item per line.
print(
    "Modelprestaties na verwijderen van zwakke features:",
    f"R²-score: {final_r2:.6f}",
    f"RMSE: {final_rmse:.6f}",
    sep="\n",
)


R²-score: 0.730764
RMSE: 49.780356

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Columns regarded as contributing little to the model; they are left out below.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X_reduced = X.drop(columns=features_to_remove)

# Random forest with a fixed seed.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# 2-fold cross-validation on the reduced feature set.
scores = cross_validate(
    rf, X_reduced, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
final_r2 = scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
final_rmse = -scores['test_neg_root_mean_squared_error'].mean()

# Report the averaged scores, one item per line.
print(
    "Modelprestaties na verwijderen van zwakke features:",
    f"R²-score: {final_r2:.6f}",
    f"RMSE: {final_rmse:.6f}",
    sep="\n",
)


| Feature_removed                     | R2       | RMSE      | R2_change  | RMSE_change |
|--------------------------------------|----------|-----------|------------|-------------|
| PURCHASE_YEAR                        | 0.645415 | 57.139275 | -0.085350  | 7.358919    |
| days_between_leasing_and_purchase    | 0.704072 | 52.185184 | -0.026692  | 2.404828    |
| PURCHASE_MONTH                        | 0.705293 | 52.082397 | -0.025472  | 2.302041    |
| COMMISSION_TYPE                      | 0.711722 | 51.509933 | -0.019042  | 1.729577    |
| SCALED_CURRENT_VALUE                 | 0.718270 | 50.915531 | -0.012495  | 1.135175    |
| SCALED_TOTAL_SALE_PRICE              | 0.718858 | 50.866687 | -0.011906  | 1.086331    |
| OFFICE                               | 0.725618 | 50.258420 | -0.005146  | 0.478065    |
| COMPANY                              | 0.726574 | 50.164070 | -0.004190  | 0.383714    |
| MILAGE_SALES                         | 0.726720 | 50.155881 | -0.004045  | 0.375525    |
| VEHICLE_GROUP                        | 0.727427 | 50.092717 | -0.003337  | 0.312361    |
| YEAR_CONSTRUCTION                    | 0.727505 | 50.079856 | -0.003259  | 0.299500    |
| SCALED_GUIDE_PRICE                   | 0.727722 | 50.058539 | -0.003043  | 0.278183    |
| NUMBER_OWNERS                        | 0.728015 | 50.033043 | -0.002749  | 0.252687    |
| PC-1                                 | 0.728158 | 50.001792 | -0.002607  | 0.221436    |
| FUEL_TYPE_NAME                       | 0.728497 | 49.993130 | -0.002267  | 0.212774    |
| MANUFACTURER_SHORT                   | 0.728795 | 49.963508 | -0.001969  | 0.183152    |
| HORSEPOWER                           | 0.729013 | 49.948476 | -0.001751  | 0.168120    |
| FINANCING_TYPE                       | 0.729375 | 49.909777 | -0.001389  | 0.129421    |
| MILEAGE                              | 0.729596 | 49.889660 | -0.001168  | 0.109304    |
| OFFICE_MAIN_BRAND                    | 0.729909 | 49.859393 | -0.000856  | 0.079037    |


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: load the cleaned dataset and drop the least important features.
# errors='ignore' skips any listed column that is not present.
df = pd.read_csv("df_cleaned.csv")
columns_to_drop = [
    'PC-3', 'PC-2', 'PC-4', 'COLOR', 'VEHICLE_TYPE',
    'ACCIDENT_VEHICLE', 'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD',
    'VEHICLE_MODEL_ID_NAME',
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Report the number of missing entries per remaining column.
missing_values = df.isna().sum()
print("Missende waarden per kolom:\n", missing_values)

# Step 2: split into target (y) and features (X).
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Step 3: baseline random forest, scored with 2-fold cross-validation.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
baseline_scores = cross_validate(
    rf, X, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
baseline_r2 = baseline_scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
baseline_rmse = -baseline_scores['test_neg_root_mean_squared_error'].mean()

print(f"Baseline R2-score: {baseline_r2:.4f}")
print(f"Baseline RMSE: {baseline_rmse:.4f}")

# Step 4: exhaustive hyperparameter search.
# The grid holds 4 * 5 * 4 * 4 * 3 = 960 combinations; with cv=5 that is 4800 fits.
param_grid = {
    'n_estimators': [50, 100, 200, 500],     # number of trees
    'max_depth': [None, 10, 20, 30, 50],     # None leaves the depth unlimited
    'min_samples_split': [2, 5, 10, 20],     # minimum samples needed to split a node
    'min_samples_leaf': [1, 2, 4, 8],        # minimum samples kept in a leaf
    'max_features': ['sqrt', 'log2', None],  # features considered at each split
}

# Candidates are ranked by R2 over 5 folds; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

# Step 5: show the winning combination and its mean cross-validated R2.
print("Beste parameters:", grid_search.best_params_)
print("Beste R2-score:", grid_search.best_score_)





Beste parameters: {'max_depth': 50, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Remove the columns regarded as contributing little to the model.
# errors='ignore' avoids an error when a listed column does not exist.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X = X.drop(columns=features_to_remove, errors='ignore')

# Random forest configured with the best parameters reported by the grid search.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=50,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# 2-fold cross-validation on the reduced feature set.
scores = cross_validate(
    rf, X, y,
    scoring=['r2', 'neg_root_mean_squared_error'],
    cv=2,
)
final_r2 = scores['test_r2'].mean()
# The scorer yields negated RMSE values, so the sign is flipped back here.
final_rmse = -scores['test_neg_root_mean_squared_error'].mean()

# Report the averaged scores, one item per line.
print(
    "Modelprestaties na verwijderen van zwakke features:",
    f"R²-score: {final_r2:.6f}",
    f"RMSE: {final_rmse:.6f}",
    sep="\n",
)


R²-score: 0.732794
RMSE: 49.595427

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Remove the columns regarded as contributing little; absent names are skipped.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X = X.drop(columns=features_to_remove, errors='ignore')

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor with fixed hyperparameters and a squared-error objective.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the scores, one item per line.
print(
    "XGBoost Model Performance:",
    f"R²-score: {r2:.6f}",
    f"RMSE: {rmse:.6f}",
    sep="\n",
)


R²-score: 0.740730
RMSE: 46.988551

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
X = df.drop(columns=['LAID_UP_TIME'])
y = df['LAID_UP_TIME']

# Remove the columns regarded as contributing little; absent names are skipped.
features_to_remove = ['PC-3', 'PC-2', 'PC-4', 'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
                      'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME']
X = X.drop(columns=features_to_remove, errors='ignore')

# Hold out 20% of the rows for testing; the search below only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 * 3 * 3 = 972 combinations; with cv=5 that is 4860 fits.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [6, 8, 10, 12],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 5, 10]
}

# Base XGBoost regressor; the searched hyperparameters are supplied by the grid.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Exhaustive search with 5-fold cross-validation on the training set.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='neg_mean_squared_error',  # negated MSE, so higher is better
                           n_jobs=-1,  # use all cores
                           verbose=1)  # print progress

# Run the search.
grid_search.fit(X_train, y_train)

# Show the winning hyperparameter combination.
print("Beste parameters gevonden door GridSearchCV:")
print(grid_search.best_params_)

# With GridSearchCV's default refit=True the winning configuration has already been
# retrained on all of X_train, so best_estimator_ is ready to use. The former extra
# best_model.fit(X_train, y_train) call only repeated that training and was removed.
best_model = grid_search.best_estimator_

# Predict on the held-out rows.
y_pred = best_model.predict(X_test)

# R2 and RMSE on the test set.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report the scores.
print("XGBoost Model Performance:")
print(f"R²-score: {r2:.6f}")
print(f"RMSE: {rmse:.6f}")


Beste parameters gevonden door GridSearchCV:
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 12, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.9}

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Remove the columns regarded as contributing little; absent names are skipped.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X = X.drop(columns=features_to_remove, errors='ignore')

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor using the best combination reported by the grid search.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=12,
    min_child_weight=5,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the scores, one item per line.
print(
    "XGBoost Model Performance:",
    f"R²-score: {r2:.6f}",
    f"RMSE: {rmse:.6f}",
    sep="\n",
)


R²-score: 0.767338
RMSE: 44.512071

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Remove the columns regarded as contributing little; absent names are skipped.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X = X.drop(columns=features_to_remove, errors='ignore')

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor using the best combination reported by its grid search.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=12,
    min_child_weight=5,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Random forest using the best parameters reported by its grid search.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=50,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf.fit(X_train, y_train)

# Test-set predictions of each model.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Ensemble prediction: the plain average of the two models' outputs.
y_pred_sum = y_pred_xgb + y_pred_rf
y_pred_combined = y_pred_sum / 2

# R2 and RMSE of the averaged prediction on the held-out rows.
r2_combined = r2_score(y_test, y_pred_combined)
mse_combined = mean_squared_error(y_test, y_pred_combined)
rmse_combined = np.sqrt(mse_combined)

# Report the ensemble scores, one item per line.
print(
    "Gecombineerd Model Prestaties:",
    f"R²-score: {r2_combined:.6f}",
    f"RMSE: {rmse_combined:.6f}",
    sep="\n",
)


Gecombineerd Model Prestaties:
R²-score: 0.769368
RMSE: 44.317542

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the cleaned dataset from the working directory.
df = pd.read_csv("df_cleaned.csv")

# Target column and the full predictor matrix.
y = df['LAID_UP_TIME']
X = df.drop(columns=['LAID_UP_TIME'])

# Remove the columns regarded as contributing little; absent names are skipped.
features_to_remove = [
    'PC-3', 'PC-2', 'PC-4',
    'COLOR', 'VEHICLE_TYPE', 'ACCIDENT_VEHICLE',
    'PURCHASE_WEEKDAY', 'MAX_TRAILOR_LOAD', 'VEHICLE_MODEL_ID_NAME',
]
X = X.drop(columns=features_to_remove, errors='ignore')

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor using the best combination reported by its grid search.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=12,
    min_child_weight=5,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Random forest using the best parameters reported by its grid search.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=50,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf.fit(X_train, y_train)

# Test-set predictions of each model.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Ensemble prediction: the plain average of the two models' outputs.
y_pred_sum = y_pred_xgb + y_pred_rf
y_pred_combined = y_pred_sum / 2

# Starting R2 and RMSE of the averaged prediction on the held-out rows.
r2_combined = r2_score(y_test, y_pred_combined)
mse_combined = mean_squared_error(y_test, y_pred_combined)
rmse_combined = np.sqrt(mse_combined)

# Report the starting ensemble scores, one item per line.
print(
    "Initiële Gecombineerd Model Prestaties:",
    f"R²-score: {r2_combined:.6f}",
    f"RMSE: {rmse_combined:.6f}",
    sep="\n",
)

# Post-hoc offset search: repeatedly lower every combined prediction by one step and
# keep the shift for as long as the scores against y_test improve.
#
# NOTE(review): the shift is chosen using y_test and the "final" scores are then
# reported on that same y_test, so they are optimistically biased. For an unbiased
# estimate, choose the shift on a separate validation split (or on out-of-fold
# training predictions) and apply it to the test predictions once.

# Size of a single downward shift, in the units of the target. The final print label
# below mentions "-1", so update it as well if this value is changed.
shift_step = 1

# Best scores seen so far; the search starts from the unshifted ensemble.
previous_r2 = r2_combined
previous_rmse = rmse_combined

# Number of downward shifts accepted so far (each one subtracts shift_step).
increase_count = 0

# Keep shifting until a candidate no longer improves either metric.
while True:
    # Candidate: the currently accepted predictions lowered by one more step.
    y_pred_combined_updated = y_pred_combined - shift_step

    # Scores of the candidate on the held-out rows.
    r2_updated = r2_score(y_test, y_pred_combined_updated)
    rmse_updated = np.sqrt(mean_squared_error(y_test, y_pred_combined_updated))

    if r2_updated > previous_r2 or rmse_updated < previous_rmse:
        # Improvement: accept the candidate, remember its scores, count the shift.
        y_pred_combined = y_pred_combined_updated
        previous_r2 = r2_updated
        previous_rmse = rmse_updated
        increase_count += 1
    else:
        # No improvement: stop and keep the last accepted predictions.
        break

# Report the final scores and how many shifts were applied.
print("\nFinale Gecombineerd Model Prestaties na aanpassen:")
print(f"R²-score: {previous_r2:.6f}")
print(f"RMSE: {previous_rmse:.6f}")
print(f"Aantal keer -1 toegevoegd: {increase_count}")


Initiële Gecombineerd Model Prestaties:
R²-score: 0.769368
RMSE: 44.317542

Finale Gecombineerd Model Prestaties na aanpassen:
R²-score: 0.769510
RMSE: 44.303875
Aantal keer -1 toegevoegd: 1