In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor   


In [9]:
df = pd.read_csv('dataset_for_machine_learning.csv')

# Convert the date columns to datetimeint
df['DatumFileBeginInt'] = df['DatumFileBeginInt'].str.replace('-', '').astype(int)

# Define the road section researched
df = df[df['RouteOms']== 'A4']

#  Remove duplicate columns and unnecessary columns
columns_to_remove = ['RouteOms', 'latitude_x', 'longitude_x', 'provincie_x', 'latitude_y', 
                     'longitude_y', 'RouteLet', 'GemLengte', 'FileDuur', 'NLSitNummer',
                    'RouteNum', 'RouteLet_encoded', 'HectometerStaart', 'DatumFileBeginInt', 
                    'station_code', 'DatumTijdFileEind'
]

df = df.drop(columns=columns_to_remove)

df['DatumTijdFileBegin'] = pd.to_datetime(df['DatumTijdFileBegin']) 

# Define target column to drop
columns_to_drop = ['FileZwaarte', 'DatumTijdFileBegin']

features = [
    "HectometerKop",
    "SecondsSinceMidnight_Begin",
    "Diesel",
    "Aantal_check_ins_previous",
    "Aantal_check_ins_current",
    "Gasoline",
    "LPG",
    "P",
    "T",
    "KopWegvakVan_encoded",
    "KopWegvakNaar_encoded",
    "U",
    "Year",
    "hectometreringsrichting",
    "TrajVan_encoded",
    "provincie_x_encoded",
    "R",
    "OorzaakCode",
    "TrajNaar_encoded"
]



In [11]:
X = df[features]
y = df['FileZwaarte']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that scales the data then applies RandomForestRegressor
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(
        n_estimators=100, 
        max_features='sqrt', 
        max_depth=20, 
        min_samples_split=10, 
        min_samples_leaf=4, 
        random_state=42
    )
)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model on the test set
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test R2: {test_r2}")

Test MAE: 28.00078410696901
Test MSE: 2099.8670299269047
Test R2: 0.13908361119398616


In [13]:
X = df[features]
y = df['FileZwaarte']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that scales the data then applies XGBRegressor
pipeline = make_pipeline(
    StandardScaler(),
    XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=1.0,
        colsample_bytree=1.0,
        min_child_weight=9,
        random_state=42
    )
)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model on the test set
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test R2: {test_r2}")

Test MAE: 28.00966866495624
Test MSE: 2084.9169336446416
Test R2: 0.1452129435375089


In [14]:
X = df[features]
y = df['FileZwaarte']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that scales the data then applies LGBMRegressor
pipeline = make_pipeline(
    StandardScaler(),
    LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.7,
        colsample_bytree=0.7,
        min_child_weight=1,
        random_state=42
    )
)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model on the test set
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test R2: {test_r2}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2517
[LightGBM] [Info] Number of data points in the train set: 68761, number of used features: 19
[LightGBM] [Info] Start training from score 31.443126
Test MAE: 27.906966380919926
Test MSE: 2073.2693248413198
Test R2: 0.1499883017703253
