# before replacement

In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned "before" dataset from disk.
before_water_data = pandas.read_csv('../data/cleaned_before.csv')

# Turn the 'datetime' column into real datetime values.
before_water_data = before_water_data.assign(
    datetime=pandas.to_datetime(before_water_data['datetime'])
)


Deze functie maakt de lag features voor ons model. Hier kun je invullen hoeveel rijen (tijdstappen) je maximaal terug wilt kijken (`lag`) en met welke stapgrootte de lag features worden aangemaakt (`step_size`).

In [None]:
def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For every offset ``i`` in ``1..lag`` for which ``(i - 1) % step_size == 0``
    (1, 1 + step_size, 1 + 2 * step_size, ...) a column named
    ``{feature}_lag_{i}`` is added that holds ``df[feature].shift(i)``.
    Afterwards every row that contains a NaN in *any* column is dropped, which
    removes the leading rows left empty by the shift.

    Args:
        df: input DataFrame; it is not modified.
        feature: name of the column to lag.
        lag: largest offset (in rows) that is considered.
        step_size: spacing between the offsets that are actually created.

    Returns:
        A new DataFrame with the lag columns added and NaN rows removed.
    """
    # Work on a copy so the caller's frame does not silently gain the lag columns.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Drop the rows that became NaN because of the shift.
    return lagged.dropna()

In [None]:
# Add water-level lag columns (lag=6, step_size=2) and show the result.
before_water_data = create_lag_features(
    df=before_water_data,
    feature='water_level_cmNAP',
    lag=6,
    step_size=2,
)
before_water_data

Lock movement is toegevoegd omdat uit visualisaties bleek dat deze een grote correlatie heeft met het energieverbruik. De motor gebruikt meer energie wanneer de sluisdeuren open gaan dan wanneer ze dicht gaan.

In [None]:
# Encode lock movement numerically so the model can use it:
# 'closing' -> 0, 'opening' -> 1.
# Both labels are replaced in one pass and the result is cast explicitly.
# Relying on pandas to downcast the object column after `replace` is
# deprecated; if the column stays `object` it is silently dropped by the
# numeric `select_dtypes` filter that is applied before training.
# The cast raises if the column holds anything other than these two labels.
before_water_data['lock_movement'] = (
    before_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)


Onze theorie was dat de motor minder efficiënt draait naarmate het langer geleden is dat het onderstel is vervangen. Dit blijkt ook zo te zijn: het aantal dagen verschil heeft een sterke correlatie met het energieverbruik.

In [None]:
# Reference row: the first row whose 'undercarriage_replacement' value is equal
# to that of the previous row (diff == 0).
# NOTE(review): if the reference is meant to be the row where the undercarriage
# was replaced, this should presumably look for a change instead — confirm.
same_as_previous = before_water_data['undercarriage_replacement'].diff() == 0
min_date = before_water_data[same_as_previous].index[0]

# Number of whole days between each row and the reference row.
reference_time = before_water_data['datetime'].loc[min_date]
before_water_data['date difference'] = (
    before_water_data['datetime'] - reference_time
).dt.days
before_water_data.head()

Hier slaan we de correlaties op waar de heatmap van gemaakt wordt

In [None]:
# Pairwise correlations between the numeric columns (input for the heatmap).
whole_corr = (
    before_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)

In [None]:
import seaborn

# Annotated correlation heatmap on a 10x8 inch figure.
_, heatmap_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=heatmap_ax)

# Show the figure.
plot.show()

## Het model

Hier wordt de data opgesplitst in train en test data. Air temperature wordt hiervoor uitgesloten aangezien deze een hoge correlatie heeft met water temperatuur

In [None]:
# Keep only the numeric columns.
numeric_features = before_water_data.select_dtypes(include=[float, int]).columns
before_water_data = before_water_data.loc[:, numeric_features]

# Target is 'energy_proxy'; the features are all remaining columns except
# 'air_temperature_01C'.
y_before = before_water_data['energy_proxy']
X_before = before_water_data.drop(columns=['energy_proxy', 'air_temperature_01C'])

# 80/20 train/test split with a fixed seed.
(X_train_before, X_test_before,
 y_train_before, y_test_before) = train_test_split(
    X_before, y_before, test_size=0.2, random_state=42
)
X_before


In [None]:
# Hyper-parameter candidates for the random forest regressor.
param_grid = dict(
    n_estimators=[600],
    max_depth=[12],
    min_samples_split=[2, 4],
    min_samples_leaf=[6, 12],
)

# Base estimator with a fixed seed.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Grid search with 5-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_before, y_train_before)

# Best estimator / parameter combination found by the search.
best_rf = grid_search.best_estimator_

# Predict the held-out test set with it.
y_pred_before = best_rf.predict(X_test_before)


Het opslaan van de scores

In [None]:

# Test-set metrics.
r2 = r2_score(y_test_before, y_pred_before)
mae = mean_absolute_error(y_test_before, y_pred_before)

# Report the results.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# 5-fold cross-validation on the training set; the scorer returns the negated
# MAE, so the sign is flipped before printing.
cv_scores = cross_val_score(
    best_rf,
    X_train_before,
    y_train_before,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print(f"Cross-validate: {-cv_scores}")


Visualisaties van de voorspellingen

In [None]:
# Scatter plot: predictions on the x-axis, actual values on the y-axis.
plot.scatter(y_pred_before, y_test_before, alpha=0.2)

# Diagonal reference line spanning the range of the predictions.
prediction_range = [min(y_pred_before), max(y_pred_before)]
plot.plot(prediction_range, prediction_range, color='red', alpha=0.5)

# Axis labels and title. The title names the model that was actually trained
# (a random forest, not linear regression).
plot.xlabel('Voorspelde energy proxy')
plot.ylabel('Echte energy proxy')
plot.title('Energy proxy voorspellingen van random forest model v echte waarden')
plot.xlim(2750000, 4000000)
plot.ylim(2750000, 4000000)

# Show the figure.
plot.show()

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Only works when the column names are passed as a plain list.
feature_names = list(X_train_before.columns)

# Draw the first tree of the forest, limited to depth 3, with a small font so
# the node text fits.
plt.figure(figsize=(20, 10))
plot_tree(
    best_rf.estimators_[0],
    max_depth=3,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    proportion=True,
    fontsize=7,
)
plt.show()

In [None]:
# Actual and predicted test values side by side, ordered by row index.
results = pandas.DataFrame(
    {'Actual': y_test_before, 'Predicted': y_pred_before}
).sort_index()

plt.figure(figsize=(14, 7))
plt.scatter(results.index, results['Actual'], label='Actual', color='blue')
plt.scatter(results.index, results['Predicted'], label='Predicted', color='red', linestyle='--', alpha=0.4)
plt.xlabel('Index (Tijd)')
plt.ylabel('Energy Proxy')
# The title names the model that was actually trained (a random forest, not
# linear regression).
plt.title('Voorspelde waarden van random forest model & echte waarde')
plt.legend()
plt.show()

In [None]:
# Feature importances reported by the best estimator, with matching column names.
features = X_train_before.columns
feature_importances = best_rf.feature_importances_

# Table of importances, most important feature first.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()


# after replacement

In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score


### laden van de data

In [None]:
# Load the cleaned "after" dataset and parse its 'datetime' column.
after_water_data = pandas.read_csv('../data/cleaned_after.csv')
after_water_data = after_water_data.assign(
    datetime=pandas.to_datetime(after_water_data['datetime'])
)

### toevoegen van variabelen

Het toevoegen van de lag features. Hier worden 3 kolommen gemaakt (lag 1, 3 en 5) die lag features zijn van het waterniveau (`water_level_cmNAP`).

In [None]:
def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For every offset ``i`` in ``1..lag`` for which ``(i - 1) % step_size == 0``
    (1, 1 + step_size, 1 + 2 * step_size, ...) a column named
    ``{feature}_lag_{i}`` is added that holds ``df[feature].shift(i)``.
    Afterwards every row that contains a NaN in *any* column is dropped, which
    removes the leading rows left empty by the shift.

    Args:
        df: input DataFrame; it is not modified.
        feature: name of the column to lag.
        lag: largest offset (in rows) that is considered.
        step_size: spacing between the offsets that are actually created.

    Returns:
        A new DataFrame with the lag columns added and NaN rows removed.
    """
    # Work on a copy so the caller's frame does not silently gain the lag columns.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Drop the rows that became NaN because of the shift.
    return lagged.dropna()

# Add water-level lag columns (lag=6, step_size=2) and show the result.
after_water_data = create_lag_features(
    df=after_water_data,
    feature='water_level_cmNAP',
    lag=6,
    step_size=2,
)
after_water_data

voegt lock movement toe (dus of de lock opent of sluit) als boolean i.p.v. object

In [None]:
# Encode lock movement numerically: 'closing' -> 0, 'opening' -> 1.
# Both labels are replaced in one pass and the result is cast explicitly.
# Relying on pandas to downcast the object column after `replace` is
# deprecated; if the column stays `object` it is silently dropped by the
# numeric `select_dtypes` filter that is applied before training.
# The cast raises if the column holds anything other than these two labels.
after_water_data['lock_movement'] = (
    after_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)

Voegt als nieuwe variabele het aantal dagen tot/sinds de vervanging van het onderstel toe. Dit is gebaseerd op de theorie dat de motor minder efficiënt werkt naarmate het langer geleden is dat het onderstel is vervangen.

In [None]:
# Reference row: the first row whose 'undercarriage_replacement' value is equal
# to that of the previous row (diff == 0).
# NOTE(review): if the reference is meant to be the row where the undercarriage
# was replaced, this should presumably look for a change instead — confirm.
same_as_previous = after_water_data['undercarriage_replacement'].diff() == 0
min_date = after_water_data[same_as_previous].index[0]

# Number of whole days between each row and the reference row.
reference_time = after_water_data['datetime'].loc[min_date]
after_water_data['date difference'] = (
    after_water_data['datetime'] - reference_time
).dt.days
after_water_data.head()

### De correlaties en heatmap

In [None]:
# Pairwise correlations between the numeric columns (input for the heatmap).
whole_corr = (
    after_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)

In [None]:
import seaborn

# Annotated correlation heatmap on a 10x8 inch figure.
_, heatmap_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=heatmap_ax)

# Show the figure.
plot.show()

### het opsplitsen en voorspellen van de data. 

In [None]:
# Keep only the numeric columns.
numeric_features = after_water_data.select_dtypes(include=[float, int]).columns
after_water_data = after_water_data.loc[:, numeric_features]

# Target is 'energy_proxy'; the features are all remaining columns except
# 'air_temperature_01C'.
y_after = after_water_data['energy_proxy']
X_after = after_water_data.drop(columns=['energy_proxy', 'air_temperature_01C'])

# 80/20 train/test split with a fixed seed.
(X_train_after, X_test_after,
 y_train_after, y_test_after) = train_test_split(
    X_after, y_after, test_size=0.2, random_state=42
)
X_after


In [None]:

# Hyper-parameter candidates for the random forest regressor.
param_grid = dict(
    n_estimators=[600],
    max_depth=[10, 12],
    min_samples_split=[2, 4],
    min_samples_leaf=[6, 12],
)

# Base estimator with a fixed seed.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Grid search with 5-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_after, y_train_after)

# Best estimator / parameter combination found by the search.
best_rf = grid_search.best_estimator_

# Predict the held-out test set with it.
y_pred_after = best_rf.predict(X_test_after)


Hier worden de verschillende scores geprint, zoals de MAE, MAPE en de cross validatie resultaten

In [None]:
# Test-set metrics.
r2 = r2_score(y_test_after, y_pred_after)
mae = mean_absolute_error(y_test_after, y_pred_after)
mape = mean_absolute_percentage_error(y_test_after, y_pred_after)

# Report the results; MAPE is printed as a percentage.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best mape: {mape * 100}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# 5-fold cross-validation on the training set; the scorer returns the negated
# MAE, so the sign is flipped before printing.
cv_scores = cross_val_score(
    best_rf,
    X_train_after,
    y_train_after,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print(f"Cross-validate: {-cv_scores}")

### Visualisaties van de voorspellingen

In [None]:
# Scatter plot: predictions on the x-axis, actual values on the y-axis.
plot.scatter(y_pred_after, y_test_after, alpha=0.2)

# Diagonal reference line spanning the range of the predictions.
prediction_range = [min(y_pred_after), max(y_pred_after)]
plot.plot(prediction_range, prediction_range, color='red', alpha=0.5)

# Axis labels and title. The title names the model that was actually trained
# (a random forest, not linear regression).
plot.xlabel('Voorspelde energy proxy')
plot.ylabel('Echte energy proxy')
plot.title('Energy proxy voorspellingen van random forest model v echte waarden')
plot.xlim(2100000, 3100000)
plot.ylim(2100000, 3100000)

# Show the figure.
plot.show()

In [None]:
# Actual and predicted test values side by side, ordered by row index.
results = pandas.DataFrame(
    {'Actual': y_test_after, 'Predicted': y_pred_after}
).sort_index()

# Uses the `plot` alias imported at the top of this section; the `plt` alias is
# only imported in a later cell of this section.
plot.figure(figsize=(14, 7))
plot.scatter(results.index, results['Actual'], label='Actual', color='blue', s=35)
plot.scatter(results.index, results['Predicted'], label='Predicted', color='red', linestyle='--', alpha=0.4, s=35)
plot.xlabel('Index (Time)')
plot.ylabel('Energy Proxy')
# The title names the model that was actually trained (a random forest, not
# linear regression).
plot.title('Voorspelde waarden van random forest model & echte waarde')
plot.legend()
plot.show()

Het visualiseren van de eerste boom (`estimators_[0]`) van de random forest

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Only works when the column names are passed as a plain list.
feature_names = list(X_train_after.columns)

# Draw the first tree of the forest, limited to depth 3, with a small font so
# the node text fits.
plt.figure(figsize=(20, 10))
plot_tree(
    best_rf.estimators_[0],
    max_depth=3,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    proportion=True,
    fontsize=7,
)
plt.show()


feature importance

In [None]:
# Feature importances reported by the best estimator, with matching column names.
features = X_train_after.columns
feature_importances = best_rf.feature_importances_

# Table of importances, most important feature first.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()

# gehele dataset

In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score


### importeren van de data

In [None]:
# Load the complete cleaned dataset and parse its 'datetime' column.
all_water_data = pandas.read_csv('../data/cleaned_all.csv')
all_water_data = all_water_data.assign(
    datetime=pandas.to_datetime(all_water_data['datetime'])
)

# Print the column overview.
all_water_data.info()

### nieuwe features toevoegen

Dit voegt de lag features toe voor water level aan de hand van de functie. Hier kan je aangeven hoe veel rijen het model terug gaat, en in hoeveel stappen dit moet gebeuren

In [None]:
def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For every offset ``i`` in ``1..lag`` for which ``(i - 1) % step_size == 0``
    (1, 1 + step_size, 1 + 2 * step_size, ...) a column named
    ``{feature}_lag_{i}`` is added that holds ``df[feature].shift(i)``.
    Afterwards every row that contains a NaN in *any* column is dropped, which
    removes the leading rows left empty by the shift.

    Args:
        df: input DataFrame; it is not modified.
        feature: name of the column to lag.
        lag: largest offset (in rows) that is considered.
        step_size: spacing between the offsets that are actually created.

    Returns:
        A new DataFrame with the lag columns added and NaN rows removed.
    """
    # Work on a copy so the caller's frame does not silently gain the lag columns.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Drop the rows that became NaN because of the shift.
    return lagged.dropna()
# Add water-level lag columns (lag=6, step_size=2).
all_water_data = create_lag_features(
    df=all_water_data,
    feature='water_level_cmNAP',
    lag=6,
    step_size=2,
)

Voegt de richting dat de sluisdeuren gaan toe aan de dataset

In [None]:
# Encode lock movement numerically: 'closing' -> 0, 'opening' -> 1.
# Both labels are replaced in one pass and the result is cast explicitly.
# Relying on pandas to downcast the object column after `replace` is
# deprecated; if the column stays `object` it is silently dropped by the
# numeric `select_dtypes` filter that is applied before training, and the
# later opening/closing scatter plot can no longer find the column.
# The cast raises if the column holds anything other than these two labels.
all_water_data['lock_movement'] = (
    all_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)


Het aantal dagen verschil tussen de dag en wanneer het onderstel vervangen is

In [None]:
# Reference row: the first row whose 'undercarriage_replacement' value is equal
# to that of the previous row (diff == 0).
# NOTE(review): if the reference is meant to be the row where the undercarriage
# was replaced, this should presumably look for a change instead — confirm.
same_as_previous = all_water_data['undercarriage_replacement'].diff() == 0
min_date = all_water_data[same_as_previous].index[0]

# Number of whole days between each row and the reference row.
reference_time = all_water_data['datetime'].loc[min_date]
all_water_data['date difference'] = (
    all_water_data['datetime'] - reference_time
).dt.days
all_water_data.head()


### de correlaties en heatmap

In [None]:
# Pairwise correlations between the numeric columns of the complete dataset.
# This section analyses `all_water_data`, so the correlations are taken from
# that frame rather than from the "after" subset.
whole_corr = (
    all_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)

In [None]:
import seaborn
# Annotated correlation heatmap on a 10x8 inch figure.
_, heatmap_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=heatmap_ax)

# Show the figure.
plot.show()


### het opsplitsen en voorspellen met de data

de splitsing van de data

In [None]:

# Keep only the numeric columns.
numeric_features = all_water_data.select_dtypes(include=[float, int]).columns
all_water_data = all_water_data.loc[:, numeric_features]

# Target is 'energy_proxy'; the features are all remaining columns except
# 'air_temperature_01C'.
y_all = all_water_data['energy_proxy']
X_all = all_water_data.drop(columns=['energy_proxy', 'air_temperature_01C'])

# 80/20 train/test split with a fixed seed.
(X_train_all, X_test_all,
 y_train_all, y_test_all) = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)
X_all


het model en hyper parameters

In [None]:
# Hyper-parameter candidates for the random forest regressor.
param_grid = dict(
    n_estimators=[1000],
    max_depth=[None, 12],
    min_samples_split=[2, 5],
    min_samples_leaf=[9, 12],
)

# Base estimator with a fixed seed.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Grid search with 5-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_all, y_train_all)

# Best estimator / parameter combination found by the search.
best_rf = grid_search.best_estimator_

# Predict the held-out test set with it.
y_pred_all = best_rf.predict(X_test_all)

het opslaan en printen van de resultaten van het model

In [None]:
# Test-set metrics.
r2 = r2_score(y_test_all, y_pred_all)
mae = mean_absolute_error(y_test_all, y_pred_all)
mape = mean_absolute_percentage_error(y_test_all, y_pred_all)

# Report the results; MAPE is printed as a percentage.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best mape: {mape * 100}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# 5-fold cross-validation on the training set; the scorer returns the negated
# MAE, so the sign is flipped before printing.
cv_scores = cross_val_score(
    best_rf,
    X_train_all,
    y_train_all,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print(f"Cross-validate: {-cv_scores}")

### visualizaties over de voorspellingen

scatterplot voorspelde waarden v echte waarden

In [None]:
# Scatter plot: predictions on the x-axis, actual values on the y-axis.
plot.scatter(y_pred_all, y_test_all, alpha=0.2)

# Diagonal reference line spanning the range of the predictions.
prediction_range = [min(y_pred_all), max(y_pred_all)]
plot.plot(prediction_range, prediction_range, color='red', alpha=0.5)

# Axis labels and title. The title names the model that was actually trained
# (a random forest, not linear regression).
plot.xlabel('Voorspelde energy proxy')
plot.ylabel('Echte energy proxy')
plot.title('Energy proxy voorspellingen van random forest model v echte waarden')
plot.xlim(2000000, 4000000)
plot.ylim(2000000, 4000000)

# Show the figure.
plot.show()

Scatterplot van de energy proxy, waarbij de blauwe punten de deuren zijn die open gaan, en de groene punten de deuren die dicht gaan

In [None]:
# Actual and predicted test values side by side, ordered by row index.
# NOTE(review): `results` is not used by the plot in this cell.
results = pandas.DataFrame({'Actual': y_test_all, 'Predicted': y_pred_all}).sort_index()

plot.figure(figsize=(14, 7))

# One scatter series per door direction (lock_movement 1 = opening, 0 = closing).
for movement_code, series_label, series_color in (
    (1, 'lock opening', 'blue'),
    (0, 'lock closing', 'green'),
):
    selected = all_water_data.lock_movement == movement_code
    plot.scatter(
        all_water_data.index[selected],
        all_water_data.energy_proxy[selected],
        label=series_label,
        color=series_color,
        alpha=0.5,
    )

plot.xlabel('Index (Time)')
plot.ylabel('Energy Proxy')
plot.title('Energy Usage of the Lock (Opening vs Closing)')
plot.legend()
plot.show()

feature importance

In [None]:
# Feature importances reported by the best estimator, with matching column names.
features = X_train_all.columns
feature_importances = best_rf.feature_importances_

# Table of importances, most important feature first.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()

In [None]:
# Actual and predicted test values side by side, ordered by row index.
results = pandas.DataFrame(
    {'Actual': y_test_all, 'Predicted': y_pred_all}
).sort_index()

# Uses the `plot` alias imported at the top of this section; the `plt` alias is
# never imported in this section.
plot.figure(figsize=(14, 7))
plot.scatter(results.index, results['Actual'], label='Actual', color='blue')
plot.scatter(results.index, results['Predicted'], label='Predicted', color='red', linestyle='--', alpha=0.4)
plot.xlabel('Index (Time)')
plot.ylabel('Energy Proxy')
# The title names the model that was actually trained (a random forest, not
# linear regression).
plot.title('Voorspelde waarden van random forest model & echte waarde')
plot.legend()
plot.show()
