# before replacement

In [None]:
import pandas
import matplotlib.pyplot as plot
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the cleaned "before replacement" measurements, turn the 'datetime'
# strings into real timestamps, and discard 'undercarriage_replacement',
# which holds the same value for every row of this dataset.
before_water_data = (
    pandas.read_csv('../data/cleaned_before.csv')
    .assign(datetime=lambda frame: pandas.to_datetime(frame['datetime']))
    .drop(columns='undercarriage_replacement')
)

In [None]:
def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of one column.

    For every ``i`` in ``1..lag`` with ``(i - 1) % step_size == 0`` a column
    ``f'{feature}_lag_{i}'`` is added that holds ``df[feature]`` shifted down
    by ``i`` rows (so ``lag=6, step_size=2`` yields lags 1, 3 and 5).

    Args:
        df: Source DataFrame. It is NOT modified; the work happens on a copy.
        feature: Name of the column to lag.
        lag: Largest lag (in rows) that is considered.
        step_size: Spacing between consecutive lags, starting at lag 1.

    Returns:
        A new DataFrame with the lag columns appended and every row that
        contains a NaN in any column removed (this drops the leading rows
        that have no lagged value).
    """
    # Work on a copy so the caller's frame is not silently given new columns
    # and so re-running the calling cell always starts from the same input.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Shifting leaves NaN in the first rows; remove those rows.
    return lagged.dropna()

In [None]:
# Add lagged water-level columns (lag=6, step_size=2) and keep the NaN-free result.
before_water_data = create_lag_features(before_water_data, 'water_level_cmNAP', 6, 2)
# Disabled experiment: lag features for the air temperature.
# before_water_data = create_lag_features(before_water_data, 'air_temperature_01C', 3, 1)
before_water_data

In [None]:
# Encode lock movement as an integer flag (closing -> 0, opening -> 1) so the
# model can use it. The explicit cast matters: a later cell keeps only numeric
# columns via select_dtypes, and without the cast the column can stay `object`
# (pandas no longer silently downcasts after replace) and would be dropped.
before_water_data['lock_movement'] = (
    before_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)

# Reference point: the earliest timestamp in the data. min() keeps this correct
# even if the rows are not sorted by time.
min_date = before_water_data['datetime'].min()

# Whole days elapsed since that first measurement.
before_water_data['days_since_first_measurement'] = (before_water_data['datetime'] - min_date).dt.days

In [None]:
def addDates(df):
    """Placeholder for calendar features derived from the 'datetime' column.

    Splitting the timestamp into separate day / month / year / hour / minute
    columns did not improve the model during testing, so those assignments
    are left disabled below and ``df`` is returned exactly as it was passed in.
    """
    # Disabled feature candidates:
    #   df['day'] = df.datetime.dt.day
    #   df['month'] = df.datetime.dt.month
    #   df['year'] = df.datetime.dt.year
    #   df['hour'] = df.datetime.dt.hour
    #   df['minute'] = df.datetime.dt.minute
    return df

# Run the date-feature helper (currently a no-op that returns the frame unchanged).
before_water_data = addDates(before_water_data)


In [None]:
# Pairwise correlation between the numeric (float64 / int64 / int32) columns.
whole_corr = (
    before_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)

In [None]:
import seaborn

# Draw the correlation matrix as an annotated heatmap on an explicit 10x8 figure.
corr_fig, corr_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=corr_ax)

# Render the figure.
plot.show()

In [None]:
# Keep only the float / int columns; the remaining columns are not used by the model.
before_water_data = before_water_data.select_dtypes(include=[float, int])
numeric_features = before_water_data.columns

# Target is 'energy_proxy'; it and 'air_temperature_01C' are left out of the features.
y = before_water_data['energy_proxy']
X = before_water_data.drop(columns=['energy_proxy', 'air_temperature_01C'])

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): train_test_split shuffles rows by default; with lagged
# water-level features, neighbouring time steps can land on both sides of the
# split — confirm a random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X


In [None]:
# Hyper-parameter candidates for the random forest.
param_grid = {
    'n_estimators': [600],
    'max_depth': [10, 12],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [4, 6],
}

# Base estimator with a fixed seed so repeated runs give the same forest.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search over the grid on all available cores. No `scoring`
# is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best parameter combination, refitted by the search.
best_rf = grid_search.best_estimator_

# Evaluate that model on the held-out test set.
y_pred = best_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the scores.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Per-fold MAE on the training data (the scorer is negated, hence the minus sign).
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-validate: {-cv_scores}")


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# plot_tree is given the feature names as a plain list.
feature_names = list(X_train.columns)

# Show only the top three levels of the first tree in the forest, with a small
# font so the node labels fit.
plt.figure(figsize=(20, 10))
plot_tree(
    best_rf.estimators_[0],
    feature_names=feature_names,
    max_depth=3,
    proportion=True,
    filled=True,
    rounded=True,
    fontsize=7,
)
plt.show()

In [None]:
# Predicted (x-axis) against actual (y-axis) values for the test set.
plot.scatter(y_pred, y_test, alpha=0.4)

# Red reference diagonal, spanning the range of the predictions.
pred_low, pred_high = min(y_pred), max(y_pred)
plot.plot([pred_low, pred_high], [pred_low, pred_high], color='red', alpha=0.5)

# Title and axis labels.
plot.title('Scatter plot of Predicted vs Actual values')
plot.xlabel('Predicted values')
plot.ylabel('Actual values')

# Render the figure.
plot.show()

In [None]:
# Pair every feature name with the importance the best forest assigned to it.
feature_importances = best_rf.feature_importances_
features = X_train.columns

# Build the table already ordered from most to least important.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()


# after replacement

In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score


In [None]:
# Read the cleaned "after replacement" measurements, turn the 'datetime'
# strings into real timestamps, and discard 'undercarriage_replacement',
# which holds the same value for every row of this dataset.
after_water_data = (
    pandas.read_csv('../data/cleaned_after.csv')
    .assign(datetime=lambda frame: pandas.to_datetime(frame['datetime']))
    .drop(columns='undercarriage_replacement')
)


In [None]:
def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of one column.

    For every ``i`` in ``1..lag`` with ``(i - 1) % step_size == 0`` a column
    ``f'{feature}_lag_{i}'`` is added that holds ``df[feature]`` shifted down
    by ``i`` rows (so ``lag=6, step_size=2`` yields lags 1, 3 and 5).

    Args:
        df: Source DataFrame. It is NOT modified; the work happens on a copy.
        feature: Name of the column to lag.
        lag: Largest lag (in rows) that is considered.
        step_size: Spacing between consecutive lags, starting at lag 1.

    Returns:
        A new DataFrame with the lag columns appended and every row that
        contains a NaN in any column removed (this drops the leading rows
        that have no lagged value).
    """
    # Work on a copy so the caller's frame is not silently given new columns
    # and so re-running the calling cell always starts from the same input.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Shifting leaves NaN in the first rows; remove those rows.
    return lagged.dropna()


In [None]:
# Add lagged water-level columns (lag=6, step_size=2) and keep the NaN-free result.
after_water_data = create_lag_features(after_water_data, 'water_level_cmNAP', 6, 2)
after_water_data


In [None]:
# Encode lock movement as an integer flag (closing -> 0, opening -> 1) so the
# model can use it. The explicit cast matters: a later cell keeps only numeric
# columns via select_dtypes, and without the cast the column can stay `object`
# (pandas no longer silently downcasts after replace) and would be dropped.
after_water_data['lock_movement'] = (
    after_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)

# Reference point: the earliest timestamp in the data. min() keeps this correct
# even if the rows are not sorted by time.
min_date = after_water_data['datetime'].min()

# Whole days elapsed since that first measurement.
after_water_data['days_since_first_measurement'] = (after_water_data['datetime'] - min_date).dt.days
after_water_data


In [None]:
def addDates(df):
    """Placeholder for calendar features derived from the 'datetime' column.

    Splitting the timestamp into separate day / month / year / hour / minute
    columns did not improve the model during testing, so those assignments
    are left disabled below and ``df`` is returned exactly as it was passed in.
    """
    # Disabled feature candidates:
    #   df['day'] = df.datetime.dt.day
    #   df['month'] = df.datetime.dt.month
    #   df['year'] = df.datetime.dt.year
    #   df['hour'] = df.datetime.dt.hour
    #   df['minute'] = df.datetime.dt.minute
    return df


In [None]:

# Run the date-feature helper (currently a no-op that returns the frame unchanged).
after_water_data = addDates(after_water_data)

In [None]:

# Pairwise correlation between the numeric (float64 / int64 / int32) columns.
whole_corr = (
    after_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)


In [None]:
import seaborn

# Draw the correlation matrix as an annotated heatmap on an explicit 10x8 figure.
corr_fig, corr_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=corr_ax)

# Render the figure.
plot.show()

In [None]:
# Keep only the float / int columns; the remaining columns are not used by the model.
after_water_data = after_water_data.select_dtypes(include=[float, int])
numeric_features = after_water_data.columns

# Target is 'energy_proxy'; every other numeric column is a feature.
# NOTE(review): the other two sections of this notebook also drop
# 'air_temperature_01C' from the features; here it is kept — confirm intended.
y = after_water_data['energy_proxy']
X = after_water_data.drop(columns=['energy_proxy'])

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): train_test_split shuffles rows by default; with lagged
# water-level features, neighbouring time steps can land on both sides of the
# split — confirm a random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X


In [None]:

# Hyper-parameter candidates for the random forest.
param_grid = {
    'n_estimators': [600],
    'max_depth': [12],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [4, 6],
}

# Base estimator with a fixed seed so repeated runs give the same forest.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search over the grid on all available cores. No `scoring`
# is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best parameter combination, refitted by the search.
best_rf = grid_search.best_estimator_

# Evaluate that model on the held-out test set.
y_pred = best_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the scores.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Per-fold MAE on the training data (the scorer is negated, hence the minus sign).
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-validate: {-cv_scores}")



In [None]:
# Actual and predicted energy proxy for the test rows, ordered by their
# original row index (used here as a stand-in for time).
results = pandas.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results = results.sort_index()

# Uses the `plot` alias imported at the top of this section; the `plt` alias is
# only imported in a later cell, so relying on it here would depend on cell
# execution order.
plot.figure(figsize=(14, 7))
plot.scatter(results.index, results['Actual'], label='Actual', color='blue')
plot.scatter(results.index, results['Predicted'], label='Predicted', color='red', linestyle='--', alpha=0.4)
plot.xlabel('Index (Time)')
plot.ylabel('Energy Proxy')
plot.title('Actual vs Predicted Values over Time')
plot.legend()
plot.show()

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# plot_tree is given the feature names as a plain list.
feature_names = list(X_train.columns)

# Show only the top three levels of the first tree in the forest, with a small
# font so the node labels fit.
plt.figure(figsize=(20, 10))
plot_tree(
    best_rf.estimators_[0],
    feature_names=feature_names,
    max_depth=3,
    proportion=True,
    filled=True,
    rounded=True,
    fontsize=7,
)
plt.show()


In [None]:
# Predicted (x-axis) against actual (y-axis) values for the test set.
plot.scatter(y_pred, y_test, alpha=0.3)

# Red reference diagonal, spanning the range of the predictions.
pred_low, pred_high = min(y_pred), max(y_pred)
plot.plot([pred_low, pred_high], [pred_low, pred_high], color='red', alpha=0.5)

# Title and axis labels.
plot.title('Energy proxy predictions of RandomForest regression vs actual values')
plot.xlabel('Predicted values')
plot.ylabel('Actual values')

# Render the figure.
plot.show()


In [None]:
# Pair every feature name with the importance the best forest assigned to it.
feature_importances = best_rf.feature_importances_
features = X_train.columns

# Build the table already ordered from most to least important.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()

# all data

In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score


In [None]:
# NOTE(review): this section is titled "all data" but reads cleaned_after.csv,
# the same file the "after replacement" section loads — confirm this is the
# intended input file.
all_water_data = pandas.read_csv('../data/cleaned_after.csv')

# Convert the 'datetime' column to real datetime objects.
all_water_data['datetime'] = pandas.to_datetime(all_water_data['datetime'])
all_water_data.info()


In [None]:

def create_lag_features(df, feature, lag, step_size):
    """Return a copy of ``df`` extended with lagged versions of one column.

    For every ``i`` in ``1..lag`` with ``(i - 1) % step_size == 0`` a column
    ``f'{feature}_lag_{i}'`` is added that holds ``df[feature]`` shifted down
    by ``i`` rows (so ``lag=6, step_size=2`` yields lags 1, 3 and 5).

    Args:
        df: Source DataFrame. It is NOT modified; the work happens on a copy.
        feature: Name of the column to lag.
        lag: Largest lag (in rows) that is considered.
        step_size: Spacing between consecutive lags, starting at lag 1.

    Returns:
        A new DataFrame with the lag columns appended and every row that
        contains a NaN in any column removed (this drops the leading rows
        that have no lagged value).
    """
    # Work on a copy so the caller's frame is not silently given new columns
    # and so re-running the calling cell always starts from the same input.
    lagged = df.copy()
    for i in range(1, lag + 1):
        if (i - 1) % step_size == 0:
            lagged[f'{feature}_lag_{i}'] = lagged[feature].shift(i)
    # Shifting leaves NaN in the first rows; remove those rows.
    return lagged.dropna()


In [None]:

# Add lagged water-level columns (lag=6, step_size=2) and keep the NaN-free result.
all_water_data = create_lag_features(all_water_data, 'water_level_cmNAP', 6, 2)
# Disabled experiment: lag features for the water temperature, plus a preview.
# # all_water_data = create_lag_features(all_water_data, 'water_temperature_C', 1, 3)
# all_water_data[0:20]


In [None]:

# Encode lock movement as an integer flag (closing -> 0, opening -> 1) so the
# model can use it. The explicit cast matters: a later cell keeps only numeric
# columns via select_dtypes and then filters on `lock_movement == 1`; without
# the cast the column can stay `object` (pandas no longer silently downcasts
# after replace) and would be dropped.
all_water_data['lock_movement'] = (
    all_water_data['lock_movement']
    .replace({'closing': 0, 'opening': 1})
    .astype('int64')
)


In [None]:
# Find the first row whose 'undercarriage_replacement' value differs from the
# row before it, i.e. the moment of the replacement. (Testing `diff() == 0`
# instead selects the first row that is *unchanged*, which is simply the
# second row of the frame.)
replacement_flag = all_water_data['undercarriage_replacement']
flag_values = replacement_flag.to_numpy()
changed_rows = replacement_flag.index[1:][flag_values[1:] != flag_values[:-1]]

# `min_date` is the index label of the reference row. If the flag never changes
# in the loaded data, fall back to the first row so the column is still defined.
min_date = changed_rows[0] if len(changed_rows) > 0 else all_water_data.index[0]

# Whole days between each measurement and the reference row (negative for
# measurements taken before it).
all_water_data['date difference'] = (all_water_data['datetime'] - all_water_data.datetime[min_date]).dt.days
all_water_data.head()


In [None]:
def addDates(df):
    """Placeholder for calendar features derived from the 'datetime' column.

    Splitting the timestamp into separate day / month / year / hour / minute
    columns did not improve the model during testing, so those assignments
    are left disabled below and ``df`` is returned exactly as it was passed in.
    """
    # Disabled feature candidates:
    #   df['day'] = df.datetime.dt.day
    #   df['month'] = df.datetime.dt.month
    #   df['year'] = df.datetime.dt.year
    #   df['hour'] = df.datetime.dt.hour
    #   df['minute'] = df.datetime.dt.minute
    return df

# Run the date-feature helper (currently a no-op that returns the frame unchanged).
all_water_data = addDates(all_water_data)


In [None]:
# Pairwise correlation between the numeric (float64 / int64 / int32) columns.
whole_corr = (
    all_water_data
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .corr()
)


In [None]:
import seaborn
# Draw the correlation matrix as an annotated heatmap on an explicit 10x8 figure.
corr_fig, corr_ax = plot.subplots(figsize=(10, 8))
seaborn.heatmap(whole_corr, annot=True, cmap='coolwarm', ax=corr_ax)

# Render the figure.
plot.show()


In [None]:

# Keep only the float / int columns; the remaining columns are not used by the model.
all_water_data = all_water_data.select_dtypes(include=[float, int])
numeric_features = all_water_data.columns

# Target is 'energy_proxy'; it and 'air_temperature_01C' are left out of the features.
y = all_water_data['energy_proxy']
X = all_water_data.drop(columns=['energy_proxy', 'air_temperature_01C'])

# Hold out 25% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): train_test_split shuffles rows by default; with lagged
# water-level features, neighbouring time steps can land on both sides of the
# split — confirm a random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X


In [None]:
# Hyper-parameter candidates for the random forest.
param_grid = {
    'n_estimators': [600, 1200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [3, 6, 9, 12],
}

# Base estimator with a fixed seed so repeated runs give the same forest.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search over the grid on all available cores. No `scoring`
# is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best parameter combination, refitted by the search.
best_rf = grid_search.best_estimator_

# Evaluate that model on the held-out test set.
y_pred = best_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the scores.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Per-fold MAE on the training data (the scorer is negated, hence the minus sign).
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-validate: {-cv_scores}")

In [None]:
# Test-set metrics for the current predictions: MAPE in addition to the MAE and
# R-squared that the grid-search cell already reported.
# NOTE(review): mean_absolute_percentage_error is only imported in the first
# import cell of the notebook, not in this section's import cell.
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the scores
print(f"Best mape: {mape}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

In [None]:
# Predicted (x-axis) against actual (y-axis) values for the test set.
plot.scatter(y_pred, y_test, alpha=0.3)

# Red reference diagonal, spanning the range of the predictions.
pred_low, pred_high = min(y_pred), max(y_pred)
plot.plot([pred_low, pred_high], [pred_low, pred_high], color='red', alpha=0.5)

# Title and axis labels.
plot.title('Scatter plot of Predicted vs Actual values')
plot.xlabel('Predicted values')
plot.ylabel('Actual values')

# Render the figure.
plot.show()

In [None]:
# Actual and predicted values for the test rows, ordered by row index. Only the
# disabled scatter calls below use this table.
results = pandas.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results = results.sort_index()

# Row masks for the two lock movements (1 = opening, 0 = closing).
is_opening = all_water_data.lock_movement == 1
is_closing = all_water_data.lock_movement == 0

# Uses the `plot` alias imported at the top of this section; the `plt` alias is
# only imported in a later cell, so relying on it here would depend on cell
# execution order.
plot.figure(figsize=(14, 7))
# plot.scatter(results.index, results['Actual'], label='Actual', color='blue')
plot.scatter(all_water_data.index[is_opening], all_water_data.energy_proxy[is_opening], label='lock opening', color='blue', alpha=0.5)
plot.scatter(all_water_data.index[is_closing], all_water_data.energy_proxy[is_closing], label='lock closing', color='green', alpha=0.5)

# plot.scatter(results.index, results['Predicted'], label='Predicted', color='red', linestyle='--', alpha=0.4)
plot.xlabel('Index (Time)')
plot.ylabel('Energy Proxy')
plot.title('Energy Usage of the Lock (Opening vs Closing)')
plot.legend()
plot.show()

In [None]:
# Pair every feature name with the importance the best forest assigned to it.
feature_importances = best_rf.feature_importances_
features = X_train.columns

# Build the table already ordered from most to least important.
importance_df = pandas.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking.
plot.figure(figsize=(10, 8))
seaborn.barplot(data=importance_df, x='Importance', y='Feature')
plot.title('Feature Importance')
plot.show()

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# plot_tree is given the feature names as a plain list.
feature_names = list(X_train.columns)

# Show only the top three levels of the first tree in the forest, with a small
# font so the node labels fit.
plt.figure(figsize=(20, 10))
plot_tree(
    best_rf.estimators_[0],
    feature_names=feature_names,
    max_depth=3,
    proportion=True,
    filled=True,
    rounded=True,
    fontsize=7,
)
plt.show()


In [None]:
# Feature importances of the best estimator.
# NOTE(review): this cell repeats the feature-importance cell that appears
# earlier in this section line for line; one of the two can probably go.
feature_importances = best_rf.feature_importances_
features = X_train.columns

# Build a DataFrame so the importances can be plotted.
importance_df = pandas.DataFrame({
    'Feature': features,
    'Importance': feature_importances
})

# Sort from most to least important.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot
plot.figure(figsize=(10, 8))
seaborn.barplot(x='Importance', y='Feature', data=importance_df)
plot.title('Feature Importance')
plot.show()