In [None]:
import pandas
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

# predictions from before replacement

In [None]:
# Load the cleaned "before replacement" dataset into a DataFrame.
water_data_before = pandas.read_csv(filepath_or_buffer='../data/cleaned_before.csv')

In [None]:
# Display summary statistics of the "before replacement" dataset as the cell output.
water_data_before.describe()

In [None]:
# Target: the energy proxy column, kept as a pandas Series.
y = water_data_before['energy_proxy']
# Single predictor: water temperature reshaped to a 2-D (n_samples, 1) array.
X = water_data_before['water_temperature_C'].values.reshape(-1, 1)

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Baseline random forest regressor with default hyperparameters; the fixed
# seed keeps the fit reproducible.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Fit on the training split.
random_forest_regressor.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = random_forest_regressor.predict(X_test)

# Evaluation metrics computed on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the scores, one per line.
print(
    f"Mean absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

In [None]:
# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Random forest baseline (before replacement)')
plot.show()

### predictions before replacement - hyperparameters + lag features

In [None]:
# Reload the cleaned "before replacement" dataset so this section starts from a fresh frame.
water_data_before = pandas.read_csv(filepath_or_buffer='../data/cleaned_before.csv')

In [None]:
def create_lag_features(df, feature, lag=3):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For each ``i`` in ``1..lag`` a column named ``{feature}_lag_{i}`` is added
    that holds ``df[feature]`` shifted down by ``i`` rows. Rows containing any
    NaN (which includes the leading rows left incomplete by the shift) are
    then dropped.

    The input frame is not modified.

    Args:
        df: Source DataFrame containing the column ``feature``.
        feature: Name of the column to lag.
        lag: Number of lag columns to create (default 3).

    Returns:
        A new DataFrame with the lag columns appended and NaN rows removed.
    """
    lagged_columns = {
        f'{feature}_lag_{i}': df[feature].shift(i) for i in range(1, lag + 1)
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(**lagged_columns).dropna()

# Add the default three lag columns of the water temperature and drop the NaN rows.
water_data_before = create_lag_features(
    df=water_data_before,
    feature='water_temperature_C',
)

In [None]:
# Features: the current water temperature plus every lag column produced by
# create_lag_features (named 'water_temperature_C_lag_<i>'). Using only the
# current temperature would leave the lag features out of the model entirely.
lag_columns = [
    column for column in water_data_before.columns
    if str(column).startswith('water_temperature_C_lag_')
]
feature_columns = ['water_temperature_C'] + lag_columns
X = water_data_before[feature_columns].values
y = water_data_before['energy_proxy']

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    n_estimators=[200],
    max_depth=[5, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator; the fixed seed keeps each fit reproducible.
random_forest_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best model found by the search, evaluated on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the winning parameters and the test scores, one per line.
print(
    f"Best Parameters: {grid_search.best_params_}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

In [None]:
# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Tuned random forest (before replacement)')
plot.show()

# predictions after replacement

In [None]:
# Load the cleaned "after replacement" dataset into a DataFrame.
# NOTE(review): this reads from './waterinfo/' while the "before" dataset is read
# from '../data/' — confirm both directories exist relative to the notebook.
water_data_after = pandas.read_csv(filepath_or_buffer='./waterinfo/cleaned_after.csv')

In [None]:
# Display summary statistics of the "after replacement" dataset as the cell output.
water_data_after.describe()

In [None]:
# Target: the energy proxy column, kept as a pandas Series.
y = water_data_after['energy_proxy']
# Single predictor: water temperature reshaped to a 2-D (n_samples, 1) array.
X = water_data_after['water_temperature_C'].values.reshape(-1, 1)

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Baseline random forest regressor with default hyperparameters; the fixed
# seed keeps the fit reproducible.
random_forest_regressor = RandomForestRegressor(random_state=42)

# Fit on the training split.
random_forest_regressor.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = random_forest_regressor.predict(X_test)

# Evaluation metrics computed on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the scores, one per line.
print(
    f"Mean absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

In [None]:
# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Random forest baseline (after replacement)')
plot.show()

### predictions after replacement - hyperparameters + lag features

In [None]:
# Reload the cleaned "after replacement" dataset so this section starts from a fresh frame.
water_data_after = pandas.read_csv(filepath_or_buffer='./waterinfo/cleaned_after.csv')

In [None]:
def create_lag_features(df, feature, lag=3):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For each ``i`` in ``1..lag`` a column named ``{feature}_lag_{i}`` is added
    that holds ``df[feature]`` shifted down by ``i`` rows. Rows containing any
    NaN (which includes the leading rows left incomplete by the shift) are
    then dropped.

    The input frame is not modified.

    Args:
        df: Source DataFrame containing the column ``feature``.
        feature: Name of the column to lag.
        lag: Number of lag columns to create (default 3).

    Returns:
        A new DataFrame with the lag columns appended and NaN rows removed.
    """
    lagged_columns = {
        f'{feature}_lag_{i}': df[feature].shift(i) for i in range(1, lag + 1)
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(**lagged_columns).dropna()

# Add the default three lag columns of the water temperature and drop the NaN rows.
water_data_after = create_lag_features(
    df=water_data_after,
    feature='water_temperature_C',
)

In [None]:
# Features: the current water temperature plus every lag column produced by
# create_lag_features (named 'water_temperature_C_lag_<i>'). Using only the
# current temperature would leave the lag features out of the model entirely.
lag_columns = [
    column for column in water_data_after.columns
    if str(column).startswith('water_temperature_C_lag_')
]
feature_columns = ['water_temperature_C'] + lag_columns
X = water_data_after[feature_columns].values
y = water_data_after['energy_proxy']

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 3],
)

# Base estimator; the fixed seed keeps each fit reproducible.
random_forest_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best model found by the search, evaluated on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the winning parameters and the test scores, one per line.
print(
    f"Best Parameters: {grid_search.best_params_}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

In [None]:
# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Tuned random forest (after replacement)')
plot.show()

# predictions entire dataset

In [None]:
# Load the cleaned full dataset and display its summary statistics as the cell output.
all_water_data = pandas.read_csv(filepath_or_buffer='../data/cleaned_all.csv')
all_water_data.describe()

In [None]:
# Features and target: water temperature as a 2-D (n_samples, 1) array and the
# energy proxy column as the target Series.
X = all_water_data['water_temperature_C'].values.reshape(-1, 1)
y = all_water_data['energy_proxy']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): this section fits a LinearRegression with test_size=0.2, whereas the
# other baseline sections use RandomForestRegressor with test_size=0.25 — confirm
# this difference is intended before comparing scores across sections.
linear_regression = LinearRegression()

# Fit on the training split.
linear_regression.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = linear_regression.predict(X_test)

# Evaluation metrics computed on the test split.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the scores.
print(f"Mean absolute Error: {mae}")
print(f"R-squared: {r2}")

# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Linear regression (entire dataset)')
plot.show()

### predictions entire dataset - hyperparameters + lag features

In [None]:
# Reload the cleaned full dataset and display its summary statistics as the cell output.
# NOTE(review): an earlier cell reads cleaned_all.csv from '../data/' while this one
# reads it from './waterinfo/' — confirm which location is correct.
all_water_data = pandas.read_csv(filepath_or_buffer='./waterinfo/cleaned_all.csv')
all_water_data.describe()

In [None]:
def create_lag_features(df, feature, lag=3):
    """Return a copy of ``df`` extended with lagged versions of ``feature``.

    For each ``i`` in ``1..lag`` a column named ``{feature}_lag_{i}`` is added
    that holds ``df[feature]`` shifted down by ``i`` rows. Rows containing any
    NaN (which includes the leading rows left incomplete by the shift) are
    then dropped.

    The input frame is not modified.

    Args:
        df: Source DataFrame containing the column ``feature``.
        feature: Name of the column to lag.
        lag: Number of lag columns to create (default 3).

    Returns:
        A new DataFrame with the lag columns appended and NaN rows removed.
    """
    lagged_columns = {
        f'{feature}_lag_{i}': df[feature].shift(i) for i in range(1, lag + 1)
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(**lagged_columns).dropna()

In [None]:

# Add the default three lag columns of the water temperature and drop the NaN rows.
all_water_data = create_lag_features(
    df=all_water_data,
    feature='water_temperature_C',
)

In [None]:
# Features: the current water temperature plus every lag column produced by
# create_lag_features (named 'water_temperature_C_lag_<i>'). Using only the
# current temperature would leave the lag features out of the model entirely.
lag_columns = [
    column for column in all_water_data.columns
    if str(column).startswith('water_temperature_C_lag_')
]
feature_columns = ['water_temperature_C'] + lag_columns
X = all_water_data[feature_columns].values
y = all_water_data['energy_proxy']

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Hyperparameter candidates explored by the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}

# Base estimator; the fixed seed keeps each fit reproducible.
random_forest_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(
    estimator=random_forest_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best model found by the search, evaluated on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluation metrics computed on the test split.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the winning parameters and the test scores.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Predicted (x) vs. actual (y) energy_proxy on the test split. The axes are
# labelled explicitly so the figure can be read on its own.
plot.scatter(y_pred, y_test, alpha=0.4)
plot.xlabel('Predicted energy_proxy')
plot.ylabel('Actual energy_proxy')
plot.title('Tuned random forest (entire dataset)')
plot.show()