In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw hydro table. `df` itself is not modified below, so the
# full set of original columns remains available under that name.
df = pd.read_csv('Datasets/hydro.csv')

# Build the working frame in one chain: remove the columns at positions
# 7..13 of the raw frame, then remove the "year" column by name.
data = (
    df
    .drop(columns=df.columns[7:14])
    .drop(columns=["year"])
)
data

Unnamed: 0,installed_cap,humidity,wind,temp,precipitation,Energy_output
0,0.0,75.73,6.7,28.45,151.21,0.0
1,1610.0,30.63,7.5,64.34,47.54,1373.19
2,1115.0,73.22,7.0,25.08,170.79,4278.18
3,350.0,73.87,3.9,26.14,145.83,614.7
4,0.0,50.41,12.0,29.51,31.22,0.0
5,0.0,45.64,10.5,25.55,55.65,0.0
6,120.0,49.28,4.9,29.61,67.04,321.76
7,0.0,40.7,8.6,28.91,20.88,0.0
8,0.0,71.46,4.1,28.53,118.42,0.0
9,1990.0,47.57,15.0,29.95,49.06,6133.34


In [3]:
# Split the working frame into a feature matrix and a target vector.
#
# The target is selected by name instead of by a position taken from
# `df.columns`: `data` has had columns removed (including "year"), so a
# positional index into the raw frame does not necessarily refer to the
# same column in `data`.
# NOTE(review): "Energy_output" is assumed to be the intended prediction
# target -- confirm. If a different column is wanted, change it here only.
TARGET_COLUMN = "Energy_output"

X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Two-stage split with a fixed seed: first hold out 20% as the test set,
# then hold out 20% of what remains as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_training, X_validation, y_training, y_validation = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Linear baseline, fitted on the training portion only.
lin_reg = LinearRegression().fit(X_training, y_training)

# Validation-set metrics.
y_pred = lin_reg.predict(X_validation)
lin_mse = mean_squared_error(y_validation, y_pred)
r2 = r2_score(y_validation, y_pred)
print("Linear Regression MSE:", lin_mse)
print("Linear Regression R2:", r2)

# Test-set R2 for the same fitted model.
y_pred_test = lin_reg.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
print("Linear Regression Test R2:", r2_test)

Linear Regression MSE: 3130.354163711389
Linear Regression R2: 0.27757659453021555
Linear Regression Test R2: 0.599717894714203


In [5]:
from sklearn.ensemble import RandomForestRegressor
# Random forest baseline with fixed hyperparameters and a fixed seed,
# fitted on the training portion only.
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_reg.fit(X_training, y_training)

# Validation-set metrics.
y_pred = rf_reg.predict(X_validation)
rf_mse = mean_squared_error(y_validation, y_pred)
r2 = r2_score(y_validation, y_pred)
print("Random Forest MSE:", rf_mse)
print("Random Forest R2:", r2)

# Test-set R2. The predictions were previously computed and then dropped;
# score and report them the same way the linear-regression cell does so the
# two baselines can be compared. A separate name is used so the linear
# model's `r2_test` is not overwritten.
y_pred_test = rf_reg.predict(X_test)
rf_r2_test = r2_score(y_test, y_pred_test)
print("Random Forest Test R2:", rf_r2_test)

Random Forest MSE: 957.0299772066718
Random Forest R2: 0.7791365388347612


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
import numpy as np
from scipy.stats import randint
# Base estimator shared by both search objects below.
forest_reg = RandomForestRegressor(random_state=42)

# --- Randomized search (configured only) ---------------------------------
# Sampling distributions for a randomized search over forest settings.
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
# `random_search` is constructed but not fitted in this cell. If it is
# enabled, fit it on X_training / y_training: X_train still contains the
# validation rows, so tuning on it would make the validation score below
# optimistic.

# --- Grid search (the search that is actually run) -----------------------
# NOTE(review): the grid lists max_features values up to 8. If X_training
# has fewer columns than that, those candidates either fail to fit or are
# redundant depending on the scikit-learn version -- TODO confirm against
# X_training.shape[1] and trim the grid if needed.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_training, y_training)

# --- Evaluate the best grid-search model ----------------------------------
# The score is computed on the validation split, so the label says so
# (it previously read "Test Set").
best_model = grid_search.best_estimator_
y_pred_valid = best_model.predict(X_validation)
r2 = r2_score(y_validation, y_pred_valid)
print("R² Score on Validation Set (after tuning):", r2)

R² Score on Test Set (after tuning): 0.7847071685424034
