In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('Datasets/wind_dataset.csv')
data = df.drop(columns=['year'])
data

Unnamed: 0,installed_cap,humidity,wind,temp,precipitation,Energy_output
0,0.0,75.73,6.7,28.45,151.21,0.0
1,4096.65,30.63,7.5,64.34,47.54,7426.46
2,0.0,73.22,7.0,25.08,170.79,0.0
3,0.0,73.87,3.9,26.14,145.83,0.0
4,0.0,50.41,12.0,29.51,31.22,0.0
5,0.0,45.64,10.5,25.55,55.65,0.0
6,0.0,49.28,4.9,29.61,67.04,0.0
7,0.0,40.7,8.6,28.91,20.88,0.0
8,0.0,71.46,4.1,28.53,118.42,0.0
9,11722.72,47.57,15.0,29.95,49.06,24794.5


In [7]:
X = data.drop(columns=["Energy_output"])
y = data["Energy_output"]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_training, X_validation, y_training, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
lin_reg = LinearRegression()
lin_reg.fit(X_training, y_training)
y_pred = lin_reg.predict(X_validation)
lin_mse = mean_squared_error(y_validation, y_pred)
r2 = r2_score(y_validation, y_pred)
print("Linear Regression MSE:", lin_mse)
print("Linear Regression R2:", r2)
y_pred_test = lin_reg.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
print("Linear Regression Test R2:", r2_test)

Linear Regression MSE: 119198.54989772978
Linear Regression R2: 0.9921273839586329
Linear Regression Test R2: 0.931680341082776


In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_training_scaled = scaler.fit_transform(X_training)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
y_training_scaled = scaler.fit_transform(y_training.values.reshape(-1, 1))
y_validation_scaled = scaler.transform(y_validation.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
import numpy as np
from scipy.stats import randint
forest_reg = RandomForestRegressor(n_estimators=23,random_state=42)
forest_reg.fit(X_training_scaled, y_training_scaled.ravel())
y_pred_valid = forest_reg.predict(X_validation_scaled)
r2 = r2_score(y_validation_scaled, y_pred_valid)
print("R² Score on Validation Set (before tuning):", r2)


# Step 2: Hyperparameter tuning
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# random_search.fit(X_train, y_train)

# print("Best Parameters:", random_search.best_params_)
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                            scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_training_scaled, y_training_scaled.ravel())
# Step 3: Evaluate best model
best_model = grid_search.best_estimator_
y_pred_valid = best_model.predict(X_validation_scaled)
r2 = r2_score(y_validation_scaled, y_pred_valid)
print("R² Score on Test Set (after tuning):", r2)

R² Score on Validation Set (before tuning): 0.8795585206903708
R² Score on Test Set (after tuning): 0.8312496244779959
