In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [19]:
# XGBoost is an optional dependency: record whether it can be imported so
# later cells can skip the XGBoost model instead of failing.
try:
    from xgboost import XGBRegressor
except ImportError:
    xgboost_available = False
else:
    xgboost_available = True


In [20]:
# Read the SCADA export and replace its header with short, code-friendly names.
data = pd.read_csv("/kaggle/input/wind-turbine-scada-dataset/T1.csv", sep=',')
data.columns = ['DateTime', 'LV_ActivePower', 'Wind_Speed', 'Theoretical_Power_Curve', 'Wind_Direction']

# Parse the timestamp column once (day month year hour:minute) and reuse the
# parsed series for every calendar feature.
timestamps = pd.to_datetime(data['DateTime'], format='%d %m %Y %H:%M')
data = data.assign(
    DateTime=timestamps,
    Hour=timestamps.dt.hour,
    Day=timestamps.dt.day,
    Month=timestamps.dt.month,
)

In [21]:

# Predictor columns and the column to be predicted.
features = ['Wind_Speed', 'Theoretical_Power_Curve', 'Wind_Direction', 'Hour', 'Day', 'Month']
target = 'LV_ActivePower'

X = data[features]
y = data[target]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
# NOTE(review): rows carry timestamps, so a shuffled split mixes time periods
# between train and test — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [22]:
# Candidate regressors, keyed by the display name used in the results report.
# Every estimator that accepts a seed is given random_state=42 (the same value
# the train/test split and the randomized search use) so that a fresh
# Restart & Run All reproduces the same fits, tuning outcome and metrics.
models = {
    "Linear Regression": LinearRegression(),
    # NOTE(review): SVR is sensitive to feature scale and the features are fed
    # in unscaled — consider wrapping it in a StandardScaler pipeline.
    "Support Vector Machine": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

In [23]:
# Register XGBoost next to the scikit-learn models only when its import succeeded.
if xgboost_available:
    models.update({"XGBoost": XGBRegressor()})

In [24]:
# Hyperparameter tuning using RandomizedSearchCV
rf_params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

gb_params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 10],
    "subsample": [0.7, 0.8, 0.9, 1.0]
}

if xgboost_available:
    xgb_params = {
        "n_estimators": [100, 200, 300, 400, 500],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7, 10],
        "subsample": [0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.8, 0.9, 1.0]
    }

In [25]:
# Function to perform hyperparameter tuning
def tune_model(model, params, n_iter=20, cv=3, scoring='neg_mean_absolute_error', random_state=42):
    """Run a randomized hyperparameter search and return the best estimator found.

    The search is fitted on ``X_train`` / ``y_train``, which are read from the
    notebook's global namespace, so the train/test split cell must run first.

    Parameters
    ----------
    model : estimator
        Regressor to tune; it is passed to ``RandomizedSearchCV`` as the base
        estimator.
    params : dict
        Maps parameter names to the candidate values to sample from.
    n_iter : int, default=20
        Number of parameter settings to sample.
    cv : int, default=3
        Cross-validation setting forwarded to ``RandomizedSearchCV``.
    scoring : str, default='neg_mean_absolute_error'
        Metric used to rank the sampled settings.
    random_state : int, default=42
        Seed for the parameter sampling, so the same candidates are drawn on
        every run.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    search = RandomizedSearchCV(
        model,
        params,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,  # evaluate candidates on all available cores
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [26]:
# Tune models: pair each tunable model with its search space, then replace the
# untuned estimator in `models` with the best one found by the search.
tuning_plan = [
    ("Random Forest", rf_params),
    ("Gradient Boosting", gb_params),
]
if xgboost_available:
    tuning_plan.append(("XGBoost", xgb_params))

for model_name, search_space in tuning_plan:
    print(f"Tuning {model_name}...")
    models[model_name] = tune_model(models[model_name], search_space)

Tuning Random Forest...
Tuning Gradient Boosting...
Tuning XGBoost...


In [27]:

# Evaluate all models
def _regression_metrics(y_true, y_hat):
    """Return MAE, MSE, RMSE and R² for one set of predictions as a dict."""
    mse = mean_squared_error(y_true, y_hat)
    return {
        "MAE": mean_absolute_error(y_true, y_hat),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R² Score": r2_score(y_true, y_hat),
    }


# Fit every model on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = _regression_metrics(y_test, y_pred)

In [28]:
# Display results: one indented line per metric under each model's name.
# Each row is (printed label, key in the metrics dict, number format).
report_rows = (
    ("Mean Absolute Error (MAE)", "MAE", ".2f"),
    ("Mean Squared Error (MSE)", "MSE", ".2f"),
    ("Root Mean Squared Error (RMSE)", "RMSE", ".2f"),
    ("R² Score", "R² Score", ".4f"),
)

print("\nModel Performance Comparison:")
for name, metrics in results.items():
    print(f"\n{name}:")
    for label, key, number_format in report_rows:
        print(f"  {label}: {metrics[key]:{number_format}}")


Model Performance Comparison:

Linear Regression:
  Mean Absolute Error (MAE): 196.68
  Mean Squared Error (MSE): 166183.13
  Root Mean Squared Error (RMSE): 407.66
  R² Score: 0.9026

Support Vector Machine:
  Mean Absolute Error (MAE): 164.75
  Mean Squared Error (MSE): 178382.97
  Root Mean Squared Error (RMSE): 422.35
  R² Score: 0.8955

Decision Tree:
  Mean Absolute Error (MAE): 93.37
  Mean Squared Error (MSE): 79658.12
  Root Mean Squared Error (RMSE): 282.24
  R² Score: 0.9533

Random Forest:
  Mean Absolute Error (MAE): 78.91
  Mean Squared Error (MSE): 45809.57
  Root Mean Squared Error (RMSE): 214.03
  R² Score: 0.9732

Gradient Boosting:
  Mean Absolute Error (MAE): 65.42
  Mean Squared Error (MSE): 34626.67
  Root Mean Squared Error (RMSE): 186.08
  R² Score: 0.9797

XGBoost:
  Mean Absolute Error (MAE): 66.60
  Mean Squared Error (MSE): 27967.34
  Root Mean Squared Error (RMSE): 167.23
  R² Score: 0.9836


In [30]:
import joblib

# Pick the model with the highest R² on the held-out split; on a tie the
# model that appears first in `results` wins.
best_model_name, _best_metrics = max(
    results.items(),
    key=lambda entry: entry[1]['R² Score'],
)
best_model = models[best_model_name]

In [31]:
# Show the selected estimator as the cell's output.
best_model

In [32]:
# Persist the winning estimator; the file name is derived from the model's display name.
model_path = f"{best_model_name}_best_model.pkl"
joblib.dump(best_model, model_path)
print(f"\n✅ Best model '{best_model_name}' saved as {model_path}")


✅ Best model 'XGBoost' saved as XGBoost_best_model.pkl
