In [221]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [222]:
# Read the raw CSV into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv(
    filepath_or_buffer='weather-energy-data-update.csv',
)


In [223]:
# Convert the 'Datetime' column to real datetime values. An explicit format
# (month/day/year hour:minute) is passed rather than relying on inference,
# so values that do not match it raise instead of being guessed at.
data['Datetime'] = pd.to_datetime(
    data['Datetime'],
    format='%m/%d/%Y %H:%M',
)

In [224]:
# Derive calendar 'year' and 'month' columns from the parsed timestamps;
# they are the grouping keys for the monthly aggregation further down.
datetime_parts = data['Datetime'].dt
data['year'] = datetime_parts.year
data['month'] = datetime_parts.month

In [225]:
# Impute missing weather readings with each column's own median.
# Only 'humidity' and 'wind_speed' are filled here; 'kWh' is left as-is.
for column in ('humidity', 'wind_speed'):
    data[column] = data[column].fillna(data[column].median())

# Collapse the readings to one row per (year, month):
#   kWh        -> total consumption for the month
#   humidity   -> monthly average
#   wind_speed -> monthly average
monthly_data = (
    data
    .groupby(['year', 'month'])
    .agg(
        kWh=('kWh', 'sum'),
        humidity=('humidity', 'mean'),
        wind_speed=('wind_speed', 'mean'),
    )
    .reset_index()
)

In [226]:
# Debugging aid: dump the aggregated frame under a heading so the monthly
# totals can be inspected by eye.
print(
    "\nAggregated Monthly Data (by Year and Month):",
    monthly_data,
    sep="\n",
)


Aggregated Monthly Data (by Year and Month):
    year  month       kWh   humidity  wind_speed
0   2018     11  134.4040  91.026944    3.950153
1   2018     12  128.3700  89.874552    3.928315
2   2019      1  153.4080  86.756272    3.777554
3   2019      2  135.1960  86.829489    4.045337
4   2019      3  158.7335  83.968504    4.586295
5   2019      4  118.2790  70.245417    3.248370
6   2019      5  139.7170  77.988665    3.569158
7   2019      6  131.8230  81.023148    3.167245
8   2019      7  147.0860  79.379928    3.098723
9   2019      8  151.3580  81.688172    2.991364
10  2019      9  152.6090  85.289815    3.423755
11  2019     10  167.3670  88.991039    3.051815
12  2019     11  141.2850  93.267500    3.602806
13  2019     12  152.5480  90.600627    4.190253
14  2020      1  175.2810  90.875672    4.721931
15  2020      2  147.7400  87.723851    5.322598
16  2020      3  152.8080  77.039382    4.157191
17  2020      4  127.3900  70.162037    3.454977
18  2020      5  161.73

In [227]:
# Model inputs and output, selected from the monthly aggregate.
# Note that 'year' is not among the predictors.
target = 'kWh'
features = ['month', 'humidity', 'wind_speed']

X, y = monthly_data[features], monthly_data[target]

In [228]:
# Standardize the features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Confirm whether fitting on the training portion only is wanted.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)  # same result as fit_transform(X)

In [229]:
# Hold out 20% of the monthly rows for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (seeded separately
# from the split).
model = RandomForestRegressor(
    n_estimators=100,
    random_state=37,
)
model.fit(X_train, y_train)

In [230]:
# Score the fitted model on the held-out rows: predict, take the mean
# squared error, then its square root so the error is in the target's units.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [231]:
# Persist the fitted model and the scaler into ONE pickle file as two
# consecutive records (model first, scaler second). A reader has to call
# pickle.load() twice on the same handle, in that order, to get both back.
with open('energy_model.pkl', 'wb') as f:
    for artifact in (model, scaler):
        pickle.dump(artifact, f)

# Report completion and the hold-out error computed earlier.
print("Model training complete and saved as 'energy_model.pkl'.")
print(f"Root Mean Squared Error: {rmse:.2f}")

Model training complete and saved as 'energy_model.pkl'.
Root Mean Squared Error: 25.88
