In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the raw usage and weather tables, parsing their timestamp columns on the way in
usage_data = pd.read_csv('combined_file.csv').assign(
    StartDate=lambda frame: pd.to_datetime(frame['StartDate'])
)
weather_data = pd.read_csv('weather_file.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# As-of join of usage onto weather by timestamp (both sides must be sorted on
# their key), then keep only rows with a known target and derive calendar
# features from the usage timestamp.
merged_data = (
    pd.merge_asof(
        usage_data.sort_values('StartDate'),
        weather_data.sort_values('datetime'),
        left_on='StartDate',
        right_on='datetime',
    )
    .dropna(subset=['Value (kWh)'])
    .assign(
        hour=lambda frame: frame['StartDate'].dt.hour,
        day=lambda frame: frame['StartDate'].dt.day,
        month=lambda frame: frame['StartDate'].dt.month,
    )
)

# Model inputs: four weather readings plus the three calendar features
features = ['temp', 'humidity', 'windgust', 'solarradiation', 'hour', 'day', 'month']
X = merged_data.loc[:, features]
y = merged_data['Value (kWh)']  # electricity usage is the prediction target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the label shown in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Fit every candidate on the training split and score it on the held-out split
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append(dict(Model=name, MAE=mae, MSE=mse))

# One row per model, then print the comparison table
results_df = pd.DataFrame(results)
print(results_df)


               Model       MAE       MSE
0  Linear Regression  0.822356  1.219067
1      Decision Tree  0.756143  1.427761
2      Random Forest  0.579778  0.735226
3  Gradient Boosting  0.684640  0.898106


Tuning RandomForestRegressor to find the best hyperparameters

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space for the random forest.
# NOTE: max_features='auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3, so every candidate that sampled it fails to fit on current versions.
# For a forest *regressor* 'auto' meant "use all features", which is spelled
# 1.0 and is accepted by both old and new releases.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],              # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50],                # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],                        # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],                          # Minimum samples required at a leaf node
    'max_features': [1.0, 'sqrt'],                          # Features considered per split: all of them, or sqrt(n_features)
    'bootstrap': [True, False]                              # Whether to use bootstrap samples when building trees
}

# Base estimator; the seed keeps individual fits reproducible
rf = RandomForestRegressor(random_state=42)

# Sample 100 configurations from the space and score each with 5-fold
# cross-validation. The scorer is *negative* MAE, so higher is better.
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                                   n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1, scoring='neg_mean_absolute_error')

# Run the search on the training split only (the test split stays untouched)
random_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated MAE
print(f"Best Parameters: {random_search.best_params_}")
best_mae = -random_search.best_score_  # Flip the sign back to a positive MAE
print(f"Best Mean Absolute Error (Cross-Validation): {best_mae}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
Best Mean Absolute Error (Cross-Validation): 0.6013189570849058


In [3]:
# Rebuild the forest with the configuration the random search reported as best
best_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=50,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)

# Train on the training split, then predict the held-out test split
best_rf.fit(X_train, y_train)
y_pred_best_rf = best_rf.predict(X_test)

# Score the test-set predictions with both error metrics
mae_best_rf, mse_best_rf = (
    metric(y_test, y_pred_best_rf)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f'Mean Absolute Error (Test Set): {mae_best_rf}')
print(f'Mean Squared Error (Test Set): {mse_best_rf}')

Mean Absolute Error (Test Set): 0.5647301803482586
Mean Squared Error (Test Set): 0.6943068087213307


Model to Predict Electric Bill

In [4]:
def predict_future_bill(model, data, features, electricity_rate=0.179):
    """
    Estimate total electricity usage and the flat-rate bill for a period.

    Parameters:
    - model: A fitted estimator exposing ``predict``; it is called on ``data[features]``.
    - data: DataFrame with one row per interval to predict. It must contain
      every column named in ``features``; extra columns are ignored.
    - features: List of column names, in the order the model was trained on.
    - electricity_rate: Price per kWh (default is $0.179).

    Returns:
    - total_kwh: Sum of the predicted usage over all rows of ``data``.
    - estimated_bill: ``total_kwh * electricity_rate``.

    An empty ``data`` (for example a month filter that matched no rows) yields
    ``(0.0, 0.0)`` instead of an error from the model.
    """

    # Keep only the model's input columns, in training order
    X_period = data[features]

    if len(X_period) == 0:
        # Estimators such as scikit-learn's reject zero-row input, so an empty
        # period is treated as zero usage rather than passed to predict().
        total_kwh = 0.0
    else:
        # One prediction per row, summed into the period total
        total_kwh = model.predict(X_period).sum()

    # Flat-rate pricing: every predicted kWh costs the same
    estimated_bill = total_kwh * electricity_rate

    print(f"Total Predicted kWh: {total_kwh:.2f}")
    print(f"Estimated Bill: ${estimated_bill:.2f}")

    return total_kwh, estimated_bill


In [14]:
# Filter the merged data down to the September rows (month == 9)
september_data = merged_data[merged_data['StartDate'].dt.month == 9]

# Estimate the flat-rate September bill at $0.12 per kWh
total_kwh, estimated_bill = predict_future_bill(best_rf, september_data, features, electricity_rate=0.12)


Total Predicted kWh: 1626.85
Estimated Bill: $195.22


Model to Predict the Bill with Free Electricity from 8 PM to 8 AM

In [8]:
def predict_time_based_bill(model, data, features, electricity_rate=0.179, free_hours_start=20, free_hours_end=8):
    """
    Estimate the bill for a time-of-use plan with a daily window of free electricity.

    Parameters:
    - model: A fitted estimator exposing ``predict``; it is called on ``data[features]``.
    - data: DataFrame with one row per interval to predict. It must contain
      every column in ``features`` plus an ``hour`` column (0-23), which is
      used to classify each row as free or paid. The input is not modified.
    - features: List of column names, in the order the model was trained on.
    - electricity_rate: The rate per kWh during paid hours (default is $0.179).
    - free_hours_start: First free hour, inclusive (default 20, i.e. 8 PM).
    - free_hours_end: First paid hour after the free window, exclusive
      (default 8, i.e. 8 AM).

    The window may stay within one day (start < end, e.g. 1 to 5) or wrap past
    midnight (start > end, e.g. the default 20 to 8). When start == end every
    hour is treated as free.

    Returns:
    - total_kwh_paid: The total predicted electricity usage during paid hours.
    - total_kwh_free: The total predicted electricity usage during free hours.
    - estimated_bill: ``total_kwh_paid * electricity_rate``.

    An empty ``data`` yields zeros instead of an error from the model.
    """

    # Work on a copy so the caller's DataFrame (often a filtered slice) is untouched
    data_copy = data.copy()

    # Keep only the model's input columns, in training order
    X_period = data_copy[features]

    if len(X_period) == 0:
        # Estimators such as scikit-learn's reject zero-row input; an empty
        # period simply has no usage to predict.
        data_copy['predicted_usage'] = 0.0
    else:
        # One prediction per row, stored alongside the hour it belongs to
        data_copy['predicted_usage'] = model.predict(X_period)

    hours = data_copy['hour']
    if free_hours_start < free_hours_end:
        # Same-day window: the hour must satisfy both bounds
        is_free_hour = (hours >= free_hours_start) & (hours < free_hours_end)
    else:
        # Window wraps past midnight: late-evening OR early-morning hours
        is_free_hour = (hours >= free_hours_start) | (hours < free_hours_end)

    # Split the predicted usage into paid and free totals
    total_kwh_paid = data_copy.loc[~is_free_hour, 'predicted_usage'].sum()
    total_kwh_free = data_copy.loc[is_free_hour, 'predicted_usage'].sum()

    # Only usage outside the free window is billed
    estimated_bill = total_kwh_paid * electricity_rate

    print(f"Total kWh during paid hours: {total_kwh_paid:.2f}")
    print(f"Total kWh during free hours: {total_kwh_free:.2f}")
    print(f"Estimated Bill: ${estimated_bill:.2f}")

    return total_kwh_paid, total_kwh_free, estimated_bill


In [12]:
# Keep only the September rows (change the month number to analyse another period)
september_data = merged_data.loc[merged_data['StartDate'].dt.month.eq(9)]

# Estimate the September bill when usage between 8 PM and 8 AM is free
# and the remaining hours cost $0.29 per kWh
total_kwh_paid, total_kwh_free, estimated_bill = predict_time_based_bill(
    best_rf,
    september_data,
    features,
    free_hours_start=20,
    free_hours_end=8,
    electricity_rate=0.29,
)


Total kWh during paid hours: 766.60
Total kWh during free hours: 860.25
Estimated Bill: $222.31


Test Multiple Advanced Models:

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Rebuild the design matrix and target from 'merged_data' / 'features' defined earlier
X = merged_data.loc[:, features]
y = merged_data['Value (kWh)']  # electricity usage is the prediction target

# Same 80/20 split and seed as the first comparison
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors; the three boosted models share the same size,
# learning rate and depth
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42),
}

# Fit every candidate on the training split and score it on the held-out split
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append(dict(Model=name, MAE=mae, MSE=mse))

# One row per model, then print the comparison table
results_df = pd.DataFrame(results)
print(results_df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 852
[LightGBM] [Info] Number of data points in the train set: 6431, number of used features: 7
[LightGBM] [Info] Start training from score 1.808701
               Model       MAE       MSE
0  Linear Regression  0.822356  1.219067
1      Random Forest  0.576801  0.725145
2            XGBoost  0.627159  0.804574
3           LightGBM  0.633043  0.805378
4  Gradient Boosting  0.625085  0.798247
