In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


In [2]:
# Replace with your actual dataset path
df = pd.read_csv(r'L:\Guvi\Power\household_power_consumption.txt', sep=';', na_values='?', low_memory=False)

# Combine Date and Time
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H:%M:%S')
df = df.drop(columns=['Date', 'Time'])


Data Preprocessing

In [3]:
# Remove missing values
df.dropna(inplace=True)

# Set datetime as index
df.set_index('Datetime', inplace=True)

# Convert columns to numeric
df = df.apply(pd.to_numeric)

# Resample to hourly data (optional for smoothing)
df_hourly = df.resample('H').mean()

# Fill NA after resampling
df_hourly.fillna(method='ffill', inplace=True)


  df_hourly = df.resample('H').mean()
  df_hourly.fillna(method='ffill', inplace=True)


Feature Engineering

In [4]:
# Add time-based features
df_hourly['hour'] = df_hourly.index.hour
df_hourly['day'] = df_hourly.index.day
df_hourly['month'] = df_hourly.index.month
df_hourly['weekday'] = df_hourly.index.weekday

# Define features and target
features = df_hourly.drop('Global_active_power', axis=1)
target = df_hourly['Global_active_power']


Train test split and scaling

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Training

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Neural Network": MLPRegressor(random_state=42, max_iter=500)
}

results = {}

print("Training models...")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)

    # Compute RMSE manually using sqrt of MSE
    mse = mean_squared_error(y_test, predictions)
    rmse = sqrt(mse)

    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    results[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}
    print(f"{name} -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Training models...
Linear Regression -> RMSE: 0.0206, MAE: 0.0143, R2: 0.9995
Random Forest -> RMSE: 0.0184, MAE: 0.0113, R2: 0.9996
Gradient Boosting -> RMSE: 0.0196, MAE: 0.0132, R2: 0.9995
Neural Network -> RMSE: 0.0271, MAE: 0.0197, R2: 0.9991


In [8]:
best_model_name = min(results, key=lambda x: results[x]['RMSE'])
best_model = models[best_model_name]

print(f"\nBest model: {best_model_name}")



Best model: Random Forest


Plot

In [9]:
y_pred = best_model.predict(X_test_scaled)

plt.figure(figsize=(14, 5))
plt.plot(y_test.values[:200], label='Actual')
plt.plot(y_pred[:200], label='Predicted')
plt.title(f'{best_model_name} Predictions vs Actual')
plt.xlabel('Time Index')
plt.ylabel('Global Active Power (kilowatts)')
plt.legend()
plt.tight_layout()
plt.savefig('prediction_vs_actual.png')
plt.close()


In [10]:
if best_model_name in ["Random Forest", "Gradient Boosting"]:
    importances = best_model.feature_importances_
    feature_names = features.columns

    plt.figure(figsize=(10, 6))
    sns.barplot(x=importances, y=feature_names)
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()


In [11]:
print("\nDone! Plots saved as PNG files.")



Done! Plots saved as PNG files.
