In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
import joblib

# Default location of the raw dataset; pass ``data_path`` to main() to override.
DEFAULT_DATA_PATH = r"D:\Data_Science\VS_Code Projects\Power_Pulse\household_power_consumption.txt"


def main(data_path: str = DEFAULT_DATA_PATH) -> None:
    """Train, evaluate and save a random-forest model of ``Global_active_power``.

    Pipeline steps, in order:

    1. Read the ``;``-separated file at ``data_path`` (``?`` marks missing
       values) and merge the ``Date``/``Time`` columns into ``DateTime``.
    2. Fill missing values in every numeric column with that column's median.
    3. Derive ``Hour``, ``DayOfWeek`` and ``Month`` from ``DateTime``.
    4. Print ``df.describe()`` and save a pair plot to ``eda_pairplot.png``.
    5. Fit a 100-tree ``RandomForestRegressor`` on an 80/20 split
       (``random_state=42``) and print MAE, RMSE and R² on the held-out part.
    6. Save a feature-importance chart to ``feature_importance.png`` and the
       fitted model to ``power_consumption_model.pkl``.

    All output files are written with relative names, i.e. into the current
    working directory.

    Args:
        data_path: Path of the raw consumption file. Defaults to
            ``DEFAULT_DATA_PATH`` so existing ``main()`` calls keep working.
    """
    # Load and preprocess data
    df = pd.read_csv(
        data_path,
        sep=';',
        low_memory=False,
        na_values=['?']
    )

    # An explicit format avoids slow per-row format inference on a large file.
    # NOTE(review): dayfirst looks redundant once `format` is given — confirm
    # before removing.
    df['DateTime'] = pd.to_datetime(
        df['Date'] + ' ' + df['Time'],
        dayfirst=True,
        format='%d/%m/%Y %H:%M:%S'
    )
    df = df.drop(['Date', 'Time'], axis=1)

    # Handle missing values: median-fill every numeric column.
    # NOTE(review): the imputer is fit on the full frame, before the
    # train/test split, and the numeric columns include the target
    # 'Global_active_power'. Consider splitting first and dropping rows with
    # a missing target if leakage matters for the reported metrics.
    imputer = SimpleImputer(strategy='median')
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Feature engineering: calendar features derived from the timestamp.
    df['Hour'] = df['DateTime'].dt.hour
    df['DayOfWeek'] = df['DateTime'].dt.dayofweek
    df['Month'] = df['DateTime'].dt.month

    # EDA and save plots
    print("\nData Summary:")
    print(df.describe())

    # sns.pairplot is a figure-level function that creates its own figure and
    # leaves it current, so no plt.figure() call precedes it: an extra figure
    # would stay empty and remain open after the plt.close() below.
    sns.pairplot(df[['Global_active_power', 'Global_reactive_power',
                     'Voltage', 'Global_intensity']])
    plt.savefig('eda_pairplot.png')
    plt.close()

    # Modeling: every remaining column except the timestamp is a feature.
    X = df.drop(['DateTime', 'Global_active_power'], axis=1)
    y = df['Global_active_power']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out 20 %
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = model.score(X_test, y_test)

    # Format each metric with six decimal places
    print(f'\nMAE: {mae:.6f}')
    print(f'RMSE: {rmse:.6f}')
    print(f'R²: {r2:.6f}')

    # Feature importance visualization
    plt.figure(figsize=(10, 6))
    features = X.columns
    importances = model.feature_importances_
    # Sort descending so the most important feature comes first.
    indices = np.argsort(importances)[::-1]

    plt.title('Feature Importance')
    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    # barh draws index 0 at the bottom; invert so the top bar is the largest.
    plt.gca().invert_yaxis()
    plt.savefig('feature_importance.png')
    plt.close()

    # Save model
    joblib.dump(model, 'power_consumption_model.pkl')
    print("\nModel saved to power_consumption_model.pkl")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()



Data Summary:
       Global_active_power  Global_reactive_power       Voltage  \
count         2.075259e+06           2.075259e+06  2.075259e+06   
mean          1.085486e+00           1.234176e-01  2.408420e+02   
min           7.600000e-02           0.000000e+00  2.232000e+02   
25%           3.100000e-01           4.800000e-02  2.390200e+02   
50%           6.020000e-01           1.000000e-01  2.410100e+02   
75%           1.520000e+00           1.920000e-01  2.428600e+02   
max           1.112200e+01           1.390000e+00  2.541500e+02   
std           1.052065e+00           1.120452e-01  3.219699e+00   

       Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  \
count      2.075259e+06    2.075259e+06    2.075259e+06    2.075259e+06   
mean       4.602375e+00    1.107879e+00    1.282265e+00    6.390116e+00   
min        2.000000e-01    0.000000e+00    0.000000e+00    0.000000e+00   
25%        1.400000e+00    0.000000e+00    0.000000e+00    0.000000e+00   
50%   