In [4]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib  # for saving the model
# Step 2: Load dataset
# The file is ';'-separated and marks missing readings with '?'. Declaring
# that marker via na_values lets pandas parse the measurement columns as
# numbers while reading, instead of loading them as strings and patching
# them afterwards.
data = pd.read_csv(
    "household_power_consumption.txt",
    sep=';',
    na_values='?',
    low_memory=False,
)

# Step 3: Data cleaning
# Drop every row that has at least one missing value.
data.dropna(inplace=True)

# Convert columns to numeric type
cols_to_convert = ['Global_active_power', 'Global_reactive_power', 'Voltage',
                   'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']

# Guard: this raises if a column still holds a non-numeric token, so bad input
# fails here rather than inside model training.
for col in cols_to_convert:
    data[col] = data[col].astype(float)

# Check the cleaned data
data.info()
# Step 4: Use only a small sample (for faster training)
# The fixed random_state keeps the subsample reproducible between runs.
data_sample = data.sample(frac=0.02, random_state=42)  # 2% of data (~40k rows)

# Features and target: predict Global_active_power from the other readings.
feature_cols = ['Global_reactive_power', 'Voltage', 'Global_intensity',
                'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
X = data_sample[feature_cols]
y = data_sample['Global_active_power']

# Step 5: Train-test split (80% train / 20% test)
# NOTE(review): rows carry Date/Time columns but are sampled and split at
# random, so the score below measures interpolation between neighbouring
# readings, not forecasting on unseen time ranges -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a smaller Random Forest model
model = RandomForestRegressor(
    n_estimators=30,   # fewer trees
    max_depth=10,      # limit tree depth
    random_state=42,
    n_jobs=-1          # use all CPU cores
)
model.fit(X_train, y_train)

# Step 7: Predictions on the held-out split
y_pred = model.predict(X_test)

# Step 8: Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("✅ Model Evaluation Results:")
print("Mean Absolute Error (MAE):", round(mae, 4))
print("Mean Squared Error (MSE):", round(mse, 4))
print("R² Score:", round(r2, 4))

# Step 9: Save model
# NOTE(review): joblib writes a pickle-based file, not HDF5, despite the
# ".h5" suffix. The name is kept so existing loaders keep working; load it
# back with joblib.load, and only from a trusted source.
joblib.dump(model, "power_model.h5")
print("💾 Model saved successfully as power_model.h5!")

 

<class 'pandas.core.frame.DataFrame'>
Index: 2049280 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 156.3+ MB
✅ Model Evaluation Results:
Mean Absolute Error (MAE): 0.0207
Mean Squared Error (MSE): 0.0013
R² Score: 0.9988
💾 Model saved successfully as power_model.h5!
