In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import joblib
# NOTE(review): this selects matplotlib's Tk-based GUI backend. Presumably
# figures then open in separate windows rather than rendering inline in the
# notebook, and a display plus Tk installation is required -- confirm this is
# intended before running on a headless or remote kernel.
matplotlib.use('TkAgg')
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
DATASET_PATH = "smart_home_energy_consumption.csv"
df = pd.read_csv(DATASET_PATH)

In [3]:
# Display basic info: first rows, column dtypes / non-null counts, and
# summary statistics of the numeric columns.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None" output.
df.info()
display(df.describe())

Unnamed: 0,Timestamp,Device ID,Device Type,Power Consumption (W),Room Location,Temperature (°C),Humidity (%),Usage Duration (minutes),Energy Cost ($),On/Off Status
0,2024-03-25 16:38:00,D-1860,Washing Machine,341.72,Garage,23.95,52.29,75,51.26,On
1,2024-05-01 16:03:00,D-9322,Laptop Charger,31.24,Bedroom,25.83,76.93,130,8.12,On
2,2024-06-12 17:56:00,D-5555,Smart Bulb,11.12,Garage,21.48,44.56,59,1.31,On
3,2024-04-13 19:43:00,D-2899,Heater,2028.47,Living Room,15.7,60.38,21,85.2,On
4,2024-08-30 16:17:00,D-9792,Washing Machine,732.82,Bedroom,20.78,30.8,130,190.53,On


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Timestamp                 50000 non-null  object 
 1   Device ID                 50000 non-null  object 
 2   Device Type               50000 non-null  object 
 3   Power Consumption (W)     50000 non-null  float64
 4   Room Location             50000 non-null  object 
 5   Temperature (°C)          50000 non-null  float64
 6   Humidity (%)              50000 non-null  float64
 7   Usage Duration (minutes)  50000 non-null  int64  
 8   Energy Cost ($)           50000 non-null  float64
 9   On/Off Status             50000 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.8+ MB


None

Unnamed: 0,Power Consumption (W),Temperature (°C),Humidity (%),Usage Duration (minutes),Energy Cost ($)
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,570.842,22.499879,54.993893,90.41814,103.3127
std,743.596529,4.321193,14.469001,51.659167,165.772992
min,2.5,15.0,30.0,1.0,0.01
25%,50.96,18.76,42.42,46.0,4.04
50%,134.725,22.5,54.98,90.0,22.605
75%,1013.8575,26.22,67.45,135.0,130.1525
max,2999.71,30.0,80.0,179.0,1041.14


In [4]:
# Count the null entries in each column and print the per-column totals
null_counts = df.isnull().sum()
print(null_counts)

Timestamp                   0
Device ID                   0
Device Type                 0
Power Consumption (W)       0
Room Location               0
Temperature (°C)            0
Humidity (%)                0
Usage Duration (minutes)    0
Energy Cost ($)             0
On/Off Status               0
dtype: int64


In [5]:
# Exploratory Data Analysis (EDA)
# Histogram (30 bins) with a KDE overlay of the power-consumption column.
power_consumption = df['Power Consumption (W)']
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(power_consumption, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Power Consumption')
plt.show()

In [6]:
# Filter numeric columns
numeric_df = df.select_dtypes(include=['number'])

# Missing values are deliberately left untouched: DataFrame.corr() already
# excludes NaN pairwise, whereas filling them with 0 would add artificial
# observations and bias the correlation coefficients.
corr_matrix = numeric_df.corr()

# Create the heatmap (annotated with the coefficient in every cell)
plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

In [7]:
# Additional EDA
# Box plot of power consumption grouped by device type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Device Type', y='Power Consumption (W)', ax=ax)
ax.set_title('Power Consumption by Device Type')
# Tilt the category labels on the x axis
plt.xticks(rotation=45)
plt.show()

In [8]:
# Scatter plot of power consumption against temperature, coloured by device type
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Temperature (°C)',
    y='Power Consumption (W)',
    hue='Device Type',
    ax=ax,
)
ax.set_title('Power Consumption vs Temperature')
plt.show()

In [9]:
# Convert categorical features
# NOTE: this cell overwrites the three columns of `df` in place, so it must
# run exactly once per load of the dataset.
categorical_columns = ['Device Type', 'Room Location', 'On/Off Status']

# Guard against re-execution: fitting the encoders on columns that already
# hold integer codes would replace the original class labels stored in the
# encoders (which are pickled later) and would map every status to NaN.
already_encoded = (
    df[categorical_columns].select_dtypes(include=['number']).columns.tolist()
)
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; re-run the "
        "dataset loading cell before encoding again."
    )

# Map the status first and validate it before anything in `df` is modified:
# Series.map() turns every label missing from the dict into NaN without
# raising, so unexpected labels are surfaced explicitly here.
status_codes = df['On/Off Status'].map({'On': 1, 'Off': 0})
unmapped_status = df.loc[status_codes.isna(), 'On/Off Status'].unique().tolist()
if unmapped_status:
    raise ValueError(
        f"Unexpected 'On/Off Status' values (expected 'On'/'Off'): {unmapped_status}"
    )

# Encode and save encoders
device_encoder = LabelEncoder()
df['Device Type'] = device_encoder.fit_transform(df['Device Type'])

room_encoder = LabelEncoder()
df['Room Location'] = room_encoder.fit_transform(df['Room Location'])

# Store encoders for later
encoders = {
    'device': device_encoder,
    'room': room_encoder
}

df['On/Off Status'] = status_codes

In [10]:
# Selecting Features and Target
# Model inputs, listed in the column order the models are trained on.
feature_columns = [
    'Device Type',
    'Power Consumption (W)',
    'Room Location',
    'Temperature (°C)',
    'Humidity (%)',
    'Usage Duration (minutes)',
    'On/Off Status',
]
# Column the regressors learn to predict.
target_column = 'Energy Cost ($)'

X = df[feature_columns]
y = df[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Feature Scaling
# NOTE: X_train / X_test are overwritten below with the scaler's output, so
# this cell must run exactly once after the train/test split.
# Guard against re-execution: running it again without re-running the split
# would fit the scaler on already-standardised values, and that mis-fitted
# scaler is the object pickled for inference later on.
if not (isinstance(X_train, pd.DataFrame) and isinstance(X_test, pd.DataFrame)):
    raise TypeError(
        "X_train/X_test are no longer DataFrames (already scaled?); "
        "re-run the train/test split cell before scaling again."
    )

scaler = StandardScaler()
# Fit on the training split only, then apply the same transform to the test
# split so no test-set statistics leak into the scaler.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Model Training - Random Forest
# Fit on the training split, predict the test split, then report R2 and RMSE.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_r2 = r2_score(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Random Forest R2 Score:", rf_r2)
print("Random Forest RMSE:", rf_rmse)

Random Forest R2 Score: 0.9998325498624763
Random Forest RMSE: 2.1169925665710756


In [14]:
# Model Training - Gradient Boosting
# Fit on the training split, predict the test split, then report R2 and RMSE.
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

gb_r2 = r2_score(y_test, y_pred_gb)
gb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_gb))
print("Gradient Boosting R2 Score:", gb_r2)
print("Gradient Boosting RMSE:", gb_rmse)

Gradient Boosting R2 Score: 0.9998191046309137
Gradient Boosting RMSE: 2.200342581199195


In [15]:
# Model Training - Ridge Regression
# Fit on the training split, predict the test split, then report R2 and RMSE.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

ridge_r2 = r2_score(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print("Ridge Regression R2 Score:", ridge_r2)
print("Ridge Regression RMSE:", ridge_rmse)

Ridge Regression R2 Score: 0.7725308573572106
Ridge Regression RMSE: 78.0257270516386


In [16]:
# Feature Importance (Random Forest)
# One bar per feature column, with the fitted forest's importance as its height.
rf_importances = rf_model.feature_importances_
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=X.columns, y=rf_importances, ax=ax)
ax.set_title('Feature Importance - Random Forest')
# Tilt the feature names on the x axis
plt.xticks(rotation=45)
plt.show()

In [17]:
# Persist the fitted models and preprocessing objects with joblib (imported
# in the first cell). Each model is written to its own file so that neither
# dump overwrites the other.

# Gradient Boosting is kept under a separate name for comparison.
joblib.dump(gb_model, "gb_model.pkl")

# Save the encoders
joblib.dump(device_encoder, "device_encoder.pkl")
joblib.dump(room_encoder, "room_encoder.pkl")

# Save the scaler
joblib.dump(scaler, "scaler.pkl")

# Save the best model (Random Forest: highest test R2 / lowest test RMSE of
# the three models evaluated above). Written last so that "testModel.pkl"
# holds the Random Forest and the cell output still reports that file.
joblib.dump(rf_model, "testModel.pkl")


['testModel.pkl']