In [3]:
###import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Model Link for Hugging Face
https://huggingface.co/spaces/mrunalrinait/Household_Electricity_Unit_Prediction

In [5]:
df=pd.read_csv(r"C:\INNOMATICS RESEARCH LAB\7 ML\Projects\Regression Projects\household_electricity_demand.csv")
df.head()

Unnamed: 0,Household_ID,State,No_of_Residents,No_of_Appliances,AC_Units,Has_Solar,Monthly_Income,Region,Year,Monthly_Electricity_kWh
0,1,West Bengal,8,26,0,0,38247,Urban,2015,479.47
1,2,Gujarat,9,6,0,0,118679,Rural,2018,361.22
2,3,Uttar Pradesh,5,8,0,0,144381,Urban,2016,177.11
3,4,West Bengal,3,17,0,0,17237,Urban,2023,306.52
4,5,Karnataka,6,17,2,0,20967,Rural,2017,610.7


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Household_ID             20000 non-null  int64  
 1   State                    20000 non-null  object 
 2   No_of_Residents          20000 non-null  int64  
 3   No_of_Appliances         20000 non-null  int64  
 4   AC_Units                 20000 non-null  int64  
 5   Has_Solar                20000 non-null  int64  
 6   Monthly_Income           20000 non-null  int64  
 7   Region                   20000 non-null  object 
 8   Year                     20000 non-null  int64  
 9   Monthly_Electricity_kWh  20000 non-null  float64
dtypes: float64(1), int64(7), object(2)
memory usage: 1.5+ MB


In [9]:
df=df.drop(columns=["Household_ID","State"])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   No_of_Residents          20000 non-null  int64  
 1   No_of_Appliances         20000 non-null  int64  
 2   AC_Units                 20000 non-null  int64  
 3   Has_Solar                20000 non-null  int64  
 4   Monthly_Income           20000 non-null  int64  
 5   Region                   20000 non-null  object 
 6   Year                     20000 non-null  int64  
 7   Monthly_Electricity_kWh  20000 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 1.2+ MB


In [13]:
df.isnull().sum()

No_of_Residents            0
No_of_Appliances           0
AC_Units                   0
Has_Solar                  0
Monthly_Income             0
Region                     0
Year                       0
Monthly_Electricity_kWh    0
dtype: int64

In [15]:
df.duplicated().sum()

0

# EDA

In [18]:
df1=df.copy()

In [20]:
df1["Year"]=pd.to_datetime(df1["Year"])

In [22]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   No_of_Residents          20000 non-null  int64         
 1   No_of_Appliances         20000 non-null  int64         
 2   AC_Units                 20000 non-null  int64         
 3   Has_Solar                20000 non-null  int64         
 4   Monthly_Income           20000 non-null  int64         
 5   Region                   20000 non-null  object        
 6   Year                     20000 non-null  datetime64[ns]
 7   Monthly_Electricity_kWh  20000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 1.2+ MB


# Non-Visual Analysis

In [25]:
solar_impact = df1.groupby("Has_Solar")["Monthly_Electricity_kWh"].mean()
print(solar_impact)

Has_Solar
0    518.349231
1    413.266216
Name: Monthly_Electricity_kWh, dtype: float64


In [27]:
residents_consumption = df1.groupby("No_of_Residents")["Monthly_Electricity_kWh"].mean().sort_index()
print(residents_consumption)

No_of_Residents
2    398.253625
3    427.331513
4    453.775751
5    485.828009
6    517.132099
7    545.951672
8    576.756845
9    608.414596
Name: Monthly_Electricity_kWh, dtype: float64


In [29]:
ac_usage = df.groupby("AC_Units")["Monthly_Electricity_kWh"].mean()
print(ac_usage)

AC_Units
0    351.108917
1    504.872878
2    652.500112
Name: Monthly_Electricity_kWh, dtype: float64


In [31]:
region_usage = df.groupby("Region")["Monthly_Electricity_kWh"].mean().sort_values(ascending=False)
print(region_usage)

Region
Urban    519.286852
Rural    470.382085
Name: Monthly_Electricity_kWh, dtype: float64


In [33]:
df1["Year"] = df1["Year"].astype(str).str[-4:].astype(int)

In [35]:
yearly_trend = df1.groupby("Year")["Monthly_Electricity_kWh"].mean()
print(yearly_trend)

Year
2015    503.170943
2016    498.825040
2017    499.942448
2018    504.096024
2019    506.238854
2020    504.945478
2021    494.762043
2022    505.547621
2023    502.737168
Name: Monthly_Electricity_kWh, dtype: float64


# Visual Analysis

In [39]:
df

Unnamed: 0,No_of_Residents,No_of_Appliances,AC_Units,Has_Solar,Monthly_Income,Region,Year,Monthly_Electricity_kWh
0,8,26,0,0,38247,Urban,2015,479.47
1,9,6,0,0,118679,Rural,2018,361.22
2,5,8,0,0,144381,Urban,2016,177.11
3,3,17,0,0,17237,Urban,2023,306.52
4,6,17,2,0,20967,Rural,2017,610.70
...,...,...,...,...,...,...,...,...
19995,7,19,1,0,50654,Urban,2023,568.76
19996,5,7,0,0,33321,Urban,2019,252.97
19997,8,6,0,0,142536,Rural,2023,345.69
19998,7,11,2,0,89844,Rural,2022,575.60


In [41]:
y=df["Monthly_Electricity_kWh"]

In [43]:
X=df.drop(columns=["Monthly_Electricity_kWh","Year"])

In [45]:
X.head()

Unnamed: 0,No_of_Residents,No_of_Appliances,AC_Units,Has_Solar,Monthly_Income,Region
0,8,26,0,0,38247,Urban
1,9,6,0,0,118679,Rural
2,5,8,0,0,144381,Urban
3,3,17,0,0,17237,Urban
4,6,17,2,0,20967,Rural


In [47]:
from sklearn.preprocessing import OrdinalEncoder,StandardScaler, RobustScaler
# Column Transformer
from sklearn.compose import ColumnTransformer
transformer=ColumnTransformer(transformers=[('t1',OrdinalEncoder(),[0,1,2,3,5]),('t2',StandardScaler(),[4])],remainder="passthrough")
X_train_transform1=transformer.fit_transform(X)
print(X_train_transform1.shape)
X_train_transform2=pd.DataFrame(X_train_transform1,columns=X.columns)

(20000, 6)


In [49]:
from sklearn.feature_selection import mutual_info_regression
mutual_info=mutual_info_regression(X_train_transform2,y)
mutual_info=pd.Series(mutual_info)
mutual_info.index=X.columns
mutual_info.sort_values(ascending=False)

AC_Units            0.358413
No_of_Appliances    0.115968
No_of_Residents     0.103899
Has_Solar           0.027683
Monthly_Income      0.011870
Region              0.000000
dtype: float64

In [50]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

selector = SelectKBest(score_func=mutual_info_regression, k=6)
X_encoded = X_train_transform2
selector.fit(X_encoded, y)

In [53]:
selected_features=X_encoded.columns[selector.get_support()]
print(selected_features)

Index(['No_of_Residents', 'No_of_Appliances', 'AC_Units', 'Has_Solar',
       'Monthly_Income', 'Region'],
      dtype='object')


In [55]:
y=df["Monthly_Electricity_kWh"]

In [57]:
X=df.drop(columns=["Monthly_Electricity_kWh","Year", 'Region'])

In [59]:
X

Unnamed: 0,No_of_Residents,No_of_Appliances,AC_Units,Has_Solar,Monthly_Income
0,8,26,0,0,38247
1,9,6,0,0,118679
2,5,8,0,0,144381
3,3,17,0,0,17237
4,6,17,2,0,20967
...,...,...,...,...,...
19995,7,19,1,0,50654
19996,5,7,0,0,33321
19997,8,6,0,0,142536
19998,7,11,2,0,89844


In [61]:
X["Has_Solar"].value_counts()

Has_Solar
0    16937
1     3063
Name: count, dtype: int64

In [63]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test=train_test_split(X,y,test_size=0.25)

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 15, 20],
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Best Score: 0.8858001599777442


In [67]:
rf = RandomForestRegressor(
    n_estimators=200,      # number of trees
    max_depth=10,          # max depth of each tree
    min_samples_split=5,   
    min_samples_leaf=3,   
    max_features="sqrt",   # number of features to consider
    random_state=42,
    n_jobs=-1     
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Evaluation
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


MSE: 3218.425914795049
R² Score: 0.8906851620597134


In [69]:
# Let us ceate pipeline
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
pipe=Pipeline(steps=[("t1",ColumnTransformer(transformers=[("t1",RobustScaler(),[0,1,2,3,4])])),("t2",rf)])

In [71]:
# Fitting Data
pipe.fit(X_train,y_train)

In [73]:
# Make Predictions
y_pred=pipe.predict(X_test)

In [75]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
print ("Mean Squared Error is ", mean_squared_error(y_pred,y_test))
print("Mean Absolute Error is ",mean_absolute_error(y_pred,y_test))
print ("R2 Score is ",r2_score(y_pred,y_test))

Mean Squared Error is  3218.0523515512605
Mean Absolute Error is  45.42351920986972
R2 Score is  0.8723188196319918


In [77]:
import pickle

In [79]:
with open ("ML_Regress.pkl","wb") as f:
    pickle.dump(pipe,f)

In [81]:
with open ("ML_Regress.pkl","rb") as f:
    model=pickle.load(f)

In [83]:
model

In [85]:
model.predict([[8,26,0,0,45000]])

array([535.50304391])

In [87]:
model.predict([[6,17,2,0,12000]])

array([682.24196203])