In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score


In [2]:
data = pd.read_csv(r'C:\Users\PC\advanced_machine_learning-3\AI-ML-Portfolio-Hub\Machine Learning\Supervised Learning\Regression\Datasets\train_energy_data.csv')

In [3]:
data.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063,76,10,29.84,Weekday,2713.95
1,Commercial,44372,66,45,16.72,Weekday,5744.99
2,Industrial,19255,37,17,14.3,Weekend,4101.24
3,Residential,13265,14,41,32.82,Weekday,3009.14
4,Commercial,13375,26,18,11.92,Weekday,3279.17


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   int64  
 2   Number of Occupants  1000 non-null   int64  
 3   Appliances Used      1000 non-null   int64  
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 54.8+ KB


In [5]:
data.describe()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,25462.388,48.372,25.606,22.61139,4166.25257
std,14294.554,29.061972,14.105166,7.139943,933.313064
min,560.0,1.0,1.0,10.05,1683.95
25%,13169.75,22.0,13.0,16.475,3509.4825
50%,25477.0,47.0,26.0,22.815,4175.73
75%,37446.25,73.25,38.0,28.85,4863.85
max,49997.0,99.0,49.0,34.99,6530.6


In [6]:
data.describe(include='object')

Unnamed: 0,Building Type,Day of Week
count,1000,1000
unique,3,2
top,Residential,Weekday
freq,347,507


In [7]:
data.isnull().sum()

Building Type          0
Square Footage         0
Number of Occupants    0
Appliances Used        0
Average Temperature    0
Day of Week            0
Energy Consumption     0
dtype: int64

In [8]:
data = data.dropna()

In [9]:
from sklearn.preprocessing import LabelEncoder
categorical_columns = ['Building Type', 'Day of Week']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [10]:
data.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,2,7063,76,10,29.84,0,2713.95
1,0,44372,66,45,16.72,0,5744.99
2,1,19255,37,17,14.3,1,4101.24
3,2,13265,14,41,32.82,0,3009.14
4,0,13375,26,18,11.92,0,3279.17


In [11]:
X = data.drop(columns=['Energy Consumption'])  # Features
y = data['Energy Consumption']  # Target variable

In [12]:
X.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week
0,2,7063,76,10,29.84,0
1,0,44372,66,45,16.72,0
2,1,19255,37,17,14.3,1
3,2,13265,14,41,32.82,0
4,0,13375,26,18,11.92,0


In [13]:
y.head()

0    2713.95
1    5744.99
2    4101.24
3    3009.14
4    3279.17
Name: Energy Consumption, dtype: float64

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
X

array([[ 1.19680781, -1.28780475,  0.95113382, -1.10695673,  1.0129251 ,
        -0.98609664],
       [-1.22343043,  1.32351621,  0.60686937,  1.37564519, -0.82554417,
        -0.98609664],
       [-0.01331131, -0.43446574, -0.39149753, -0.61043634, -1.16465207,
         1.01409939],
       ...,
       [-1.22343043,  0.98685605,  1.36425116, -0.39764189,  1.34082283,
        -0.98609664],
       [ 1.19680781, -1.1978654 ,  0.64129581,  0.80819332, -0.8591747 ,
         1.01409939],
       [-1.22343043, -0.67537723,  0.29703137, -1.03602524,  1.23152358,
         1.01409939]])

In [16]:
X.shape

(1000, 6)

In [17]:
y.shape

(1000,)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
(X_train.shape, X_test.shape),( y_train.shape, y_test.shape)


(((800, 6), (200, 6)), ((800,), (200,)))

In [20]:
poly_features_1 = PolynomialFeatures(degree=3, include_bias=False)
X_train_pol1 = poly_features_1.fit_transform(X_train)
X_test_poly1 = poly_features_1.transform(X_test)

In [21]:
poly_model = LinearRegression()
poly_model.fit(X_train_pol1, y_train)

In [22]:
y_pred = poly_model.predict(X_test_poly1)
r2 = r2_score(y_test, y_pred)
r2

0.9999999997466823

In [23]:
poly_features_2 = PolynomialFeatures(degree=2, include_bias=False)
X_train_pol2 = poly_features_2.fit_transform(X_train)
X_test_poly2 = poly_features_2.transform(X_test)

In [24]:
poly_model.fit(X_train_pol2, y_train)

In [25]:
y_pred_2 = poly_model.predict(X_test_poly2)
r2 = r2_score(y_test,y_pred_2)

In [26]:
r2

0.999999999762742