some resources:
1. https://financetrain.com/multivariate-linear-regression-in-python-with-scikit-learn-library


In [2]:
import pandas as pd # Perform EDA and Creating Data frame
from sklearn.model_selection import train_test_split # splitting data for training and testing

from sklearn.impute import SimpleImputer # HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import MinMaxScaler # scaling data - Preprocessing
## Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error # Checking accuracy of Model 

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
path_to_data = '/home/omkar/Omkar Pawar/Data Science/Projects/Energy_Efficiency/notebooks/data /ENB2012_data.xlsx'

df = pd.read_excel(path_to_data)
df1 = df 
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [4]:
# changing the column names: 
new_names = {"X1": "Relative Compactness","X2": "Surface Area", "X3": "Wall Area", "X4": "Roof Area", "X5": "Overall Height", 
             "X6": "Orientation", "X7": "Glazing Area", "X8": "Glazing Area Distribution", "Y1": "Heating Load", "Y2": "Cooling Load"}

df = df.rename(columns=new_names)

df.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [5]:
# separating categorical and numerical variables:

numerical_col = df.drop(labels="Orientation", axis=1)
categorical_col = df['Orientation']

In [6]:
# separate dataframe into dependent and independent variables: 

X = df.drop(columns=["Heating Load", "Cooling Load"], axis=1)
y = df[['Heating Load', 'Cooling Load']]

In [7]:
X.head() # independent variable

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0


In [8]:
y.head() # dependent variable

Unnamed: 0,Heating Load,Cooling Load
0,15.55,21.33
1,15.55,21.33
2,15.55,21.33
3,15.55,21.33
4,20.84,28.28


In [9]:
# Splitting the data into Training and Testing data:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)


In [10]:
X_train.shape

(614, 8)

In [11]:
X_test.shape

(154, 8)

Data is split into train and test sets.

In [12]:
preprocessor = MinMaxScaler() # creating a scaler object

In [13]:
# train data
X_train_scaled = preprocessor.fit_transform(X_train)

In [14]:
# test data 
X_test_scaled = preprocessor.transform(X_test)

In [15]:
reg_model = LinearRegression()

In [16]:
reg_model.fit(X_train_scaled, y_train)

In [17]:
#train data 
training_data_prediction = reg_model.predict(X_train_scaled)
r2_train = r2_score(y_train, training_data_prediction) # comparing training and prediction values.
r2_train 

0.8958100215834952

In [18]:
#test data
test_data_prediction = reg_model.predict(X_test_scaled)
r2_test = r2_score(y_test, test_data_prediction) 
r2_test


0.9246614241787632

In [19]:
# predicting for individual row from user:

In [23]:
import numpy as np

user_data = np.array([[0.97, 445.4, 295.0, 130, 8.0, 2, 0.0, 0]])
user_data_scaled = preprocessor.transform(user_data) # scaled 
user_data_pred = reg_model.predict(user_data_scaled)
print(user_data_pred)


[[-1.17476405e+14  5.97070931e+13]]




In [21]:
user_data = [[0.97, 445.4, 295.0, 130, 8.0, 2, 0.0, 0]]
user_data_scaled = preprocessor.transform(user_data)
print(user_data_scaled)
user_data_pred = reg_model.predict(user_data_scaled)
print(user_data_pred) # output: [[-2.57422890e+14 -4.61205355e+13]]

[[ 0.97222222 -0.23503401  0.29154519  0.17913832  1.28571429  0.
   0.          0.        ]]
[[-1.17476405e+14  5.97070931e+13]]




In [22]:
df.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28
