# Regression Analysis

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Import Dataset

In [2]:
# function to download or import data frame
def FetchData(file='',url=''):
    if url == '' and file == '': return;
    try:
        data = pd.read_excel(file)
        print('File Loaded!')
    except FileNotFoundError:
        try: 
            data = pd.read_csv(url)
            print('File Imported!')
        except OSError: 
            print('Network Connection Error!')
        finally:
            data.to_excel(file,index=False)
            print('File Saved!')
    return data

In [3]:
# define files and path
url            = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
folder         = r'Datasets'
file_name      = r'energy_dataset.xlsx'
file_directory =  folder + '/' + file_name

# Create file directory
os.makedirs(os.path.dirname(file_directory), exist_ok=True)

# import dataset
df = FetchData(file_directory,url)
df.info()

File Loaded!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  floa

# Describe Dataset

In [4]:
# get column names
col_names = {'T1':'Kitchen_Temp_Celsuis',
             'RH_1':'Kitchen_Humidity(%)',
            'T2':'Living_Room_Temp_Celsuis',
            'RH_2':'Living_Room_Humidity(%)',
            'T3':'Laundary_Room_Celsius_Temp',
            'RH_3':'Laundary_Room_Humidity(%)',
             'T4':'Office_Room_Celsius_Temp',
             'RH_4':'Office_Room_Humidity(%)',
             'T5':'Bathroom_Celsuis_Temp',
             'RH_5':'Bathroom_Humidity(%)',
             'T6':'Outside_Building_northside_Celsius_Temp',
             'RH_6':'Outside_Building_northside_Humidity(%)',
             'T7':'Ironing_Room_Celsuis_Temp',
             'RH_7':'Ironing_Room_Humidity(%)',
             'T8':'Teenager_Room2_Celsuis_Temp',
             'RH_8':'Teenager_Room2_Humidity(%)',
             'T9':'Parent_Room_Celsuis_Temp',
             'RH_9':'Parent_Room_Humidity(%)',
             'rv1':'Random_Variable1',
             'rv2':'Random_Variable2'
            }
# rename columns
df.rename(columns=col_names, inplace=True)
df

Unnamed: 0,date,Appliances,lights,Kitchen_Temp_Celsuis,Kitchen_Humidity(%),Living_Room_Temp_Celsuis,Living_Room_Humidity(%),Laundary_Room_Celsius_Temp,Laundary_Room_Humidity(%),Office_Room_Celsius_Temp,...,Parent_Room_Celsuis_Temp,Parent_Room_Humidity(%),T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Random_Variable1,Random_Variable2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


## Drop irrelevant columns

In [5]:
# drop light and date columns
df.drop(columns=['date','lights'],inplace=True)

## Check dataset correlation and drop one feature for two highly correlated features

In [6]:
# get correlations
df.corr()

Unnamed: 0,Appliances,Kitchen_Temp_Celsuis,Kitchen_Humidity(%),Living_Room_Temp_Celsuis,Living_Room_Humidity(%),Laundary_Room_Celsius_Temp,Laundary_Room_Humidity(%),Office_Room_Celsius_Temp,Office_Room_Humidity(%),Bathroom_Celsuis_Temp,...,Parent_Room_Celsuis_Temp,Parent_Room_Humidity(%),T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Random_Variable1,Random_Variable2
Appliances,1.0,0.055447,0.086031,0.120073,-0.060465,0.08506,0.036292,0.040281,0.016965,0.01976,...,0.01001,-0.051462,0.099155,-0.034885,-0.152282,0.087122,0.00023,0.015353,-0.011145,-0.011145
Kitchen_Temp_Celsuis,0.055447,1.0,0.164006,0.836834,-0.002509,0.892402,-0.02855,0.877001,0.097861,0.885247,...,0.844777,0.071756,0.682846,-0.150574,-0.345481,-0.087654,-0.07621,0.571309,-0.006203,-0.006203
Kitchen_Humidity(%),0.086031,0.164006,1.0,0.269839,0.797535,0.25323,0.844677,0.10618,0.880359,0.205797,...,0.115263,0.764001,0.340767,-0.293957,0.274126,0.204932,-0.021057,0.639106,-0.000699,-0.000699
Living_Room_Temp_Celsuis,0.120073,0.836834,0.269839,1.0,-0.16561,0.735245,0.121497,0.762066,0.231563,0.72055,...,0.675535,0.157346,0.792255,-0.133028,-0.505291,0.052495,-0.069721,0.582602,-0.011087,-0.011087
Living_Room_Humidity(%),-0.060465,-0.002509,0.797535,-0.16561,1.0,0.137319,0.678326,-0.047304,0.721435,0.110409,...,0.054544,0.676467,0.033674,-0.255646,0.584911,0.06919,-0.005368,0.499152,0.006275,0.006275
Laundary_Room_Celsius_Temp,0.08506,0.892402,0.25323,0.735245,0.137319,1.0,-0.011234,0.852778,0.122737,0.888169,...,0.901324,0.134602,0.699417,-0.189974,-0.281718,-0.100776,-0.10231,0.645886,-0.005194,-0.005194
Laundary_Room_Humidity(%),0.036292,-0.02855,0.844677,0.121497,0.678326,-0.011234,1.0,-0.140457,0.898978,-0.050062,...,-0.19527,0.833538,0.118207,-0.233274,0.356192,0.263188,0.017041,0.414387,-0.000477,-0.000477
Office_Room_Celsius_Temp,0.040281,0.877001,0.10618,0.762066,-0.047304,0.852778,-0.140457,1.0,-0.04865,0.871813,...,0.889439,-0.025549,0.663478,-0.075292,-0.388602,-0.185747,-0.104768,0.519471,-0.001815,-0.001815
Office_Room_Humidity(%),0.016965,0.097861,0.880359,0.231563,0.721435,0.122737,0.898978,-0.04865,1.0,0.091812,...,-0.044518,0.856591,0.293289,-0.250748,0.336813,0.300192,0.002636,0.616509,-0.001787,-0.001787
Bathroom_Celsuis_Temp,0.01976,0.885247,0.205797,0.72055,0.110409,0.888169,-0.050062,0.871813,0.091812,1.0,...,0.911055,0.072308,0.651321,-0.170999,-0.273953,-0.145011,-0.084164,0.588362,-0.00549,-0.00549


from the correlation result above it was noticed that the random variable (1 & 2) are highly correlated

In [7]:
# Create a correlation matrix
corr_df = df.corr().abs()

# select the upper triangle of correlation matrix
upper_matrix = corr_df.where(np.triu(np.ones(corr_df.shape),k=1).astype(np.bool))

# Find index of features columns with correlation greater than o.98
col_to_drop = [column for column in upper_matrix.columns if any(upper_matrix[column] > 0.98)]

# drop one of the highly correlated features
df.drop(df[col_to_drop],axis=1,inplace=True)

## Describe new dataframe

In [8]:
df.describe()

Unnamed: 0,Appliances,Kitchen_Temp_Celsuis,Kitchen_Humidity(%),Living_Room_Temp_Celsuis,Living_Room_Humidity(%),Laundary_Room_Celsius_Temp,Laundary_Room_Humidity(%),Office_Room_Celsius_Temp,Office_Room_Humidity(%),Bathroom_Celsuis_Temp,...,Teenager_Room2_Humidity(%),Parent_Room_Celsuis_Temp,Parent_Room_Humidity(%),T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Random_Variable1
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,...,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033
std,102.524891,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,...,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634
min,10.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,...,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322
25%,50.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,...,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889
50%,60.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,...,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653
75%,100.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,...,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769
max,1080.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,...,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653


In [9]:
# select a random feature in the dataset
simple_linear_reg_matrix = df[['Appliances','Living_Room_Temp_Celsuis']].sample(42,random_state=1)

simple_linear_reg_matrix

Unnamed: 0,Appliances,Living_Room_Temp_Celsuis
343,50,18.29
13136,60,20.945
19315,100,23.29
4706,130,18.1
5322,60,19.5
10555,60,19.79
15413,160,19.39
17327,50,23.1
10077,60,19.6
12759,70,17.823333


In [10]:
# plot regression
sns.regplot(x='Appliances',y='Living_Room_Temp_Celsuis',data=simple_linear_reg_matrix)

<matplotlib.axes._subplots.AxesSubplot at 0xac1264c0>

# Data Pre-processing

In [11]:
from sklearn import preprocessing

### 1. Normalize Data

In [12]:
# store pointer to object min max scaler
scaler = preprocessing.MinMaxScaler()

# transform data
transformed_df = scaler.fit_transform(df)

# normalize data 
normalized_df = pd.DataFrame(transformed_df, columns=df.columns)

normalized_df.head(3)

Unnamed: 0,Appliances,Kitchen_Temp_Celsuis,Kitchen_Humidity(%),Living_Room_Temp_Celsuis,Living_Room_Humidity(%),Laundary_Room_Celsius_Temp,Laundary_Room_Humidity(%),Office_Room_Celsius_Temp,Office_Room_Humidity(%),Bathroom_Celsuis_Temp,...,Teenager_Room2_Humidity(%),Parent_Room_Celsuis_Temp,Parent_Room_Humidity(%),T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Random_Variable1
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.661412,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.660155,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.655586,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848


### 2. Seperate Feature data from Response data

In [13]:
featured_df = normalized_df.drop(columns='Appliances')
response_df = normalized_df['Appliances']

# describe dataframes
print(f'Feature dataframe: {featured_df.shape}, \nResponse dataframe: {response_df.shape}')

Feature dataframe: (19735, 25), 
Response dataframe: (19735,)


# Split the datasets

In [14]:
from sklearn import model_selection as ms

In [15]:
# Split the featured and target dataframes into both train and test datasets
x_train,x_test,y_train,y_test = ms.train_test_split(featured_df,response_df,test_size=0.3,random_state=1)

# describe datasets
print(f'X train: {x_train.shape},\nX Test: {x_test.shape},\nY Train: {y_train.shape},\nY Test: {y_test.shape}')

X train: (13814, 25),
X Test: (5921, 25),
Y Train: (13814,),
Y Test: (5921,)


# Linear Regression Modelling

In [16]:
# import regression library
from sklearn import linear_model

### 1. Linear Regression

In [17]:
# get linear regression object
linear_reg_model = linear_model.LinearRegression()

# fit the model to the training test and obtain prediction
linear_predictions = linear_reg_model.fit(x_train,y_train).predict(x_test)

# display predictions from linear regression model
print(f'Linear Regression Predictions: {linear_predictions}')

Linear Regression Predictions: [0.06292756 0.09480349 0.12716414 ... 0.12401726 0.15356705 0.0977636 ]


### 2. Penelization Regression models/methods

##### a. Ridge Regression Model

In [18]:
# get ridge regression object
ridge_reg_model = linear_model.Ridge(alpha=0.5)

# fit the model to the training set and obtain predictions
ridge_predictions = ridge_reg_model.fit(x_train,y_train).predict(x_test)

# display predictions from reg model
print(f'Ridge Regression Predictions: {ridge_predictions}')

Ridge Regression Predictions: [0.06287326 0.09483339 0.12434684 ... 0.12157495 0.15181549 0.09597857]


##### b. Lasso Regression Model

In [19]:
# get lasso regression object
lasso_reg_model = linear_model.Lasso(alpha=0.001)

# fit the model to the training set and obtain predictions
lasso_predictions = lasso_reg_model.fit(x_train,y_train).predict(x_test)

# display predictions from reg model
print(f'Lasso Regression Predictions: {lasso_predictions}')

Lasso Regression Predictions: [0.07427652 0.08867662 0.08733824 ... 0.09384486 0.09190669 0.07555846]


In [20]:
# create a table function to compare datas
def get_dataframe(data, col_name, name='Features'):
    data_df = pd.DataFrame(data).reset_index()
    data_df.columns = [name,col_name]
    data_df[col_name].round(3)
    return data_df

In [21]:
# get data frame of models
actual_response_df = get_dataframe(y_test,'Actual_Response')
linear_model_df = get_dataframe(linear_predictions, 'Linear_Model_Predictions')
ridge_model_df = get_dataframe(ridge_predictions,'Ridge_Model_Predictions')
lasso_model_df = get_dataframe(lasso_predictions,'Lasso_Model_Predictions')

# get data
prediction_table = pd.merge(actual_response_df,linear_model_df, on='Features')
prediction_table = pd.merge(prediction_table,ridge_model_df, on='Features')
prediction_table = pd.merge(prediction_table,lasso_model_df, on='Features')

# display the table first 10 data
prediction_table.head(10)

Unnamed: 0,Features,Actual_Response,Linear_Model_Predictions,Ridge_Model_Predictions,Lasso_Model_Predictions
0,343,0.037383,0.057705,0.058849,0.07952
1,4706,0.11215,0.095543,0.097667,0.092212
2,5322,0.046729,0.084785,0.084483,0.077197
3,5304,0.046729,0.092342,0.092706,0.094428
4,811,0.084112,0.168086,0.166303,0.075115
5,5531,0.028037,0.045177,0.045009,0.079275
6,4106,0.037383,0.075787,0.075215,0.088463
7,168,0.046729,0.127319,0.12668,0.083248
8,3315,0.102804,0.080866,0.080349,0.072082
9,4601,0.102804,0.093232,0.095372,0.101482


In [22]:
# plot regplot for the linear model
sns.regplot(x='Actual_Response',y='Linear_Model_Predictions',data=prediction_table)


<matplotlib.axes._subplots.AxesSubplot at 0xac1264c0>

In [23]:
# plot regplot for the ridge regression model
sns.regplot(x='Actual_Response',y='Ridge_Model_Predictions',data=prediction_table)


<matplotlib.axes._subplots.AxesSubplot at 0xac1264c0>

In [24]:
# plot regplot for the lasso regression model
sns.regplot(x='Actual_Response',y='Lasso_Model_Predictions',data=prediction_table)


<matplotlib.axes._subplots.AxesSubplot at 0xac1264c0>

# Measuring Perfromance of Models

In [25]:
# get metrics objects from sklearn library
from sklearn import metrics

In [26]:
# define a function to calculate the MAE, RSS, RMSE and R-Squared error of each model
def performance(model_predictions,test):
    # get the mean absolute error (MAE)
    mae = metrics.mean_absolute_error(test, model_predictions)
    # get the Residual Sum of Squares (RSS)
    rss = np.sum(np.square(test - model_predictions))
    # get the Root Mean Squared Error (RMSE)
    rmse = np.sqrt(metrics.mean_squared_error(test, model_predictions))
    # get the R-Squared 
    rsquared = metrics.r2_score(test, model_predictions)
    return [mae.round(3), rss.round(3), rmse.round(3), rsquared.round(3)]    

##### Regression performance table

In [27]:
# get linear performance
linear_performance_df = get_dataframe(performance(linear_predictions,y_test),'Linear_Model_Performance')
ridge_performance_df = get_dataframe(performance(ridge_predictions,y_test),'Ridge_Model_Performance')
lasso_performance_df = get_dataframe(performance(lasso_predictions,y_test),'Lasso_Model_Performance')

# get performance table
performance_table = pd.merge(linear_performance_df,ridge_performance_df, on='Features')
performance_table = pd.merge(performance_table,lasso_model_df)  
performance_table.Features= ['MAE','RSS','RMSE','R-Squared']

# display the table first 10 data
performance_table.head() 

Unnamed: 0,Features,Linear_Model_Performance,Ridge_Model_Performance,Lasso_Model_Predictions
0,MAE,0.05,0.05,0.074277
1,RSS,46.097,46.126,0.088677
2,RMSE,0.088,0.088,0.087338
3,R-Squared,0.156,0.156,0.075017


In [28]:
# get weights
def get_weight_df(model, feat, colName):
    weight = pd.Series(model.coef_, feat.columns).sort_values()
    weight_df = pd.DataFrame(weight).reset_index()
    weight_df.columns = ['Features',colName]
    weight_df[colName].round(3)
    return weight_df

In [29]:
# create weight 
linear_reg_weight_df = get_weight_df(linear_reg_model,x_train,'Linear_model_weight')
ridge_reg_weight_df = get_weight_df(ridge_reg_model,x_train,'Ridge_Model_Weight')
lasso_reg_weight_df = get_weight_df(lasso_reg_model,x_train,'Lasso_Reg_Model')
                                    
                                    
weight_df = pd.merge(linear_reg_weight_df,ridge_reg_weight_df, on=['Features'])
weight_df = pd.merge(weight_df,lasso_reg_weight_df,on=['Features'])
                     
weight_df.head()                     
                     
                     

Unnamed: 0,Features,Linear_model_weight,Ridge_Model_Weight,Lasso_Reg_Model
0,Living_Room_Humidity(%),-0.458176,-0.402797,-0.0
1,T_out,-0.330375,-0.257457,0.0
2,Living_Room_Temp_Celsuis,-0.249801,-0.20722,8.4e-05
3,Parent_Room_Celsuis_Temp,-0.204718,-0.202972,-0.0
4,Teenager_Room2_Humidity(%),-0.156534,-0.155679,-0.0
