In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
%matplotlib inline
from scipy.integrate import simps

In [2]:
# Base temperature for the cooling-degree calculation: only the part of an
# hourly temperature above this value contributes to the `cdd` column.
# NOTE(review): presumably degrees Fahrenheit — confirm against the data source.
CDD_BASE_TEMP = 65

In [42]:
# Load the two input tables: `data_df` feeds the daily kW/temperature
# aggregates, `cdd_df` feeds the cooling-degree integration.
data_df, cdd_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'building_electricity_consumption.csv',
        'building_electricity_consumption_for_integration.csv',
    )
)

In [43]:
# Build a "<month>-<day>" string key (e.g. '8-25') used to group rows by day.
data_df['month_day'] = data_df['month'].astype('str').str.cat(data_df['day'].astype('str'), sep='-')

In [44]:
def _daily_stat(source_col, how, out_col):
    """Aggregate one `data_df` column per `month_day`.

    source_col: column of `data_df` to aggregate.
    how: name of the groupby reduction ('sum', 'mean', 'max', 'median').
    out_col: name given to the aggregated column in the result.

    Returns a frame with `month_day` as a regular column plus `out_col`.
    """
    per_day = data_df[['month_day', source_col]].groupby(by='month_day').agg(how)
    return per_day.reset_index().rename(columns={source_col: out_col})


# One frame per daily statistic: total kW and mean / max / median temperature.
kW_groupby = _daily_stat('kW', 'sum', 'kW')
mean_temp_groupby = _daily_stat('temp', 'mean', 'mean_temp')
max_temp_groupby = _daily_stat('temp', 'max', 'max_temp')
median_temp_groupby = _daily_stat('temp', 'median', 'median_temp')

# Start from one row per calendar day (in order of first appearance), then
# attach each statistic with an inner join on the day key.
daily_data_df = data_df[['month_day', 'month', 'day']].drop_duplicates().reset_index(drop=True)
for per_day_stat in (kW_groupby, mean_temp_groupby, max_temp_groupby, median_temp_groupby):
    daily_data_df = daily_data_df.merge(per_day_stat, on='month_day')

daily_data_df.head()

Unnamed: 0,month_day,month,day,kW,mean_temp,max_temp,median_temp
0,8-25,8,25,3134.3325,79.716667,88.0,82.0
1,8-26,8,26,3345.53,86.435417,93.9,87.55
2,8-27,8,27,3150.2325,84.6375,91.9,82.45
3,8-28,8,28,3063.2075,81.027083,90.0,80.1
4,8-29,8,29,3192.7775,82.6625,93.0,83.45


In [45]:
# Integer YYYYMMDD key: year as-is, month and day zero-padded to two digits.
ymd_parts = [cdd_df[part].astype('str') for part in ('year', 'month', 'day')]
cdd_df['date'] = (ymd_parts[0] + ymd_parts[1].str.zfill(2) + ymd_parts[2].str.zfill(2)).astype('int')

# kW is not needed for the cooling-degree integration.
cdd_df.drop(columns=['kW'], inplace=True)

# Take every hour-1 reading except the very first one and relabel it as
# hour 25; these rows are re-dated afterwards so they can close the preceding
# day's integration window.
hour_one_rows = cdd_df[cdd_df.hour == 1].reset_index(drop=True)
first_day = hour_one_rows.iloc[1:].reset_index(drop=True)
first_day['hour'] = 25

In [46]:
# Move every hour-25 row back one calendar day so it closes the previous day's
# integration window.  Real date arithmetic replaces the hand-written month
# tables: it rolls back correctly across month ends, leap days and any year
# boundary (the old loop only special-cased 2017-01-01), and it avoids chained
# `first_day.col.iloc[i] = ...` writes, which pandas does not guarantee will
# reach the frame (they are lost under copy-on-write).
previous_day = pd.to_datetime(first_day[['year', 'month', 'day']]) - pd.Timedelta(days=1)
first_day = first_day.assign(
    year=previous_day.dt.year,
    month=previous_day.dt.month,
    day=previous_day.dt.day,
)

# Keep `date` an integer YYYYMMDD, matching cdd_df.date, so the two frames can
# be concatenated and sorted on it without mixing str and int keys.
first_day['date'] = (
    first_day['year'] * 10000 + first_day['month'] * 100 + first_day['day']
).astype('int')

In [47]:
# Append the re-dated hour-25 rows, then order by date and hour so each day's
# samples are contiguous and in hour order for the integration step.
cdd_df = (
    pd.concat([cdd_df, first_day], ignore_index=True)
    .sort_values(by=['date', 'hour'])
)

In [48]:
# Same "<month>-<day>" key as in data_df, so the integrated values can be merged on it.
cdd_df['month_day'] = cdd_df['month'].astype('str').str.cat(cdd_df['day'].astype('str'), sep='-')

In [49]:
def _hourly_cdd(temp):
    """Return one sample's cooling-degree share: (temp - base) / 24, rounded to 4 places, floored at 0."""
    return max(round((temp - CDD_BASE_TEMP) / 24, 4), 0)


cdd_df['cdd'] = cdd_df['temp'].apply(_hourly_cdd)

In [50]:
# Integrate each day's cdd samples with Simpson's rule (default unit spacing),
# giving one value per month_day.  Relies on cdd_df already being in hour
# order within each day.
# NOTE(review): `simps` is the legacy alias of scipy.integrate.simpson and is
# no longer available in recent SciPy releases — confirm the pinned version.
cdd_groupby = cdd_df.groupby('month_day')[['cdd']].agg(simps)

In [51]:
# Attach the integrated cdd to the daily table; a left join keeps every day
# even if it has no cdd value.
daily_data_df = pd.merge(daily_data_df, cdd_groupby, how='left', on='month_day')

In [52]:
# Independent copy holding only the day key, the response (kW) and the
# candidate predictors used by the regressions.
regression_columns = ['month_day', 'kW', 'mean_temp', 'max_temp', 'median_temp', 'cdd']
reg_data_df = daily_data_df.loc[:, regression_columns].copy()

In [53]:
# Simple linear regression of daily kW on integrated cdd (intercept + slope).
y_cdd = reg_data_df['kW'].to_numpy()
X_cdd = sm.add_constant(reg_data_df['cdd'].to_numpy())  # prepend the intercept column
mod_cdd = sm.OLS(endog=y_cdd, exog=X_cdd, hasconst=True)
res_cdd = mod_cdd.fit()
res_cdd.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.618
Model:,OLS,Adj. R-squared:,0.616
Method:,Least Squares,F-statistic:,565.1
Date:,"Mon, 14 Oct 2019",Prob (F-statistic):,4.85e-75
Time:,11:30:19,Log-Likelihood:,-2506.5
No. Observations:,352,AIC:,5017.0
Df Residuals:,350,BIC:,5025.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1884.5105,20.379,92.473,0.000,1844.430,1924.591
x1,56.3588,2.371,23.773,0.000,51.696,61.021

0,1,2,3
Omnibus:,23.218,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.566
Skew:,-0.673,Prob(JB):,1.7e-06
Kurtosis:,2.983,Cond. No.,11.0


In [54]:
# Simple linear regression of daily kW on the daily mean temperature.
y_temp = reg_data_df['kW'].to_numpy()
X_temp = sm.add_constant(reg_data_df['mean_temp'].to_numpy())  # prepend the intercept column
mod_temp = sm.OLS(endog=y_temp, exog=X_temp, hasconst=True)
res_temp = mod_temp.fit()
res_temp.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.382
Model:,OLS,Adj. R-squared:,0.38
Method:,Least Squares,F-statistic:,216.2
Date:,"Mon, 14 Oct 2019",Prob (F-statistic):,1.89e-38
Time:,11:30:19,Log-Likelihood:,-2591.0
No. Observations:,352,AIC:,5186.0
Df Residuals:,350,BIC:,5194.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1042.8071,80.264,12.992,0.000,884.948,1200.667
x1,18.6526,1.269,14.704,0.000,16.158,21.147

0,1,2,3
Omnibus:,20.651,Durbin-Watson:,0.314
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.102
Skew:,-0.627,Prob(JB):,9.63e-06
Kurtosis:,3.024,Cond. No.,250.0


In [55]:
# Simple linear regression of daily kW on the daily maximum temperature.
# Note: reuses the X_temp / y_temp / mod_temp / res_temp names, replacing the
# previous temperature model.
y_temp = reg_data_df['kW'].to_numpy()
X_temp = sm.add_constant(reg_data_df['max_temp'].to_numpy())  # prepend the intercept column
mod_temp = sm.OLS(endog=y_temp, exog=X_temp, hasconst=True)
res_temp = mod_temp.fit()
res_temp.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.333
Model:,OLS,Adj. R-squared:,0.331
Method:,Least Squares,F-statistic:,174.8
Date:,"Mon, 14 Oct 2019",Prob (F-statistic):,1.2e-32
Time:,11:30:19,Log-Likelihood:,-2604.3
No. Observations:,352,AIC:,5213.0
Df Residuals:,350,BIC:,5220.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1033.0629,89.617,11.528,0.000,856.808,1209.318
x1,16.6096,1.256,13.221,0.000,14.139,19.080

0,1,2,3
Omnibus:,19.905,Durbin-Watson:,0.338
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.042
Skew:,-0.612,Prob(JB):,1.64e-05
Kurtosis:,3.07,Cond. No.,303.0


In [56]:
# Simple linear regression of daily kW on the daily median temperature.
# Note: reuses the X_temp / y_temp / mod_temp / res_temp names, replacing the
# previous temperature model.
y_temp = reg_data_df['kW'].to_numpy()
X_temp = sm.add_constant(reg_data_df['median_temp'].to_numpy())  # prepend the intercept column
mod_temp = sm.OLS(endog=y_temp, exog=X_temp, hasconst=True)
res_temp = mod_temp.fit()
res_temp.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.383
Model:,OLS,Adj. R-squared:,0.381
Method:,Least Squares,F-statistic:,217.2
Date:,"Mon, 14 Oct 2019",Prob (F-statistic):,1.4e-38
Time:,11:30:19,Log-Likelihood:,-2590.7
No. Observations:,352,AIC:,5185.0
Df Residuals:,350,BIC:,5193.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1027.8609,81.071,12.678,0.000,868.413,1187.309
x1,18.9221,1.284,14.738,0.000,16.397,21.447

0,1,2,3
Omnibus:,20.709,Durbin-Watson:,0.314
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.165
Skew:,-0.628,Prob(JB):,9.33e-06
Kurtosis:,3.03,Cond. No.,252.0


In [64]:
# Build the R-squared comparison table from fitted models instead of re-typing
# numbers from the printed summaries, so it cannot go stale when the data or
# the feature pipeline changes.  Each entry is the R-squared (rounded to three
# places, as in the summaries) of a one-predictor OLS of kW on that feature.
r_squared_by_feature = {
    label: round(
        sm.OLS(
            reg_data_df['kW'].values,
            sm.add_constant(reg_data_df[column].values),
            hasconst=True,
        ).fit().rsquared,
        3,
    )
    for label, column in [
        ('CDD', 'cdd'),
        ('Mean', 'mean_temp'),
        ('Median', 'median_temp'),
        ('Max', 'max_temp'),
    ]
}
# Single row labelled 'R-squared'; columns keep the CDD, Mean, Median, Max order.
df = pd.DataFrame([r_squared_by_feature], index=['R-squared'])

In [65]:
# Show the one-row R-squared comparison table.
df

Unnamed: 0,CDD,Mean,Median,Max
R-squared,0.618,0.382,0.383,0.333
