In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


# Location of the CSV to analyse. The commented-out path is an alternative
# local file kept for reference.
# filepath = 'epcshort_postcodefull.csv'
filepath = 'https://media.githubusercontent.com/media/LondonEnergyMap/cleandata/master/pcmeter_epc/pcmeter10_epc.csv'

# Read the whole CSV into df_all, the unfiltered frame later cells select from.
df_all = pd.read_csv(filepath)

# Preview the first rows.
df_all.head()

In [None]:
# List the available columns.
df_all.columns

In [None]:
# Distinct values of the mains-gas flag ('Y' is used as a filter in later cells).
df_all.mainsgas.unique()

In [None]:
# Raise the column display limit to 500 so wide rows are shown in full, then
# look up the rows whose add2 field is '18 Artillery Lane'.
pd.set_option('display.max_columns', 500)
df_all[df_all.add2=='18 Artillery Lane']

In [None]:
# Rows whose transaction type is 'new dwelling'.
df_all[df_all.transact_type=='new dwelling']

In [None]:
# Rows whose wall description contains 'Average'; missing descriptions count as no match.
df_temp = df_all[df_all.wall.str.contains('Average', na=False)]

In [None]:
# Show up to the first 50 of those rows.
df_temp.head(50)

In [None]:
# clean up nrooms
sns.kdeplot(df_all.nrooms)
plt.xlabel('no. of rooms')
plt.savefig('nrooms.png')

In [None]:
n = 10
df = df_all[(df_all.nrooms<=n)]# & (df_all.nrooms>0)]
sns.kdeplot(df.nrooms)
plt.xlabel('no. of rooms')
plt.savefig('nrooms_cleaned.png')

In [None]:
# Floor-area distribution after the room-count filter.
ax = sns.kdeplot(df['tfa'])
ax.set_xlabel('m2')
ax.figure.savefig('tfa_afternrooms.png')

In [None]:
# Floor-area window: at least tfa_lower and at most 50 per allowed room
# (n is the room cap set in the room-count filtering cell).
tfa_upper = 50*n
tfa_lower = 20
# .copy() makes df an independent frame. Later cells add columns to df, and
# assigning into a filtered slice of another frame triggers pandas'
# SettingWithCopyWarning.
df = df[(df.tfa <= tfa_upper) & (df.tfa >= tfa_lower)].copy()
sns.kdeplot(df.tfa)
plt.xlabel('m2')
plt.savefig('tfa_cleaned.png')

In [None]:
# Rows and columns remaining after the filters applied so far.
df.shape

In [None]:
# Inspect the distinct wall descriptions.
df.wall.unique()

In [None]:
# Keep the first whitespace-separated word of each wall description.
df['wall_firstword'] = df.wall.str.split().str.get(0)

In [None]:
# Distinct first words found.
df.wall_firstword.unique()

In [None]:
# Age score derived from the first word of the wall description:
# 'Cavity' -> 2, 'System' / 'Timber' -> 3, anything else (including missing) -> 1.
wall_mapping = {'Cavity': 2, 'System': 3, 'Timber': 3}
# The default is filled in the same expression rather than with
# df.age.fillna(1, inplace=True): an in-place fill on the Series returned by
# df.age is chained assignment and is not guaranteed to write back into df
# (it never does under pandas Copy-on-Write).
df['age'] = df.wall_firstword.map(wall_mapping).fillna(1)
df.age.head(20)

In [None]:
# Override the age score with 4 for rows whose transaction type is 'new dwelling'.
df.loc[df.transact_type == 'new dwelling', 'age'] = 4

In [None]:
sns.regplot(x=df.age, y=df.gasmid, scatter_kws={'alpha':0.3})
plt.savefig('age_wallgas.png')

In [None]:
sns.regplot(x=df.age, y=df.elecmid, scatter_kws={'alpha':0.3})
plt.savefig('age_wallelec.png')

In [None]:
sns.boxplot(data=df, x='age', y='gasmid')

In [None]:
sns.boxplot(data=df, x='age', y='elecmid')

In [None]:
# Adjustments applied to a base of 6 sides, keyed by property type and built form.
prop_mapping = {
    'House': 0,
    'Flat': -2,
    'Bungalow': 0.5,
    'Maisonette': -2,
    'Park home': 0,
}
# NOTE(review): there is no key for a plain 'End-Terrace' built form. Any value
# absent from this dict maps to NaN and makes exposedsides NaN for that row —
# confirm against the values actually present in builtform.
built_mapping = {
    'Detached': 0,
    'Mid-Terrace': -2,
    'Semi-Detached': -1,
    'Enclosed Mid-Terrace': -2.5,
    'Enclosed End-Terrace': -1.5,
    '': 0,
}


# Look up both adjustments per row and add them to the base of 6.
df['propmap'] = df['prop_type'].map(prop_mapping)
df['builtmap'] = df['builtform'].map(built_mapping)
df['exposedsides'] = 6 + df['propmap'] + df['builtmap']

In [None]:
# Gas consumption grouped by exposed-sides score.
sns.boxplot(data=df, x='exposedsides', y='gasmid')

In [None]:
# Electricity consumption grouped by exposed-sides score.
sns.boxplot(data=df, x='exposedsides', y='elecmid')

In [None]:
m = 10
df10 = df[(df.gasmeters<=m) & (df.elecmeters<=m) & (df.mainsgas=='Y')]

In [None]:
df10.shape

In [None]:
sns.regplot(x=df10.age, y=df10.gasmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.age, y=df10.elecmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.exposedsides, y=df10.gasmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.exposedsides, y=df10.elecmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.tfa, y=df10.gasmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.tfa, y=df10.elecmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.tfa, y=df10.gasmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.nrooms, y=df10.gasmid, scatter_kws={'alpha':0.3})

In [None]:
sns.regplot(x=df10.nrooms, y=df10.elecmid, scatter_kws={'alpha':0.3})

In [None]:
# Tighten the meter cap to 6 and rebuild the modelling subset.
m = 6
df10 = df[(df.gasmeters<=m) & (df.elecmeters<=m) & (df.mainsgas=='Y')]
# Report the size of the subset that was just rebuilt. The cell previously
# displayed df.shape, which this filter cannot change.
df10.shape

In [None]:
# Hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# Every missing value in the frame (features and targets alike) becomes 0.
temp = df10.fillna(0)
x = temp.loc[:, ['tfa', 'nrooms', 'age', 'exposedsides']]
y = temp['gasmid']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, train_size=0.8, random_state=42
)
temp.shape

In [None]:
# Fit a linear regression for gas consumption and predict the held-out rows.

from sklearn.linear_model import LinearRegression

lrmodel = LinearRegression()
lrmodel.fit(x_train, y_train)
predictions = lrmodel.predict(x_test)
# R^2 on the held-out rows, so the reported score describes the same data as
# `predictions`. Scoring on the training rows overstates how well the model
# generalises.
gasscore = lrmodel.score(x_test, y_test)

In [None]:
# Predicted vs. held-out gas values, titled with the model score.
ax = plt.gca()
ax.scatter(y_test, predictions, c='orange', alpha=0.3)
ax.set_ylim(0, 50000)
ax.set_title('model score = %.1f%%' % (gasscore*100))
ax.set_ylabel('Predictions')
ax.set_xlabel('Test data')
ax.figure.savefig('gasmlmodel.png')

In [None]:
# Repeat the fit for electricity consumption, reusing the feature matrix x.
y = temp.elecmid

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=42)
lrmodel = LinearRegression()
lrmodel.fit(x_train, y_train)
predictions = lrmodel.predict(x_test)
# R^2 on the held-out rows, matching the test-set predictions plotted below.
# A training-set score would overstate out-of-sample performance.
elecscore = lrmodel.score(x_test, y_test)
plt.scatter(y_test, predictions, alpha=0.3)
plt.ylim(0, 16000)
plt.title('model score = %.1f%%' % (elecscore*100) )
plt.ylabel('Predictions')
plt.xlabel('Test data')
plt.savefig('elecmlmodel.png')

In [None]:
# Distinct room counts present in the modelling subset.
df10.nrooms.unique()

In [None]:
# Gas consumption by property type for the meter-capped subset.
porder = ["House", "Bungalow", "Maisonette", 'Flat']
# Work on a copy so df10 keeps its original gasmid values; divide by 1000 for the MWh axis.
dft10 = df10.copy()
dft10['gasmid'] = dft10['gasmid'] / 1000
sns.set_style("darkgrid")
ax = sns.boxplot(data=dft10, x='gasmid', y='prop_type', order=porder)
ax.set_xlabel('Annual gas consumption (MWh)')
ax.set_ylabel('')
ax.set_title('Based on 6 meters')
plt.tight_layout()
ax.figure.savefig('meters6_proptype.png')

In [None]:
# Same property-type comparison on df (no meter-count cap), leaving out rows
# whose property type contains 'Park'.
# na=False treats a missing prop_type as "no match", so the mask stays boolean
# and can be negated. .copy() gives an independent frame, so the unit
# conversion below does not assign into a slice of df.
dft = df[~df.prop_type.str.contains('Park', na=False)].copy()
dft['gasmid'] = dft.gasmid/1000
sns.set_style("darkgrid")
sns.boxplot(data=dft, x='gasmid', y='prop_type', order=porder)
plt.ylabel('')
plt.xlabel('Annual gas consumption (MWh)')
plt.title('All meters estimates')
plt.tight_layout()
plt.savefig('metersAll_proptype.png')

In [None]:
def makeplots(xinput, xlabel, figname):
    """Draw side-by-side gas and electricity regression plots and save them.

    The left panel plots ``df10.gasmid`` and the right panel ``df10.elecmid``
    against ``xinput``. The frame ``df10`` and the styling values ``gascolor``,
    ``eleccolor`` and ``fs`` are notebook-level names and must be defined
    before this function is called.

    Parameters
    ----------
    xinput
        Values for the shared x variable, aligned with ``df10``.
    xlabel
        x-axis label used on both panels.
    figname
        File name the figure is saved to.
    """
    # The y axis is labelled in MWh. Elsewhere in this notebook the raw
    # gasmid / elecmid values are labelled kWh and divided by 1000 for MWh
    # plots, so apply the same conversion here to make the data match the label.
    gas_mwh = df10.gasmid / 1000
    elec_mwh = df10.elecmid / 1000

    f, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (axes, y values, colour, corner tag) for each panel.
    panels = (
        (axes[0], gas_mwh, gascolor, 'Gas'),
        (axes[1], elec_mwh, eleccolor, 'Electricity'),
    )
    for ax, consumption, color, tag in panels:
        sns.regplot(x=xinput, y=consumption, scatter_kws={'alpha':0.3}, ax=ax, color=color)
        ax.set_xlabel(xlabel, fontsize=fs, labelpad=10)
        # Tag in the top-right corner, positioned in axes-relative coordinates.
        ax.text(0.95, 0.95, tag, fontsize=fs,
                horizontalalignment='right',
                verticalalignment='top',
                transform=ax.transAxes,
                bbox={'facecolor': color, 'alpha': 0.5, 'pad': 5})

    # Only the left panel carries the y label; the right one is left blank.
    axes[0].set_ylabel('Annual consumption (MWh)', fontsize=fs, labelpad=10)
    axes[1].set_ylabel('', fontsize=fs)

    plt.subplots_adjust(left=0.07, bottom=0.15, right=0.95, top=0.9, wspace=0.1, hspace=0.2)
    plt.savefig(figname)

In [None]:
# Shared styling for the paired gas / electricity figures.
gascolor = "coral"
eleccolor = "cornflowerblue"
fs = 14
ft = 9

# Parallel lists: column in df10, x-axis label, output file.
plot_cols = ['age', 'nrooms', 'tfa', 'exposedsides']
plot_xlabel = ['Building age based on wall type', 'No. of rooms', 'Floor area (sqm)', 'Exposed sides']
fignames = ['age.png', 'nroom.png', 'tfa.png', 'exposedsides.png']

# One two-panel figure per explanatory variable.
for col, label, outfile in zip(plot_cols, plot_xlabel, fignames):
    makeplots(df10[col], label, outfile)

In [None]:
# Fit gas and electricity models on one shared split and plot predicted vs.
# actual values side by side.

x = temp[['tfa', 'nrooms', 'age', 'exposedsides']]
yg = temp.gasmid
ye = temp.elecmid

# One call splits the features and both targets on the same rows. This gives
# the same split the two separate calls with an identical random_state produced.
x_train, x_test, yg_train, yg_test, ye_train, ye_test = train_test_split(
    x, yg, ye, train_size=0.8, test_size=0.2, random_state=42
)

# gas
glrmodel = LinearRegression()
glrmodel.fit(x_train, yg_train)
gpredictions = glrmodel.predict(x_test)
# R^2 on the held-out rows, so the score shown on the plot describes the
# plotted test predictions rather than the training fit.
gasscore = glrmodel.score(x_test, yg_test)

# elec
elrmodel = LinearRegression()
elrmodel.fit(x_train, ye_train)
epredictions = elrmodel.predict(x_test)
elecscore = elrmodel.score(x_test, ye_test)


f, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: gas.
axes[0].scatter(yg_test, gpredictions, c=gascolor, alpha=0.3)
axes[0].set_ylim(0, 50000)
axes[0].set_ylabel('Predictions (kWh)', fontsize=fs, labelpad=10)
axes[0].set_xlabel('Actual data (kWh)', fontsize=fs, labelpad=10)

# Right panel: electricity. The y label is left blank.
axes[1].scatter(ye_test, epredictions, c=eleccolor, alpha=0.3)
axes[1].set_ylim(0, 16000)
axes[1].set_ylabel('')
axes[1].set_xlabel('Actual data (kWh)', fontsize=fs, labelpad=10)

# Score annotations in the top-right corner of each panel (axes-relative coordinates).
axes[0].text(0.95, 0.95, 'gas: R$^{2}$ score = %.1f%%' % (gasscore*100), fontsize=fs,
             horizontalalignment='right',
             verticalalignment='top',
             transform=axes[0].transAxes,
             bbox={'facecolor': gascolor, 'alpha': 0.5, 'pad': 5})
axes[1].text(0.95, 0.95, 'electricity: R$^{2}$ score = %.1f%%' % (elecscore*100), fontsize=fs,
             horizontalalignment='right',
             verticalalignment='top',
             transform=axes[1].transAxes,
             bbox={'facecolor': eleccolor, 'alpha': 0.5, 'pad': 5})

plt.subplots_adjust(left=0.07, bottom=0.15, right=0.95, top=0.9, wspace=0.1, hspace=0.2)
plt.savefig('bothmlmodel.png')

