In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model

Merge all the data

In [12]:
# Load each building's CSV and tag every column with the building number so
# the three column sets remain distinguishable once they sit side by side
building1 = pd.read_csv("../data/schemas/warm_up/Building_1.csv", sep=",").add_suffix("_1")
building2 = pd.read_csv("../data/schemas/warm_up/Building_2.csv", sep=",").add_suffix("_2")
building3 = pd.read_csv("../data/schemas/warm_up/Building_3.csv", sep=",").add_suffix("_3")

# Load the tables shared by all buildings (their columns keep their names)
weather = pd.read_csv("../data/schemas/warm_up/weather.csv", sep=",")
carbon_intensity = pd.read_csv("../data/schemas/warm_up/carbon_intensity.csv", sep=",")

# Concatenate the five tables column-wise into one wide frame and preview it
data = [building1, building2, building3, weather, carbon_intensity]
merged = pd.concat(data, axis="columns")
merged.head()

Unnamed: 0,Month_1,Hour_1,Day Type_1,Daylight Savings Status_1,Indoor Temperature (C)_1,Average Unmet Cooling Setpoint Difference (C)_1,Indoor Relative Humidity (%)_1,Equipment Electric Power (kWh)_1,DHW Heating (kWh)_1,Cooling Load (kWh)_1,...,6h Outdoor Relative Humidity (%),12h Outdoor Relative Humidity (%),24h Outdoor Relative Humidity (%),6h Diffuse Solar Radiation (W/m2),12h Diffuse Solar Radiation (W/m2),24h Diffuse Solar Radiation (W/m2),6h Direct Solar Radiation (W/m2),12h Direct Solar Radiation (W/m2),24h Direct Solar Radiation (W/m2),kg_CO2/kWh
0,6,1,5,0,23.098652,-0.12357,61.086187,0.356839,0.055682,1.119216,...,72.980273,41.822361,83.230997,54.625927,116.842886,0.0,143.324335,1020.756093,0.0,0.402488
1,6,2,5,0,22.234742,0.01252,64.361378,0.345078,0.159338,1.469638,...,87.317655,37.910023,95.353948,78.375479,89.220504,0.0,300.361061,825.097292,0.0,0.382625
2,6,3,5,0,22.223061,0.000838,64.493415,0.338769,0.057004,1.458372,...,60.341453,30.190199,66.266409,259.822134,82.855932,0.0,196.691127,987.90933,0.0,0.369458
3,6,4,5,0,22.222251,2.9e-05,64.769702,0.334856,0.0,1.337342,...,50.063925,31.519739,94.45637,248.631525,105.130119,0.0,465.483196,854.331825,0.0,0.367017
4,6,5,5,0,22.222236,1.4e-05,65.15212,0.348607,0.0,1.163453,...,40.942579,33.119543,100.0,272.825159,75.627986,0.0,569.203332,941.243325,0.0,0.37404


Drop the columns with constant values and add an index column

In [13]:
# Columns this notebook treats as constant-valued, listed per building
constant_columns = [
    "Month_1", "Month_2", "Month_3",
    "Daylight Savings Status_1", "Daylight Savings Status_2", "Daylight Savings Status_3",
    "Heating Load (kWh)_1", "Heating Load (kWh)_2", "Heating Load (kWh)_3",
    "HVAC Mode (Off/Cooling/Heating)_1", "HVAC Mode (Off/Cooling/Heating)_2", "HVAC Mode (Off/Cooling/Heating)_3",
]
merged_dropped = merged.drop(columns=constant_columns)

# Prepend a 1-based running row number as the first column, named "Index"
index = range(1, len(merged_dropped) + 1)
merged_dropped.insert(0, "Index", index)

In [14]:
# Row-wise mean of the three buildings' solar generation, stored as an extra
# column to serve as a prediction target later on
solar_columns = [
    "Solar Generation (W/kW)_1",
    "Solar Generation (W/kW)_2",
    "Solar Generation (W/kW)_3",
]
merged_dropped["Avg solar generation"] = merged_dropped[solar_columns].mean(axis="columns")

Split into train and test sets, then min-max normalize both using the training set's minimum and maximum

In [15]:
# Chronological split (no shuffling): the last 5% of the rows become the test set
train, test = train_test_split(merged_dropped, test_size=0.05, shuffle=False)

# The split returns selections taken from `merged_dropped`; assigning columns
# on them can trigger pandas' SettingWithCopyWarning. Take explicit copies so
# the normalization below writes to independent frames.
train = train.copy()
test = test.copy()

# Every column except the running "Index" gets normalized
columns = list(merged_dropped)
columns.remove("Index")

# Min-max scale each column with statistics from the TRAINING rows only, and
# apply the same shift/scale to the test rows so no test information leaks in.
# Test values outside the training range therefore fall outside [0, 1].
for column in columns:
    min_of_column = train[column].min()
    max_of_column = train[column].max()
    range_of_column = max_of_column - min_of_column
    # A column that is constant within the training rows has zero range;
    # dividing by it would give NaN/inf. Use a divisor of 1 instead (the same
    # convention scikit-learn's MinMaxScaler uses), which maps it to 0.
    if range_of_column == 0:
        range_of_column = 1
    train[column] = (train[column] - min_of_column) / range_of_column
    test[column] = (test[column] - min_of_column) / range_of_column

Export the train and test sets

In [16]:
# Write each normalized split to its own CSV, leaving out the pandas row index
for frame, destination in (
    (train, "../data/schemas/warm_up/TrainSet.csv"),
    (test, "../data/schemas/warm_up/TestSet.csv"),
):
    frame.to_csv(destination, index=False)