In [None]:
import pandas as pd

Merge all the data

In [None]:
# Load the three building files; every column of a building frame gets that
# building's number as a suffix so the names stay unique after merging
building1 = pd.read_csv("../data/original/Building_1.csv", sep=",").add_suffix("_1")
building2 = pd.read_csv("../data/original/Building_2.csv", sep=",").add_suffix("_2")
building3 = pd.read_csv("../data/original/Building_3.csv", sep=",").add_suffix("_3")

# Load the remaining tables (their column names are kept as they are)
weather = pd.read_csv("../data/original/weather.csv", sep=",")
carbon_intensity = pd.read_csv("../data/original/carbon_intensity.csv", sep=",")
pricing = pd.read_csv("../data/original/pricing.csv", sep=",")

# Put all frames side by side; concat matches rows by index label, which here
# is the default index that read_csv assigns when no index column is given
data = [building1, building2, building3, weather, carbon_intensity, pricing]
merged = pd.concat(objs=data, axis="columns")
merged.head()

Drop the columns with constant values

In [None]:
# Per-building columns that this notebook treats as constant and therefore removes
constant_columns = [
    "Month_1", "Month_2", "Month_3",
    "Daylight Savings Status_1", "Daylight Savings Status_2", "Daylight Savings Status_3",
    "Heating Load (kWh)_1", "Heating Load (kWh)_2", "Heating Load (kWh)_3",
    "HVAC Mode (Off/Cooling/Heating)_1", "HVAC Mode (Off/Cooling/Heating)_2", "HVAC Mode (Off/Cooling/Heating)_3",
]
merged_dropped = merged.drop(labels=constant_columns, axis="columns")

# Prepend a 1-based running row number as a regular column named "Index"
index = range(1, len(merged_dropped) + 1)
merged_dropped.insert(0, "Index", index)

In [None]:
# Row-wise mean of the three buildings' solar generation, stored as an extra
# column (kept as a target for later prediction)
solar_columns = [
    "Solar Generation (W/kW)_1",
    "Solar Generation (W/kW)_2",
    "Solar Generation (W/kW)_3",
]
merged_dropped["Avg solar generation"] = merged_dropped[solar_columns].mean(axis="columns")

In [None]:
# Display pandas' summary statistics of the merged frame as this cell's output
merged_dropped.describe()

Split into train, validation and test sets and normalize

In [None]:
total_length = merged_dropped.shape[0]
# Number of rows held out at the end of the frame for each evaluation set.
# The original note labels 72 rows as the "last 3 days" (presumably hourly rows - confirm).
SIZE_OF_TEST = 72
SIZE_OF_VALIDATION = 72

# Row layout: [ train | validation | test ], with test being the final rows.
# .copy() gives every split its own data, so the column assignments below do not
# operate on slices of merged_dropped (which raises SettingWithCopyWarning and
# leaves it ambiguous whether the parent frame is modified).
train = merged_dropped.head(total_length - SIZE_OF_TEST - SIZE_OF_VALIDATION).copy()
validation = merged_dropped.iloc[total_length - SIZE_OF_TEST - SIZE_OF_VALIDATION: total_length - SIZE_OF_TEST].copy()
test = merged_dropped.tail(SIZE_OF_TEST).copy()

# Ignore index when normalizing
columns = list(merged_dropped)
columns.remove("Index")

# Min-max normalize every column. The minimum and maximum come from the training
# rows only and are reused for validation and test, so held-out values can fall
# outside [0, 1].
for column in columns:
    min_of_column = train[column].min()
    max_of_column = train[column].max()

    # A column that is constant over the training rows has a zero range; dividing
    # by it would write NaN/inf into the exported sets, so use a unit range instead
    # (the training values of such a column then become 0).
    range_of_column = max_of_column - min_of_column
    if range_of_column == 0:
        range_of_column = 1

    train[column] = (train[column] - min_of_column) / range_of_column
    validation[column] = (validation[column] - min_of_column) / range_of_column
    test[column] = (test[column] - min_of_column) / range_of_column

    # Print the scaling bounds of these two columns so their normalized values
    # can be mapped back to the original scale later
    if column == "Avg solar generation":
        print(f"solar max = {max_of_column}")
        print(f"solar min = {min_of_column}")
    if column == "kg_CO2/kWh":
        print(f"carbon max = {max_of_column}")
        print(f"carbon min = {min_of_column}")

Export the train, validation and test sets

In [None]:
# Write each split to its own CSV file. index=False leaves the DataFrame's row
# labels out of the file; the "Index" column is a regular column and is written.
export_targets = (
    (train, "../data/input-output-batches/TrainSet.csv"),
    (validation, "../data/input-output-batches/ValidationSet.csv"),
    (test, "../data/input-output-batches/TestSet.csv"),
)
for split_frame, csv_path in export_targets:
    split_frame.to_csv(csv_path, index=False)