In [1]:
import os

import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import random

import logging
import time
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/boilers_drive/normalized_df.csv'

Mounted at /content/drive


In [3]:
# Specify the file path of your CSV file
#file_path = 'normalized_df.csv'

# the current script directory
# the current working directory
#current_dir = os.getcwd()

# the path to the CSV file in the parent directory
#parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
#file_path = os.path.join(parent_dir, 'normalized_df.csv')

# Read the CSV file
data = pd.read_csv(file_path)

print("Imported data:")
print(data)

Imported data:
                        time    blr_mod_lvl  absorption  insulation  t_r_set  \
0        2022-10-01 00:00:00   0.000000e+00    0.503910    7.457292     15.0   
1        2022-10-01 00:01:00   0.000000e+00    0.503910    7.455208     15.0   
2        2022-10-01 00:02:00   0.000000e+00    0.518558    7.487500     15.0   
3        2022-10-01 00:03:00   0.000000e+00    0.616207    7.426042     15.0   
4        2022-10-01 00:04:00   0.000000e+00    0.699210    7.425000     15.0   
...                      ...            ...         ...         ...      ...   
7611835  2023-04-30 23:55:00  2.871866e-119    0.000000    1.604167     17.0   
7611836  2023-04-30 23:56:00  1.914578e-119    0.000000    1.614583     17.0   
7611837  2023-04-30 23:57:00  1.276385e-119    0.000000    1.572917     17.0   
7611838  2023-04-30 23:58:00  8.509234e-120    0.000000    1.511458     17.0   
7611839  2023-04-30 23:59:00  5.672823e-120    0.000000    1.432203     17.0   

             t_out house

In [4]:
# Identify NaN values
nan_mask = data.isna()
print("NaN mask:\n", nan_mask)

# Get the count of NaN values in each column
nan_count = data.isna().sum()
print("\nNaN count in each column:\n", nan_count)

# List rows with NaN values
rows_with_nan = data[data.isna().any(axis=1)]
nan_houses = rows_with_nan['house_id'].unique()
print("\nRows with NaN values:\n", rows_with_nan)
print("\nHouses with NaN values:\n", nan_houses)

# List columns with NaN values
columns_with_nan = data.columns[data.isna().any()].tolist()
print("\nColumns with NaN values:\n", columns_with_nan)

NaN mask:
           time  blr_mod_lvl  absorption  insulation  t_r_set  t_out  house_id
0        False        False       False       False    False  False     False
1        False        False       False       False    False  False     False
2        False        False       False       False    False  False     False
3        False        False       False       False    False  False     False
4        False        False       False       False    False  False     False
...        ...          ...         ...         ...      ...    ...       ...
7611835  False        False       False       False    False  False     False
7611836  False        False       False       False    False  False     False
7611837  False        False       False       False    False  False     False
7611838  False        False       False       False    False  False     False
7611839  False        False       False       False    False  False     False

[7611840 rows x 7 columns]

NaN count in each column

In [5]:
# Different houses in data
houses = data['house_id'].unique()
print("Different houses in data:")
print(houses)
print("Number of different houses:")
print(len(houses))

Different houses in data:
['home2' 'home9' 'home13' 'home14' 'home34' 'home46' 'home55' 'home67'
 'home86' 'home93' 'home101' 'home106' 'home110' 'home43' 'home63'
 'home53' 'home79' 'home90' 'home95' 'home5' 'home17' 'home47' 'home51'
 'home65' 'home77' 'home89' 'home111' 'home114']
Number of different houses:
28


In [6]:
house_datasets = {}
for house in houses:
    house_datasets[house] = data[data['house_id'] == house]

#print(house_datasets)

**Preparing the data**

In [7]:
# the categories for prediction
final_category = 'blr_mod_lvl'
prediction_categories = ['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']


In [8]:
for house in houses:
  for cat in prediction_categories:
    house_datasets[house][cat+"_min"] = house_datasets[house][cat].min()
    house_datasets[house][cat+"_max"] = house_datasets[house][cat].max()
    house_datasets[house][cat+"_avg"] = house_datasets[house][cat].mean()
    house_datasets[house][cat+"_std"] = house_datasets[house][cat].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_datasets[house][cat+"_min"] = house_datasets[house][cat].min()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_datasets[house][cat+"_max"] = house_datasets[house][cat].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_datasets[house][cat+"_avg"] = house_datasets[house][cat].mean

In [9]:
print(house_datasets)

{'home2':                        time    blr_mod_lvl  absorption  insulation  t_r_set  \
0       2022-10-01 00:00:00   0.000000e+00    0.503910    7.457292     15.0   
1       2022-10-01 00:01:00   0.000000e+00    0.503910    7.455208     15.0   
2       2022-10-01 00:02:00   0.000000e+00    0.518558    7.487500     15.0   
3       2022-10-01 00:03:00   0.000000e+00    0.616207    7.426042     15.0   
4       2022-10-01 00:04:00   0.000000e+00    0.699210    7.425000     15.0   
...                     ...            ...         ...         ...      ...   
305275  2023-04-30 23:55:00  4.940656e-324    0.005078    0.975000     15.5   
305276  2023-04-30 23:56:00  4.940656e-324    0.005078    0.947917     15.5   
305277  2023-04-30 23:57:00  4.940656e-324    0.005078    0.912500     15.5   
305278  2023-04-30 23:58:00  4.940656e-324    0.005078    0.912500     15.5   
305279  2023-04-30 23:59:00  4.940656e-324    0.005078    0.887076     15.5   

            t_out house_id  blr_mod_lvl_m

In [10]:
# creating graphs for each house for each category with min, max values
def show_graphs(dataset, categories):
    num_subplots = len(houses)
    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width
    # Create the figure and axes
    fig, axes = plt.subplots(1, num_subplots, figsize=(total_width, subplot_height))

    # Iterate over categories and create subplots
    i=0
    for house in houses:
        # Get the data for the current category from each sub-dataset
        categories_data = {}
        categories_max = {}
        categories_min = {}
        categories_avg = {}
        for category in categories:
            categories_data[category] = dataset[house][category]

        # Plot the data on the corresponding subplot
        ax = axes[i]
        for category in categories:
            ax.plot(categories_data[category], linestyle='-', label=category)
        ax.set_title(house)
        ax.legend(loc="upper right")
        ax.set_xlabel('Time')
        ax.set_ylabel(category)
        #ax.grid(True)
        i=i+1

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [11]:
# showing categories per house with min, max, avg values

#for category in prediction_categories:
#    categories_for_plot = [category, category+"_min", category+"_max", category+"_avg"]
#    show_graphs(house_datasets, categories_for_plot)

In [12]:
# alternative for data processing

filtered_data = data[['house_id', 'time', 'blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']].copy()

# scaler for each category
#scaler_blr_mod_lvl = MinMaxScaler()
scaler_absorption = MinMaxScaler()
scaler_insulation = MinMaxScaler()
scaler_t_r_set = MinMaxScaler()
scaler_t_out = MinMaxScaler()
# dict for scalers
scalers = {
    #"scaler_blr_mod_lvl": scaler_blr_mod_lvl,
    "scaler_absorption": scaler_absorption,
    "scaler_insulation": scaler_insulation,
    "scaler_t_r_set": scaler_t_r_set,
    "scaler_t_out": scaler_t_out
}
# function to get correct scaler
def get_scaler(category_name):
    scaler_name = "scaler_" + category_name
    return scalers.get(scaler_name)

# normalizing categories with scalers (all data from all houses in each category)
for cat in prediction_categories:
    if cat == "blr_mod_lvl":
        filtered_data["normalized_blr_mod_lvl"] = filtered_data["blr_mod_lvl"] / 100
        # blr_mod_lvl only has values between 0 and 100
    else:
        temp = []
        temp = filtered_data[cat].values
        temp = temp.reshape(-1, 1)  # Reshape to a 2D array with a single column
        scaler_temp = get_scaler(cat)
        temp_normalized = scaler_temp.fit_transform(temp)
        new_cat_name = "normalized_" + cat
        filtered_data[new_cat_name] = temp_normalized

# function for reverse scaling of list (depending on category)
def de_scale(cat_list, category):
    if category == "blr_mod_lvl":
        inverse_cat_list = cat_list * 100
        return inverse_cat_list
    else:
        scaler = get_scaler(category)
        return scaler.inverse_transform(cat_list)

In [13]:
#print(filtered_data)

In [16]:
# updating house_datasets with new data
for house in houses:
    house_datasets[house] = filtered_data[filtered_data['house_id'] == house]

In [17]:
print(house_datasets)

{'home2':        house_id                 time    blr_mod_lvl  absorption  insulation  \
0         home2  2022-10-01 00:00:00   0.000000e+00    0.503910    7.457292   
1         home2  2022-10-01 00:01:00   0.000000e+00    0.503910    7.455208   
2         home2  2022-10-01 00:02:00   0.000000e+00    0.518558    7.487500   
3         home2  2022-10-01 00:03:00   0.000000e+00    0.616207    7.426042   
4         home2  2022-10-01 00:04:00   0.000000e+00    0.699210    7.425000   
...         ...                  ...            ...         ...         ...   
305275    home2  2023-04-30 23:55:00  4.940656e-324    0.005078    0.975000   
305276    home2  2023-04-30 23:56:00  4.940656e-324    0.005078    0.947917   
305277    home2  2023-04-30 23:57:00  4.940656e-324    0.005078    0.912500   
305278    home2  2023-04-30 23:58:00  4.940656e-324    0.005078    0.912500   
305279    home2  2023-04-30 23:59:00  4.940656e-324    0.005078    0.887076   

        t_r_set      t_out  normalized_bl

In [18]:
min_blr_mod_lvl_values = {}
max_blr_mod_lvl_values = {}

for house in houses:
    min_blr_mod_lvl_values[house] = house_datasets[house]['normalized_blr_mod_lvl'].min()
    max_blr_mod_lvl_values[house] = house_datasets[house]['normalized_blr_mod_lvl'].max()

print("The min values for normalized_blr_mod_lvl are:")
print(min_blr_mod_lvl_values)
print("The max values for normalized_blr_mod_lvl are:")
print(max_blr_mod_lvl_values)

The min values for normalized_blr_mod_lvl are:
{'home2': 0.0, 'home9': 0.0, 'home13': 0.0, 'home14': 0.0, 'home34': 0.0, 'home46': 0.0, 'home55': 0.0, 'home67': 0.0, 'home86': 0.0, 'home93': 0.0, 'home101': 0.0, 'home106': 0.0, 'home110': 0.0, 'home43': 0.0, 'home63': 0.0, 'home53': 0.0, 'home79': 0.0, 'home90': 0.0, 'home95': 0.0, 'home5': 0.0, 'home17': 0.0, 'home47': 0.0, 'home51': 0.0, 'home65': 0.0, 'home77': 0.0, 'home89': 0.0, 'home111': 0.0, 'home114': 0.0}
The max values for normalized_blr_mod_lvl are:
{'home2': 0.5727160494107193, 'home9': 0.46137527832949793, 'home13': 0.7309388528612211, 'home14': 0.7400465046912482, 'home34': 0.7390665421937713, 'home46': 0.7376484810702111, 'home55': 0.7872875558672591, 'home67': 0.6298777667699708, 'home86': 0.4350339506172839, 'home93': 0.7132497267913068, 'home101': 0.6805356652949246, 'home106': 0.6378462903051156, 'home110': 0.8430771042085702, 'home43': 0.6853310185577202, 'home63': 0.4075061728395499, 'home53': 0.6832515151515152, 

In [19]:
normalized_categories = ["time", "house_id"]
for cat in prediction_categories:
  normalized_categories.append("normalized_" + cat)
print(normalized_categories)

['time', 'house_id', 'normalized_blr_mod_lvl', 'normalized_absorption', 'normalized_insulation', 'normalized_t_r_set', 'normalized_t_out']


In [20]:
# separating training and test data
# test data will be the last 20% (days!) from all houses
# (minutes_per_day=1440)

test_datasets = {}
train_val_datasets = {}

for house in houses:
    temp_house_data = house_datasets[house][normalized_categories]
    len_house = len(temp_house_data)
    days_of_house = len_house / 1440
    # calculate the 80% of the days (make it int before getting the minutes)
    train_size = int(0.8 * days_of_house) * 1440
    test_size = len_house - train_size
    train_val_datasets[house] = temp_house_data.iloc[:train_size]
    test_datasets[house] = temp_house_data.iloc[train_size:]

# from the train_val_datasets, split 80%-20% into training, validation
train_datasets = {}
val_datasets = {}

for house in houses:
    temp_house_data = train_val_datasets[house]
    len_house = len(temp_house_data)
    days_of_house = len_house / 1440
    # calculate the 80% of the days (make it int before getting the minutes)
    train_size = int(0.8 * days_of_house) * 1440
    val_size = len_house - train_size
    train_datasets[house] = temp_house_data.iloc[:train_size]
    val_datasets[house] = temp_house_data.iloc[train_size:]

In [21]:
#print(train_datasets)
#print(val_datasets)
#print(test_datasets)

In [22]:
# combine the train_datasets, test_datasets into one train_set , test_set

train_set = pd.concat(train_datasets.values(), ignore_index=True)
val_set = pd.concat(val_datasets.values(), ignore_index=True)
test_set = pd.concat(test_datasets.values(), ignore_index=True)

In [23]:
#print(train_set)
#print(val_set)
#print(test_set)

In [24]:
# order train, test by house and then by date
train_set = train_set.sort_values(by=['house_id', 'time'])
val_set = val_set.sort_values(by=['house_id', 'time'])
test_set = test_set.sort_values(by=['house_id', 'time'])

In [25]:
# get train-val-test sets and save for easy retrieval

# The paths for the output CSV files
output_train_csv_path = '/content/drive/MyDrive/boilers_drive/train_df.csv'
output_val_csv_path = '/content/drive/MyDrive/boilers_drive/val_df.csv'
output_test_csv_path = '/content/drive/MyDrive/boilers_drive/test_df.csv'

# Export final datasets to CSV files
train_set.to_csv(output_train_csv_path, index=False) # save train dataset
val_set.to_csv(output_val_csv_path, index=False) # save val dataset
test_set.to_csv(output_test_csv_path, index=False) # save test dataset

In [26]:
# !!! Already done this once. No need to repeat (it will change the order of the houses) !!!

'''
# random order of houses, for when I need some houses for training (e.g. 1 or 2 , not all houses)
rand_houses = list(houses)
random.shuffle(rand_houses)
#print(rand_houses)
random_order_houses = pd.DataFrame({'house_id': rand_houses})
print(random_order_houses)
'''

"\n# random order of houses, for when I need some houses for training (e.g. 1 or 2 , not all houses)\nrand_houses = list(houses)\nrandom.shuffle(rand_houses)\n#print(rand_houses)\nrandom_order_houses = pd.DataFrame({'house_id': rand_houses})\nprint(random_order_houses)\n"

In [27]:
# !!! Same as above !!!

# save the random order of houses
#random_order_houses.to_csv('/content/drive/MyDrive/boilers_drive/random_order_houses.csv', index=False)