In [None]:
import os

import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import random

import logging
import time
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model
import pickle

In [None]:
# Mount Google Drive in the Colab runtime so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Location of the input CSV on the mounted drive; it is read by pd.read_csv in the loading cell.
file_path = '/content/drive/MyDrive/boilers_drive/normalized_df.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the data set from the CSV located at `file_path`.

# Alternative for running outside Colab: instead of the Drive path, point
# `file_path` at 'normalized_df.csv' in the current directory ...
#file_path = 'normalized_df.csv'

# ... or in the parent of the current working directory:
#current_dir = os.getcwd()

# the path to the CSV file in the parent directory
#parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
#file_path = os.path.join(parent_dir, 'normalized_df.csv')

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Uncomment to inspect the imported data
#print("Imported data:")
#print(data)

Imported data:
                        time house_id  normalized_blr_mod_lvl  \
0        2022-10-01 00:00:00    home2                     0.0   
1        2022-10-01 00:01:00    home2                     0.0   
2        2022-10-01 00:02:00    home2                     0.0   
3        2022-10-01 00:03:00    home2                     0.0   
4        2022-10-01 00:04:00    home2                     0.0   
...                      ...      ...                     ...   
7611835  2023-04-30 23:55:00  home114                     0.0   
7611836  2023-04-30 23:56:00  home114                     0.0   
7611837  2023-04-30 23:57:00  home114                     0.0   
7611838  2023-04-30 23:58:00  home114                     0.0   
7611839  2023-04-30 23:59:00  home114                     0.0   

         normalized_absorption  normalized_insulation  normalized_t_r_set  \
0                         0.50                   7.46                15.0   
1                         0.50                   7

In [None]:
# Build an old-name -> new-name mapping that strips the 'normalized_' marker
# from every column whose name begins with it.
rename_map = {}
for column_name in data.columns:
    if column_name.startswith('normalized_'):
        rename_map[column_name] = column_name.replace('normalized_', '')

# Apply the mapping; columns not present in the mapping keep their names.
data = data.rename(columns=rename_map)

In [None]:
# Report missing values: per-column counts, the houses affected and the columns affected.

# Boolean frame marking every missing cell. It is computed once and reused for
# all three summaries below instead of re-scanning the whole frame each time.
nan_mask = data.isna()
#print("NaN mask:\n", nan_mask)

# Get the count of NaN values in each column
nan_count = nan_mask.sum()
print("\nNaN count in each column:\n", nan_count)

# List rows with NaN values (any column missing) and the houses they belong to
rows_with_nan = data[nan_mask.any(axis=1)]
nan_houses = rows_with_nan['house_id'].unique()
#print("\nRows with NaN values:\n", rows_with_nan)
print("\nHouses with NaN values:\n", nan_houses)

# List columns with NaN values
columns_with_nan = data.columns[nan_mask.any()].tolist()
print("\nColumns with NaN values:\n", columns_with_nan)


NaN count in each column:
 time           0
house_id       0
blr_mod_lvl    0
absorption     0
insulation     0
t_r_set        0
t_out          0
blr_t          0
heat           0
flame          0
water          0
t_ret          0
t_r            0
t_set          0
otc_cur        0
otc_maxt       0
bypass         0
year           0
month          0
day            0
day_of_year    0
hour           0
minute         0
dtype: int64

Houses with NaN values:
 []

Columns with NaN values:
 []


In [None]:
# Distinct house ids, in order of first appearance in the data
houses = data['house_id'].unique()
print(
    "Different houses in data:",
    houses,
    "Number of different houses:",
    len(houses),
    sep="\n",
)

Different houses in data:
['home2' 'home9' 'home13' 'home14' 'home34' 'home46' 'home55' 'home67'
 'home86' 'home93' 'home101' 'home106' 'home110' 'home43' 'home63'
 'home53' 'home79' 'home90' 'home95' 'home5' 'home17' 'home47' 'home51'
 'home65' 'home77' 'home89' 'home111' 'home114']
Number of different houses:
28


In [None]:
# One sub-frame per house, keyed by house id
house_datasets = {
    house_id: data[data['house_id'] == house_id]
    for house_id in houses
}

#print(house_datasets)

**Preparing the data**

In [None]:
# Column that is referred to as the final category
final_category = 'blr_mod_lvl'
#prediction_categories = ['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']

# Every column of the data, in file order
all_categories = list(data.columns)
print("All categories in the data:", all_categories, sep="\n")

# Prediction categories: all columns except the two identifier columns
prediction_categories = list(all_categories)
for identifier_column in ('time', 'house_id'):
    prediction_categories.remove(identifier_column)

print("Prediction categories:", prediction_categories, sep="\n")

All categories in the data:
['time', 'house_id', 'blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out', 'blr_t', 'heat', 'flame', 'water', 't_ret', 't_r', 't_set', 'otc_cur', 'otc_maxt', 'bypass', 'year', 'month', 'day', 'day_of_year', 'hour', 'minute']
Prediction categories:
['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out', 'blr_t', 'heat', 'flame', 'water', 't_ret', 't_r', 't_set', 'otc_cur', 'otc_maxt', 'bypass', 'year', 'month', 'day', 'day_of_year', 'hour', 'minute']


In [None]:
# get the min, max, avg, std  values for each category per house for plotting/imaging
#for house in houses:
#  for cat in prediction_categories:
#    house_datasets[house][cat+"_min"] = house_datasets[house][cat].min()
#    house_datasets[house][cat+"_max"] = house_datasets[house][cat].max()
#    house_datasets[house][cat+"_avg"] = house_datasets[house][cat].mean()
#    house_datasets[house][cat+"_std"] = house_datasets[house][cat].std()

In [None]:
#print(house_datasets)

In [None]:
# creating graphs for each house for the requested categories
def show_graphs(dataset, categories):
    """Plot the requested categories for every house, one subplot per house.

    Parameters
    ----------
    dataset : mapping
        Indexed as ``dataset[house][category]`` for each house in the
        notebook-level ``houses`` (e.g. the ``house_datasets`` dict of frames).
    categories : sequence of str
        Column names to draw; each one becomes a line in every subplot.
        It is iterated once per house, so it must be re-iterable.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    num_subplots = len(houses)
    if num_subplots == 0:
        # plt.subplots cannot build a grid with zero columns; nothing to draw.
        return

    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width

    # squeeze=False keeps `axes` a 2-D array even for a single house; with the
    # default, a 1x1 grid returns a bare Axes and indexing it would fail.
    fig, axes = plt.subplots(
        1, num_subplots, figsize=(total_width, subplot_height), squeeze=False
    )

    # The y axis is shared by every plotted category, so name all of them
    # rather than only the last one of the loop.
    y_label = ", ".join(str(category) for category in categories)

    for ax, house in zip(axes[0], houses):
        for category in categories:
            ax.plot(dataset[house][category], linestyle='-', label=category)
        ax.set_title(house)
        ax.legend(loc="upper right")
        ax.set_xlabel('Time')
        ax.set_ylabel(y_label)
        #ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# showing categories per house with min, max, avg values

#for category in prediction_categories:
#    categories_for_plot = [category, category+"_min", category+"_max", category+"_avg"]
#    show_graphs(house_datasets, categories_for_plot)

In [None]:
# The categories to scale are an independent copy of the prediction categories
scaling_categories = list(prediction_categories)

for category_list in (prediction_categories, scaling_categories):
    print(category_list)

['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out', 'blr_t', 'heat', 'flame', 'water', 't_ret', 't_r', 't_set', 'otc_cur', 'otc_maxt', 'bypass', 'year', 'month', 'day', 'day_of_year', 'hour', 'minute']
['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out', 'blr_t', 'heat', 'flame', 'water', 't_ret', 't_r', 't_set', 'otc_cur', 'otc_maxt', 'bypass', 'year', 'month', 'day', 'day_of_year', 'hour', 'minute']


In [None]:
# Per-house scaling of every category in `scaling_categories`.
#
# NOTE(review): the scalers are fitted on each house's complete data, i.e. before
# the train/validation/test split performed later in this notebook, so the
# min/max of the test period influence the scaling - confirm this is intended.

# Work on a fresh copy so `data` stays untouched and re-running this cell is
# idempotent. The scaled columns are cast to float up front: the scaled values
# are floats, and writing floats into an integer-typed column triggers pandas'
# "incompatible dtype" FutureWarning (an error in newer pandas versions).
filtered_data = data[all_categories].astype(
    {cat: 'float64' for cat in scaling_categories}
)

# One MinMaxScaler per house and category. An entry is created for
# 'blr_mod_lvl' too so the layout of the pickled dict stays unchanged, but that
# scaler is never fitted: blr_mod_lvl is only divided by 100 below.
scalers_dict = {
    house: {cat: MinMaxScaler() for cat in scaling_categories}
    for house in houses
}


def get_scaler(category_name, house):
    """Return the scaler stored in `scalers_dict` for `category_name` of `house`.

    Raises KeyError if the house or the category has no entry.
    """
    return scalers_dict[house][category_name]


# Normalizing categories with scalers (each house and category separately)
for house in houses:
    # The row mask only depends on the house, so build it once per house
    house_mask = filtered_data["house_id"] == house
    for cat in scaling_categories:
        if cat == "blr_mod_lvl":
            filtered_data.loc[house_mask, cat] = filtered_data.loc[house_mask, cat] / 100
        else:
            scaler = get_scaler(cat, house)
            # scikit-learn expects a 2-D array: one column, one row per sample
            column_values = filtered_data.loc[house_mask, cat].to_numpy().reshape(-1, 1)
            # fit_transform fits the scaler held in scalers_dict in place, so
            # the fitted state is what gets pickled below
            filtered_data.loc[house_mask, cat] = scaler.fit_transform(column_values).ravel()

# now that scalers have been 'trained' and been used, save them for de-scaling later
with open('/content/drive/MyDrive/boilers_drive/scalers.pkl', 'wb') as file:
    pickle.dump(scalers_dict, file)


  filtered_data.loc[filtered_data["house_id"] == house, new_cat_name] = temp_normalized.flatten()
  filtered_data.loc[filtered_data["house_id"] == house, new_cat_name] = temp_normalized.flatten()
  filtered_data.loc[filtered_data["house_id"] == house, new_cat_name] = temp_normalized.flatten()
  filtered_data.loc[filtered_data["house_id"] == house, new_cat_name] = temp_normalized.flatten()
  filtered_data.loc[filtered_data["house_id"] == house, new_cat_name] = temp_normalized.flatten()


In [None]:
#print(filtered_data)

In [None]:
# Replace each house's entry with its rows from the scaled frame.
# A generator is used so the entries are swapped one at a time.
house_datasets.update(
    (house_id, filtered_data[filtered_data['house_id'] == house_id])
    for house_id in houses
)

In [None]:
# Spot-check the per-house frames without dumping all of them: print the
# (rows, columns) shape of every house and preview the first rows of one house.
for house_id, house_frame in house_datasets.items():
    print(house_id, house_frame.shape)
if house_datasets:
    print(next(iter(house_datasets.values())).head())

{'home2':                        time house_id  blr_mod_lvl  absorption  insulation  \
0       2022-10-01 00:00:00    home2          0.0    0.277286    0.555556   
1       2022-10-01 00:01:00    home2          0.0    0.277286    0.555556   
2       2022-10-01 00:02:00    home2          0.0    0.278024    0.556755   
3       2022-10-01 00:03:00    home2          0.0    0.281711    0.554357   
4       2022-10-01 00:04:00    home2          0.0    0.284661    0.554357   
...                     ...      ...          ...         ...         ...   
305275  2023-04-30 23:55:00    home2          0.0    0.259218    0.296563   
305276  2023-04-30 23:56:00    home2          0.0    0.259218    0.295364   
305277  2023-04-30 23:57:00    home2          0.0    0.259218    0.293765   
305278  2023-04-30 23:58:00    home2          0.0    0.259218    0.293765   
305279  2023-04-30 23:59:00    home2          0.0    0.259218    0.292966   

         t_r_set     t_out     blr_t  heat  flame  ...  t_set  ot

In [None]:
# Smallest and largest blr_mod_lvl per house, keyed by house id
min_blr_mod_lvl_values = {
    house_id: house_datasets[house_id]['blr_mod_lvl'].min() for house_id in houses
}
max_blr_mod_lvl_values = {
    house_id: house_datasets[house_id]['blr_mod_lvl'].max() for house_id in houses
}

#print("The min values for normalized_blr_mod_lvl are:")
#print(min_blr_mod_lvl_values)
#print("The max values for normalized_blr_mod_lvl are:")
#print(max_blr_mod_lvl_values)


# One row per house: the house id plus its min and max looked up in the dicts above
limits_df = pd.DataFrame({'house_id': houses}).assign(
    min_blr_mod_lvl=lambda frame: frame['house_id'].map(min_blr_mod_lvl_values),
    max_blr_mod_lvl=lambda frame: frame['house_id'].map(max_blr_mod_lvl_values),
)
print("The min, max values of blr_mod_lvl (from the df) are:")
print(limits_df.to_string(index=False))

The min, max values of blr_mod_lvl (from the df) are:
house_id  min_blr_mod_lvl  max_blr_mod_lvl
   home2              0.0           0.5727
   home9              0.0           0.4614
  home13              0.0           0.7309
  home14              0.0           0.7400
  home34              0.0           0.7391
  home46              0.0           0.7376
  home55              0.0           0.7873
  home67              0.0           0.6299
  home86              0.0           0.4350
  home93              0.0           0.7132
 home101              0.0           0.6805
 home106              0.0           0.6378
 home110              0.0           0.8431
  home43              0.0           0.6853
  home63              0.0           0.4075
  home53              0.0           0.6833
  home79              0.0           0.8124
  home90              0.0           0.5862
  home95              0.0           0.6343
   home5              0.0           0.5369
  home17              0.0           0.5495


In [None]:
# Identifier columns first, followed by every prediction category
normalized_categories = ["time", "house_id", *prediction_categories]
print(normalized_categories)

['time', 'house_id', 'blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out', 'blr_t', 'heat', 'flame', 'water', 't_ret', 't_r', 't_set', 'otc_cur', 'otc_maxt', 'bypass', 'year', 'month', 'day', 'day_of_year', 'hour', 'minute']


In [None]:
# separating training, validation and test data
# test data will be the last 20% (days!) from all houses;
# the remaining rows are split 80%-20% again into training and validation.
# NOTE(review): the split is positional, so it assumes each house's rows are
# already in chronological order with one row per minute - confirm.

MINUTES_PER_DAY = 1440
TRAIN_FRACTION = 0.8


def _split_by_whole_days(frame, train_fraction=TRAIN_FRACTION, minutes_per_day=MINUTES_PER_DAY):
    """Split `frame` by position into a leading and a trailing part.

    The leading part holds ``int(train_fraction * days)`` whole days of rows,
    where ``days = len(frame) / minutes_per_day``; the trailing part holds all
    remaining rows (including any partial day).

    Returns
    -------
    (head, tail) : tuple of frames obtained with ``frame.iloc``.
    """
    days_in_frame = len(frame) / minutes_per_day
    # truncate to whole days before converting back to a row count
    head_rows = int(train_fraction * days_in_frame) * minutes_per_day
    return frame.iloc[:head_rows], frame.iloc[head_rows:]


test_datasets = {}
train_val_datasets = {}

for house in houses:
    train_val_datasets[house], test_datasets[house] = _split_by_whole_days(
        house_datasets[house][normalized_categories]
    )

# from the train_val_datasets, split 80%-20% into training, validation
train_datasets = {}
val_datasets = {}

for house in houses:
    train_datasets[house], val_datasets[house] = _split_by_whole_days(
        train_val_datasets[house]
    )

In [None]:
#print(train_datasets)
#print(val_datasets)
#print(test_datasets)

In [None]:
# Stack the per-house frames of each split into a single frame with a fresh index
train_set, val_set, test_set = (
    pd.concat(per_house_frames.values(), ignore_index=True)
    for per_house_frames in (train_datasets, val_datasets, test_datasets)
)

In [None]:
#print(train_set)
#print(val_set)
#print(test_set)

In [None]:
# Order every split by house first and by timestamp second
train_set, val_set, test_set = (
    split_frame.sort_values(by=['house_id', 'time'])
    for split_frame in (train_set, val_set, test_set)
)

In [None]:
# Persist the train / validation / test sets as CSV files for easy retrieval

# Destination of each split on the mounted drive
output_train_csv_path = '/content/drive/MyDrive/boilers_drive/train_df.csv'
output_val_csv_path = '/content/drive/MyDrive/boilers_drive/val_df.csv'
output_test_csv_path = '/content/drive/MyDrive/boilers_drive/test_df.csv'

# Write each split, without the index column, in train -> val -> test order
for split_frame, csv_path in (
    (train_set, output_train_csv_path),
    (val_set, output_val_csv_path),
    (test_set, output_test_csv_path),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# !!! Already done this once. No need to repeat (it will change the order of the houses) !!!

'''
# random order of houses, for when I need some houses for training (e.g. 1 or 2 , not all houses)
rand_houses = list(houses)
random.shuffle(rand_houses)
#print(rand_houses)
random_order_houses = pd.DataFrame({'house_id': rand_houses})
print(random_order_houses)
'''

"\n# random order of houses, for when I need some houses for training (e.g. 1 or 2 , not all houses)\nrand_houses = list(houses)\nrandom.shuffle(rand_houses)\n#print(rand_houses)\nrandom_order_houses = pd.DataFrame({'house_id': rand_houses})\nprint(random_order_houses)\n"

In [None]:
# !!! Already done once, no need to repeat (the house order was shuffled and saved a single time) !!!

# save the random order of houses
#random_order_houses.to_csv('/content/drive/MyDrive/boilers_drive/random_order_houses.csv', index=False)