In [2]:
import os

import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import random

import logging
import time
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model
import pickle

In [3]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/boilers_drive/normalized_df.csv'

Mounted at /content/drive


In [4]:
# Specify the file path of your CSV file
#file_path = 'normalized_df.csv'

# the current script directory
# the current working directory
#current_dir = os.getcwd()

# the path to the CSV file in the parent directory
#parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
#file_path = os.path.join(parent_dir, 'normalized_df.csv')

# Read the CSV file
data = pd.read_csv(file_path)

print("Imported data:")
print(data)

Imported data:
                        time    blr_mod_lvl  absorption  insulation  t_r_set  \
0        2022-10-01 00:00:00   0.000000e+00    0.503910    7.457292     15.0   
1        2022-10-01 00:01:00   0.000000e+00    0.503910    7.455208     15.0   
2        2022-10-01 00:02:00   0.000000e+00    0.518558    7.487500     15.0   
3        2022-10-01 00:03:00   0.000000e+00    0.616207    7.426042     15.0   
4        2022-10-01 00:04:00   0.000000e+00    0.699210    7.425000     15.0   
...                      ...            ...         ...         ...      ...   
7611835  2023-04-30 23:55:00  2.871866e-119    0.000000    1.604167     17.0   
7611836  2023-04-30 23:56:00  1.914578e-119    0.000000    1.614583     17.0   
7611837  2023-04-30 23:57:00  1.276385e-119    0.000000    1.572917     17.0   
7611838  2023-04-30 23:58:00  8.509234e-120    0.000000    1.511458     17.0   
7611839  2023-04-30 23:59:00  5.672823e-120    0.000000    1.432203     17.0   

             t_out house

In [5]:
# Identify NaN values
nan_mask = data.isna()
print("NaN mask:\n", nan_mask)

# Get the count of NaN values in each column
nan_count = data.isna().sum()
print("\nNaN count in each column:\n", nan_count)

# List rows with NaN values
rows_with_nan = data[data.isna().any(axis=1)]
nan_houses = rows_with_nan['house_id'].unique()
print("\nRows with NaN values:\n", rows_with_nan)
print("\nHouses with NaN values:\n", nan_houses)

# List columns with NaN values
columns_with_nan = data.columns[data.isna().any()].tolist()
print("\nColumns with NaN values:\n", columns_with_nan)

NaN mask:
           time  blr_mod_lvl  absorption  insulation  t_r_set  t_out  house_id
0        False        False       False       False    False  False     False
1        False        False       False       False    False  False     False
2        False        False       False       False    False  False     False
3        False        False       False       False    False  False     False
4        False        False       False       False    False  False     False
...        ...          ...         ...         ...      ...    ...       ...
7611835  False        False       False       False    False  False     False
7611836  False        False       False       False    False  False     False
7611837  False        False       False       False    False  False     False
7611838  False        False       False       False    False  False     False
7611839  False        False       False       False    False  False     False

[7611840 rows x 7 columns]

NaN count in each column

In [6]:
# Different houses in data
houses = data['house_id'].unique()
print("Different houses in data:")
print(houses)
print("Number of different houses:")
print(len(houses))

Different houses in data:
['home2' 'home9' 'home13' 'home14' 'home34' 'home46' 'home55' 'home67'
 'home86' 'home93' 'home101' 'home106' 'home110' 'home43' 'home63'
 'home53' 'home79' 'home90' 'home95' 'home5' 'home17' 'home47' 'home51'
 'home65' 'home77' 'home89' 'home111' 'home114']
Number of different houses:
28


In [7]:
house_datasets = {}
for house in houses:
    house_datasets[house] = data[data['house_id'] == house]

#print(house_datasets)

**Preparing the data**

In [1]:
# the categories for prediction
final_category = 'blr_mod_lvl'
prediction_categories = ['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']


In [None]:
# alternative for data processing

filtered_data = data[['house_id', 'time', 'blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']].copy()

# scaler for each category
scaler_blr_mod_lvl = MinMaxScaler()
scaler_absorption = MinMaxScaler()
scaler_insulation = MinMaxScaler()
scaler_t_r_set = MinMaxScaler()
scaler_t_out = MinMaxScaler()
# dict for scalers
scalers = {
    "scaler_blr_mod_lvl": scaler_blr_mod_lvl,
    "scaler_absorption": scaler_absorption,
    "scaler_insulation": scaler_insulation,
    "scaler_t_r_set": scaler_t_r_set,
    "scaler_t_out": scaler_t_out
}
# function to get correct scaler
def get_scaler(category_name):
    scaler_name = "scaler_" + category_name
    return scalers.get(scaler_name)

# normalizing categories with scalers (all data from all houses in each category)
for cat in prediction_categories:
    temp = []
    temp = filtered_data[cat].values
    temp = temp.reshape(-1, 1)  # Reshape to a 2D array with a single column
    scaler_temp = get_scaler(cat)
    temp_normalized = scaler_temp.fit_transform(temp)
    new_cat_name = "normalized_" + cat
    filtered_data[new_cat_name] = temp_normalized

# function for reverse scaling of list (depending on category)
def de_scale(cat_list, category):
    scaler = get_scaler(category)
    return scaler.inverse_transform(cat_list)


In [None]:
#print(filtered_data)

In [None]:
# creating sub-lists, each with the data of one day
# the function that gets category data of a house (ordered by date) and separates by day
def separate_into_days(data_list, minutes_per_day=1440):
    # number of days
    num_days = len(data_list) // minutes_per_day
    # the data into a list of sub-lists, each containing one day's data
    separated_data = [
        data_list[i * minutes_per_day:(i + 1) * minutes_per_day]
        for i in range(num_days)
    ]
    return separated_data


# function to "combine" values of categories and separate into sub-lists based on days
def combine_categories(dataset, categories_list):
    # Combine specified categories into lists
    combined_elements = dataset[categories_list].apply(lambda row: row.tolist(), axis=1)
    return combined_elements.tolist()

In [None]:
def prepare_data_2(house_data, input_categories, output_category, minutes_per_day=1440):
    combined_input_data = combine_categories(house_data, input_categories)
    separated_input_data = separate_into_days(combined_input_data, minutes_per_day)
    output = house_data[output_category].values
    separated_output = separate_into_days(output, minutes_per_day)
    # [:, :-1] and [:, 1:] for 2-d arrays
    # [:-1],  and [1:] for 1-d arrays (or lists)
    # all except last day are inputs (for prediction)
    input_data = separated_input_data[:-1]
    # all except first day are the corresponding outputs (from prediction)
    output_data = separated_output[1:]
    return input_data, output_data

In [None]:
# Filter data for the 'absorption' category
#category = 'absorption'
filtered_data = filtered_data[['house_id', 'time', 'normalized_blr_mod_lvl', 'normalized_absorption', 'normalized_insulation', 'normalized_t_r_set', 'normalized_t_out']].copy()
input_chosen_categories = ['normalized_blr_mod_lvl', 'normalized_absorption']

# We have 1440 minutes per day
minutes_per_day = 1440

# Initialize lists to store input and target pairs
input_data_list, target_data_list = [], []

for house_id in houses:
    house_data = filtered_data[filtered_data['house_id'] == 'home13']
    #house_data = filtered_data[filtered_data['house_id'] == house_id]
    house_data = house_data.sort_values(by='time')
    input_data, target_data = prepare_data_2(house_data, input_chosen_categories, 'normalized_blr_mod_lvl')
    input_data_list.append(input_data)
    target_data_list.append(target_data)

# Combine all houses' data
input_data = np.concatenate(input_data_list, axis=0)
target_data = np.concatenate(target_data_list, axis=0)


In [None]:
# Properly split datasets for train, validation, test
# from original data: 80%-20% split, where 20% is for test. from remaining 80% : 80%-20% split, for train and validation

batch_size = 8 # Reduce the batch size, in case it helps !

# Split the data into train+val and test sets
input_train_val, input_test, target_train_val, target_test = train_test_split(input_data, target_data, test_size=0.2, random_state=42)

# Further split train+val into train and val sets
input_train, input_val, target_train, target_val = train_test_split(input_train_val, target_train_val, test_size=0.2, random_state=42)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train))
train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((input_val, target_val))
val_dataset = val_dataset.cache().batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((input_test, target_test))
test_dataset = test_dataset.cache().batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
print(input_train.shape)
print(target_train.shape)
print(input_val.shape)
print(target_val.shape)
print(input_test.shape)
print(target_test.shape)

(3780, 1440, 2)
(3780, 1440)
(946, 1440, 2)
(946, 1440)
(1182, 1440, 2)
(1182, 1440)


In [None]:
# get train-val-test sets and datasets and save for easy retrieval
