In [1]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

In [2]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/boilers_drive/merged_df_30.csv'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# path of CSV file
#file_path = 'merged_df_30.csv'

# the current script directory
# the current working directory
#current_dir = os.getcwd()

# the path to the CSV file in the parent directory
#parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
#file_path = os.path.join(parent_dir, 'merged_df_30.csv')

# reading CSV file
data = pd.read_csv(file_path)

In [None]:
print("Imported data:")
print(data)

Imported data:
                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0        2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1        2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2        2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3        2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4        2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                      ...            ...        ...   ...    ...    ...   
7331983  2023-04-30 23:55:00  2.871866e-119  20.000000   0.0    0.0    0.0   
7331984  2023-04-30 23:56:00  1.914578e-119  20.000000   0.0    0.0    0.0   
7331985  2023-04-30 23:57:00  1.276385e-119  20.000000   0.0    0.0    0.0   
7331986  2023-04-30 23:58:00  8.509234e-120  20.000000   0.0    0.0    0.0   
7331987  2023-04-30 23:59:00  5.672823e-120  20.000000   0.0    0.0    0.0   

             t_out      t_ret   t_r  t_r_set  ot

In [None]:
# functions for plotting data
def plot_one(category, category_name):
    plt.plot(category, linestyle='-')
    plt.title("Plot of " + category_name)
    plt.ylabel(category_name)
    plt.show

def plot_dataset(dataset, column_names):
    for column in column_names:
        plt.plot(dataset[column], linestyle='-')
        plt.title("Plot of "+ column)
        plt.ylabel(column)
        #plt.grid(True)
        plt.show()

In [None]:
# list of all column names
column_names = data.columns.tolist()

print("List of column names:")
print(column_names)

corr_columns = column_names.copy()
corr_columns.remove('time')
corr_columns.remove('house_id')
corr_columns.remove('nodata')

# list of column names for correlation
print("List of column names for correlation:")
print(corr_columns)

plot_columns = corr_columns.copy()
plot_columns.remove("month")
plot_columns.remove("day")
plot_columns.remove("hour")

# list of column names for plotting
print("List of column names for plotting:")
print(plot_columns)

List of column names:
['time', 'blr_mod_lvl', 'blr_t', 'heat', 'flame', 'water', 't_out', 't_ret', 't_r', 't_r_set', 'otc_cur', 't_set', 'otc_maxt', 'bypass', 'nodata', 'house_id', 'month', 'day', 'hour']
List of column names for correlation:
['blr_mod_lvl', 'blr_t', 'heat', 'flame', 'water', 't_out', 't_ret', 't_r', 't_r_set', 'otc_cur', 't_set', 'otc_maxt', 'bypass', 'month', 'day', 'hour']
List of column names for plotting:
['blr_mod_lvl', 'blr_t', 'heat', 'flame', 'water', 't_out', 't_ret', 't_r', 't_r_set', 'otc_cur', 't_set', 'otc_maxt', 'bypass']


In [None]:
nodata_value = 0.0

filtered_nodata = data[data['nodata'] != nodata_value]

print(filtered_nodata)

Empty DataFrame
Columns: [time, blr_mod_lvl, blr_t, heat, flame, water, t_out, t_ret, t_r, t_r_set, otc_cur, t_set, otc_maxt, bypass, nodata, house_id, month, day, hour]
Index: []


In [None]:
# Different houses (house ids) in data
houses = data['house_id'].unique()
print("Different houses in data:")
print(houses)
print("Number of different houses:")
print(len(houses))

Different houses in data:
['home2' 'home9' 'home13' 'home14' 'home34' 'home46' 'home55' 'home67'
 'home86' 'home93' 'home101' 'home106' 'home110' 'home43' 'home63'
 'home53' 'home79' 'home90' 'home95' 'home5' 'home17' 'home47' 'home51'
 'home65' 'home77' 'home89' 'home111' 'home114']
Number of different houses:
28


In [None]:
# for plotting a category for all houses
def plot_houses_category(dataset, category):
    num_subplots = len(houses)

    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width

    # Create the figure and axes
    fig, axes = plt.subplots(1, num_subplots, figsize=(total_width, subplot_height))

    # Iterate over categories and create subplots
    i=0
    for house in houses:
        # Get the data for the current category from each sub-dataset
        category_data = dataset[house][category]

        # Plot the data on the corresponding subplot
        ax = axes[i]
        ax.plot(category_data, linestyle='-', label=category)
        ax.set_title(house)
        ax.legend()
        #ax.grid(True)
        i=i+1

    # Adjust layout
    plt.tight_layout()
    plt.show()

Data Preprocessing

In [None]:
data2 = data

# the categopries with which we will study the data

data2['absorption'] = data2['blr_t'] - data2['t_ret']
data2['insulation'] = data2['t_r'] - data2['t_out']

data2['blr_mod_lvl_error'] = 0
data2['absorption_error'] = 0
data2['insulation_error'] = 0
data2['t_r_set_error'] = 0
data2['t_out_error'] = 0

print(data2)

                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0        2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1        2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2        2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3        2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4        2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                      ...            ...        ...   ...    ...    ...   
7331983  2023-04-30 23:55:00  2.871866e-119  20.000000   0.0    0.0    0.0   
7331984  2023-04-30 23:56:00  1.914578e-119  20.000000   0.0    0.0    0.0   
7331985  2023-04-30 23:57:00  1.276385e-119  20.000000   0.0    0.0    0.0   
7331986  2023-04-30 23:58:00  8.509234e-120  20.000000   0.0    0.0    0.0   
7331987  2023-04-30 23:59:00  5.672823e-120  20.000000   0.0    0.0    0.0   

             t_out      t_ret   t_r  t_r_set  ...  month  day  

In [None]:
house_datasets2 = {}
for house in houses:
    house_datasets2[house] = data2[data2['house_id'] == house]

# Example
#print("House-dataset for house_id 'home34':")
#print(house_datasets2["home34"])

In [None]:
# see if there are houses with extreme values of t_out
extreme_t_out = {}
for house in houses:
    data_temp = house_datasets2[house]
    extreme_t_out[house] = data_temp[data_temp['t_out'] > 35.0]
    extreme_t_out[house] = data_temp[data_temp['t_out'] < -5.0]

extreme_t_out_houses = []
not_extreme_t_out_houses = []
for house in houses:
    if not extreme_t_out[house].empty:
        extreme_t_out_houses.append(house)
    else:
        not_extreme_t_out_houses.append(house)

print("extreme_t_out found for houses with id :")
print(extreme_t_out_houses)
print("houses without extreme_t_out :")
print(not_extreme_t_out_houses)

extreme_t_out found for houses with id :
[]
houses without extreme_t_out :
['home2', 'home9', 'home13', 'home14', 'home34', 'home46', 'home55', 'home67', 'home86', 'home93', 'home101', 'home106', 'home110', 'home43', 'home63', 'home53', 'home79', 'home90', 'home95', 'home5', 'home17', 'home47', 'home51', 'home65', 'home77', 'home89', 'home111', 'home114']


In [None]:
# see if there are houses with extreme values of absorption
extreme_absorption = {}

for house in houses:
    data_temp = house_datasets2[house]
    extreme_absorption[house] = data_temp[data_temp['absorption'] < 0.0]

extreme_absorption_houses = []
not_extreme_absorption_houses = []
for house in houses:
    if not extreme_absorption[house].empty:
        extreme_absorption_houses.append(house)
    else:
        not_extreme_absorption_houses.append(house)

print("extreme_absorption found for houses with id :")
print(extreme_absorption_houses)
print("houses without extreme_absorption :")
print(not_extreme_absorption_houses)

extreme_absorption found for houses with id :
['home2', 'home9', 'home13', 'home14', 'home34', 'home46', 'home55', 'home67', 'home86', 'home93', 'home101', 'home106', 'home110', 'home43', 'home63', 'home53', 'home79', 'home90', 'home95', 'home5', 'home17', 'home47', 'home65', 'home77', 'home89', 'home111', 'home114']
houses without extreme_absorption :
['home51']


In [None]:
# for every value of 'absorption' we only accept positives
# if 'absorption' < 0 then it becomes NaN
for house in houses:
  house_datasets2[house]['absorption'] = house_datasets2[house]['absorption'].apply(lambda x: x if x > 0 else np.nan)
# !! for a moment might be okay
# !! for many minutes (values) it is error
print(house_datasets2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_datasets2[house]['absorption'] = house_datasets2[house]['absorption'].apply(lambda x: x if x > 0 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_datasets2[house]['absorption'] = house_datasets2[house]['absorption'].apply(lambda x: x if x > 0 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

{'home2':                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0       2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1       2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2       2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3       2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4       2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                     ...            ...        ...   ...    ...    ...   
301212  2023-04-30 23:55:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301213  2023-04-30 23:56:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301214  2023-04-30 23:57:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301215  2023-04-30 23:58:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301216  2023-04-30 23:59:00  4.940656e-324  19.854808   0.0    0.0    0.0   

            t_out      t_ret   t_r  t_r_set  ...  month  day  hou

In [None]:
# the categories we will study
# first we will find anomalous values and normalize
anomalous_categories = ['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']

In [None]:
def calculate_z_score(data, category):
    # Calculate mean and standard deviation of 'outside_temperature'
    mean_temp = data[category].mean()
    std_temp = data[category].std()

    z_score_name = 'z_score_' + category
    # Calculate Z-score for each data point
    data[z_score_name] = (data[category] - mean_temp) / std_temp
    return


def find_outliers_z_score(data, category):
    calculate_z_score(data, category)

    # threshold for Z-score (usually 3 standard deviations from the mean)
    z_threshold = 3
    if category=='blr_mod_lvl' :
        z_threshold = 8
    if category=='absorption' :
        z_threshold = 7
    if category=='insulation':
        z_threshold = 4
    if category=='t_out':
        z_threshold = 3
    if category=='t_out':
        z_threshold = 4

    z_score_name = 'z_score_' + category
    category_error = category + "_z_score_error"
    # Identify outliers based on Z-score threshold
    # Perform boolean indexing to flag anomalies based on the threshold
    data[category_error] = (data[z_score_name].abs() > z_threshold).astype(int)

    category_outliers = category + "_z_score_outliers"
    data[category_outliers] = data[category] * data[category_error]
    return

In [None]:
for house in houses:
    for category in anomalous_categories:
        find_outliers_z_score(house_datasets2[house], category)

print(house_datasets2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[z_score_name] = (data[category] - mean_temp) / std_temp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[category_error] = (data[z_score_name].abs() > z_threshold).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[category_outliers] = data[category] * data[category_error]
A va

{'home2':                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0       2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1       2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2       2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3       2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4       2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                     ...            ...        ...   ...    ...    ...   
301212  2023-04-30 23:55:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301213  2023-04-30 23:56:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301214  2023-04-30 23:57:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301215  2023-04-30 23:58:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301216  2023-04-30 23:59:00  4.940656e-324  19.854808   0.0    0.0    0.0   

            t_out      t_ret   t_r  t_r_set  ...  absorption_z_sc

In [None]:
# plot two (or more) categories from a list for each house
def plot_houses_categories_list(dataset, categories):
    num_subplots = len(houses)

    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width

    # Create the figure and axes
    fig, axes = plt.subplots(1, num_subplots, figsize=(total_width, subplot_height))

    # Iterate over categories and create subplots
    i=0
    for house in houses:
        # Get the data for the current category from each sub-dataset
        categories_data = {}
        for category in categories:
            categories_data[category] = dataset[house][category]

        # Plot the data on the corresponding subplot
        ax = axes[i]
        for category in categories:
            ax.plot(categories_data[category], linestyle='-', label=category)
        ax.set_title(house)
        ax.legend()
        #ax.grid(True)
        i=i+1

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Plot categories for houses

#for category in anomalous_categories:
#    plot_houses_category(house_datasets2, category)

In [None]:
# Plot categories and z_score outliers for houses

#for category in anomalous_categories:
#    category_outliers = category + "_z_score_outliers"
#    categories = [category, category_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
def use_isolation_forest(data, category):
    # Data needs reshaping for Isolation Forest input
    category_data = data[category].values.reshape(-1, 1)

    # Contamination value
    contamination = 0.01

    # Create an Isolation Forest instance
    isolation_forest = IsolationForest(contamination=contamination)

    # Fit the Isolation Forest model to the data
    isolation_forest.fit(category_data)

    # Predict outliers using Isolation Forest
    outlier_preds = isolation_forest.predict(category_data)

    # Convert outlier predictions to binary category
    # Map the predicted labels from -1 (outliers) to 1 (error) and all other labels to 0 (non-error).
    category_error = 'isolation_forest_error_' + category
    data[category_error] = np.where(outlier_preds == -1, 1, 0)
    category_outliers = category + "_isolation_forest_outliers"
    data[category_outliers] = data[category] * data[category_error]
    return

In [None]:
# !! we will not use isolation forest for now because it requires that there are no NaN values !!

#for house in houses:
#    for category in anomalous_categories:
#        use_isolation_forest(house_datasets2[house], category)

#print(house_datasets2)

In [None]:
# Plot categories and Isolation Forest outliers for houses

#for category in anomalous_categories:
#    category_outliers = category + "_isolation_forest_outliers"
#    categories = [category, category_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Plot categories and outliers of all methods for houses

#for category in anomalous_categories:
#    category_isolation_forest_outliers = category + "_isolation_forest_outliers"
#    category_z_score_outliers = category + "_z_score_outliers"
#    categories = [category, category_isolation_forest_outliers, category_z_score_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Combine errors from multiple methods into one
def get_final_errors(data, category):
    z_score_error = category + "_z_score_error"
    isolation_forest_error = 'isolation_forest_error_' + category
    combined_error = 'combined_error_' + category

    # Combine errors using logical operations
    #data[combined_error] = (data[z_score_error] & data[isolation_forest_error]).astype(int)
    #if category == 'blr_mod_lvl' :
    #    data[combined_error] = 0
    #if category == 'absorption' :
    #    data[combined_error] = (data[z_score_error] & data[isolation_forest_error]).astype(int)
    #if category == 'insulation' :
    #    data[combined_error] = (data[z_score_error] & data[isolation_forest_error]).astype(int)
    #if category == 't_r_set' :
    #    data[combined_error] = (data[z_score_error]).astype(int)
    #if category == 't_out' :
    #    data[combined_error] = (data[z_score_error] & data[isolation_forest_error]).astype(int)

    # !! for now we do not use isolation forest so there is only z-score error
    data[combined_error] = (data[z_score_error]).astype(int)

    combined_outliers = category + "_combined_outliers"
    data[combined_outliers] = data[category] * data[combined_error]

    # if combined_error == 1 then category = NaN
    # where keeps original value for combined_error == 0 (no error), else replaces with NaN
    data[category] = data[category].where(data[combined_error] == 0, np.nan)
    return

In [None]:
for house in houses:
    for category in anomalous_categories:
        get_final_errors(house_datasets2[house], category)

print(house_datasets2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[combined_error] = (data[z_score_error]).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[combined_outliers] = data[category] * data[combined_error]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[category] = data[category].where(data[combined_error] == 0, np.nan)
A value is 

{'home2':                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0       2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1       2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2       2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3       2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4       2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                     ...            ...        ...   ...    ...    ...   
301212  2023-04-30 23:55:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301213  2023-04-30 23:56:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301214  2023-04-30 23:57:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301215  2023-04-30 23:58:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301216  2023-04-30 23:59:00  4.940656e-324  19.854808   0.0    0.0    0.0   

            t_out      t_ret   t_r  t_r_set  ...  combined_error_

In [None]:
# Plot categories and combined outliers for houses

#for category in anomalous_categories:
#    combined_outliers = category + "_combined_outliers"
#    # if we want categories (with their NaN values) and combined outliers
#    categories = [category, combined_outliers]
#    # if we do not care about combined outliers, only for categories with their NaN values
#    categories = [category]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# simple moving average
# does not perform adequately for normalizing error values
'''
def normalize_data(data, category):
    normalized_category = 'normalized_' + category
    combined_error = 'combined_error_' + category

    # Define window size for moving average
    window_size = 5

    data[normalized_category] = 0
    # Replace error values with the moving average of neighboring values
    data[normalized_category] = data[category].where((~data[combined_error]).astype(bool), data[category].rolling(window=window_size, min_periods=1, center=True).mean())

    return
'''

"\ndef normalize_data(data, category):\n    normalized_category = 'normalized_' + category\n    combined_error = 'combined_error_' + category\n\n    # Define window size for moving average\n    window_size = 5\n\n    data[normalized_category] = 0\n    # Replace error values with the moving average of neighboring values\n    data[normalized_category] = data[category].where((~data[combined_error]).astype(bool), data[category].rolling(window=window_size, min_periods=1, center=True).mean())\n\n    return\n"

In [None]:
# Normalize with the average of the non-error previous and next values
def moving_average_imputation(data, category):
    normalized_category = 'normalized_' + category
    combined_error = 'combined_error_' + category

    # Initialize a copy of the category data for modification
    category_data = data[category].copy()

    # Forward fill, fills NaN values with previous non-NaN
    ffill = category_data.ffill()

    # Backward fill, fills NaN values with next non-NaN
    bfill = category_data.bfill()

    # Calculate the mean of forward fill and backward fill where original values were NaN
    filled_category = category_data.combine_first((ffill + bfill) / 2)

    data[normalized_category] = filled_category
    return

In [None]:
for house in houses:
    for category in anomalous_categories:
        moving_average_imputation(house_datasets2[house], category)

print(house_datasets2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[normalized_category] = filled_category
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[normalized_category] = filled_category
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[normalized_category] = filled_category
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

{'home2':                        time    blr_mod_lvl      blr_t  heat  flame  water  \
0       2022-10-01 00:00:00   0.000000e+00  28.398440   0.0    0.0    0.0   
1       2022-10-01 00:01:00   0.000000e+00  28.398440   0.0    0.0    0.0   
2       2022-10-01 00:02:00   0.000000e+00  28.413088   0.0    0.0    0.0   
3       2022-10-01 00:03:00   0.000000e+00  28.479815   0.0    0.0    0.0   
4       2022-10-01 00:04:00   0.000000e+00  28.496090   0.0    0.0    0.0   
...                     ...            ...        ...   ...    ...    ...   
301212  2023-04-30 23:55:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301213  2023-04-30 23:56:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301214  2023-04-30 23:57:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301215  2023-04-30 23:58:00  4.940656e-324  19.894530   0.0    0.0    0.0   
301216  2023-04-30 23:59:00  4.940656e-324  19.854808   0.0    0.0    0.0   

            t_out      t_ret   t_r  t_r_set  ...  \
0       22.34

In [None]:
print(anomalous_categories)

normalized_categories = []
for category in anomalous_categories:
    normalized_categories.append('normalized_' + category)

print(normalized_categories)


['blr_mod_lvl', 'absorption', 'insulation', 't_r_set', 't_out']
['normalized_blr_mod_lvl', 'normalized_absorption', 'normalized_insulation', 'normalized_t_r_set', 'normalized_t_out']


In [None]:
# plot normalized data

#for category in normalized_categories:
#    plot_houses_category(house_datasets2, category)

In [None]:
# comparatively plot original data, error values and normalized data

#for category in anomalous_categories:
#    combined_outliers = category + "_combined_outliers"
#    normalized_category = 'normalized_' + category
#    categories = [category, combined_outliers, normalized_category]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Concatenate all sub-datasets into one dataset
normalized_dataset = pd.concat(house_datasets2.values(), ignore_index=True)

final_categories = ['time']
for category in normalized_categories:
    final_categories.append(category)
final_categories.append('house_id')
print(final_categories)

# Select the desired columns from the combined dataset using the list of category names
final_dataset = normalized_dataset[final_categories]

# Dictionary mapping old category names to new category names
rename_mapping = {
    'normalized_blr_mod_lvl': 'blr_mod_lvl',
    'normalized_absorption': 'absorption',
    'normalized_insulation': 'insulation',
    'normalized_t_r_set': 't_r_set',
    'normalized_t_out': 't_out'
}

# Rename columns in the final dataset
final_dataset.rename(columns=rename_mapping, inplace=True)

# Print the final dataset
print(final_dataset)

['time', 'normalized_blr_mod_lvl', 'normalized_absorption', 'normalized_insulation', 'normalized_t_r_set', 'normalized_t_out', 'house_id']
                        time    blr_mod_lvl  absorption  insulation  t_r_set  \
0        2022-10-01 00:00:00   0.000000e+00    0.503910    7.457292     15.0   
1        2022-10-01 00:01:00   0.000000e+00    0.503910    7.455208     15.0   
2        2022-10-01 00:02:00   0.000000e+00    0.518558    7.487500     15.0   
3        2022-10-01 00:03:00   0.000000e+00    0.616207    7.426042     15.0   
4        2022-10-01 00:04:00   0.000000e+00    0.699210    7.425000     15.0   
...                      ...            ...         ...         ...      ...   
7331983  2023-04-30 23:55:00  2.871866e-119         NaN    1.604167     17.0   
7331984  2023-04-30 23:56:00  1.914578e-119         NaN    1.614583     17.0   
7331985  2023-04-30 23:57:00  1.276385e-119         NaN    1.572917     17.0   
7331986  2023-04-30 23:58:00  8.509234e-120         NaN    1.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset.rename(columns=rename_mapping, inplace=True)


In [None]:
# The path for the output CSV file
#output_csv_path = os.path.join(parent_dir, 'normalized_df.csv')
output_csv_path = '/content/drive/MyDrive/boilers_drive/normalized_df.csv'

# Export final dataset to a CSV file
final_dataset.to_csv(output_csv_path, index=False)