In [1]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

In [2]:
# Input location: the merged CSV is looked up one directory above the
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir)
)
file_path = os.path.join(parent_dir, 'merged_df_30.csv')

# Load the whole CSV into a DataFrame
data = pd.read_csv(file_path)

In [None]:
# Header line followed by the frame's text representation
print("Imported data:", data, sep="\n")

In [None]:
# functions for plotting data
def plot_one(category, category_name):
    """Plot a single series and label the figure with its name.

    Parameters
    ----------
    category : array-like
        Values passed straight to ``plt.plot``.
    category_name : str
        Used in the title ("Plot of <name>") and as the y-axis label.
    """
    plt.plot(category)
    plt.title("Plot of " + category_name)
    plt.ylabel(category_name)
    # show must be *called*; a bare `plt.show` only references the function
    plt.show()

def plot_dataset(dataset, column_names):
    """Draw one figure per requested column of ``dataset``.

    Parameters
    ----------
    dataset : indexable by column name
        Each ``dataset[name]`` is handed to ``plt.plot``.
    column_names : iterable of str
        Columns to plot, in order; each gets its own figure.
    """
    for name in column_names:
        plt.plot(dataset[name])
        heading = "Plot of "+ name
        plt.title(heading)
        plt.ylabel(name)
        plt.show()

In [None]:
# Every column of the imported frame
column_names = data.columns.tolist()
print("List of column names:", column_names, sep="\n")

# Columns used for correlation: everything except these three
corr_columns = list(column_names)
for dropped in ('time', 'house_id', 'nodata'):
    corr_columns.remove(dropped)
print("List of column names for correlation:", corr_columns, sep="\n")

# Columns used for plotting: the correlation columns minus the calendar fields
plot_columns = list(corr_columns)
for dropped in ("month", "day", "hour"):
    plot_columns.remove(dropped)
print("List of column names for plotting:", plot_columns, sep="\n")

In [None]:
# Keep only the rows whose 'nodata' column differs from this value
nodata_value = 0.0

has_nodata_flag = data['nodata'] != nodata_value
filtered_nodata = data[has_nodata_flag]

print(filtered_nodata)

In [None]:
# Distinct house ids present in the data, in order of first appearance
houses = data['house_id'].unique()

print("Different houses in data:", houses, sep="\n")
print("Number of different houses:", len(houses), sep="\n")

In [None]:
# for plotting a category for all houses
def plot_houses_category(dataset, category):
    """Plot one category for every house, one subplot per house in a single row.

    Parameters
    ----------
    dataset : mapping
        Indexed by house id; ``dataset[house][category]`` is the series drawn
        on that house's subplot.
    category : str
        Name of the category to plot; also used as the legend label.

    Notes
    -----
    The list of houses is read from the module-level ``houses`` variable.
    """
    num_subplots = len(houses)

    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width

    # squeeze=False keeps `axes` a 2-D array of shape (1, num_subplots) even
    # when there is a single house; with the default, one subplot comes back
    # as a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        1, num_subplots,
        figsize=(total_width, subplot_height),
        squeeze=False,
    )

    # One subplot per house, left to right
    for ax, house in zip(axes[0], houses):
        ax.plot(dataset[house][category], label=category)
        ax.set_title(house)
        ax.legend()
        #ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()

Data Preprocessing

In [None]:
# Work on an independent copy: a plain `data2 = data` would only alias the
# imported frame, so every column added below would also appear in `data`.
data2 = data.copy()

# the categories with which we will study the data

data2['absorption'] = data2['blr_t'] - data2['t_ret']
data2['insulation'] = data2['t_out'] - data2['t_r']

# error-flag columns, all initialised to 0
data2['blr_mod_lvl_error'] = 0
data2['absorption_error'] = 0
data2['insulation_error'] = 0
data2['t_r_set_error'] = 0
data2['t_out_error'] = 0

print(data2)

In [None]:
# One DataFrame per house id. Each selection is copied explicitly because new
# columns are assigned to these frames later on; assigning to a bare
# boolean-mask selection makes pandas emit SettingWithCopyWarning.
house_datasets2 = {}
for house in houses:
    house_datasets2[house] = data2[data2['house_id'] == house].copy()

# Example
#print("House-dataset for house_id 'home34':")
#print(house_datasets2["home34"])

In [None]:
# see if there are houses with extreme values of t_out
# (above 35.0 or below -5.0)
extreme_t_out = {}
for house in houses:
    data_temp = house_datasets2[house]
    # Both bounds must go into a single mask: assigning the two selections one
    # after the other would keep only the second one.
    too_high = data_temp['t_out'] > 35.0
    too_low = data_temp['t_out'] < -5.0
    extreme_t_out[house] = data_temp[too_high | too_low]

# split the house ids by whether any extreme row was found
extreme_t_out_houses = []
not_extreme_t_out_houses = []
for house in houses:
    if not extreme_t_out[house].empty:
        extreme_t_out_houses.append(house)
    else:
        not_extreme_t_out_houses.append(house)

print("extreme_t_out found for houses with id :")
print(extreme_t_out_houses)
print("houses without extreme_t_out :")
print(not_extreme_t_out_houses)

In [None]:
# Per house, collect the rows whose absorption is below -20.0
extreme_absorption = {}
for house in houses:
    data_temp = house_datasets2[house]
    below_limit = data_temp['absorption'] < -20.0
    extreme_absorption[house] = data_temp[below_limit]

# Sort the house ids into "has such rows" / "has none"
extreme_absorption_houses = []
not_extreme_absorption_houses = []
for house in houses:
    if extreme_absorption[house].empty:
        not_extreme_absorption_houses.append(house)
    else:
        extreme_absorption_houses.append(house)

print("extreme_absorption found for houses with id :", extreme_absorption_houses, sep="\n")
print("houses without extreme_absorption :", not_extreme_absorption_houses, sep="\n")


In [None]:
# Categories examined in the rest of the notebook: anomalous values are
# detected in each of them first, then normalized.
anomalous_categories = [
    'blr_mod_lvl',
    'absorption',
    'insulation',
    't_r_set',
    't_out',
]

In [None]:
def calculate_z_score(data, category):
    """Add a ``'z_score_<category>'`` column to ``data`` in place.

    The z-score of each row is ``(value - mean) / std``, where mean and
    standard deviation are taken over the whole ``data[category]`` column
    using the pandas defaults. Nothing is returned.
    """
    column = data[category]
    centre = column.mean()
    spread = column.std()

    # Distance of every value from the column mean, in standard deviations
    data['z_score_' + category] = (column - centre) / spread


def find_outliers_z_score(data, category, z_threshold=None):
    """Flag rows of ``data[category]`` whose absolute z-score exceeds a threshold.

    Adds three columns to ``data`` in place:

    * ``'z_score_<category>'`` (written by ``calculate_z_score``),
    * ``'<category>_z_score_error'``: 1 where ``|z| > threshold``, else 0,
    * ``'<category>_z_score_outliers'``: the original value where flagged,
      0 elsewhere (value multiplied by the flag).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate; modified in place.
    category : str
        Column to analyse.
    z_threshold : number, optional
        Explicit threshold. When omitted, a per-category value is used, with 3
        (three standard deviations from the mean) for any category not listed.
    """
    calculate_z_score(data, category)

    if z_threshold is None:
        # Per-category thresholds. 't_r_set' is listed explicitly with the
        # default value; NOTE(review): the earlier if-chain tested 't_out'
        # twice (3, then 4), and the first test was presumably meant for
        # 't_r_set' - confirm. Effective values are unchanged either way.
        category_thresholds = {
            'blr_mod_lvl': 8,
            'absorption': 7,
            'insulation': 4,
            't_r_set': 3,
            't_out': 4,
        }
        z_threshold = category_thresholds.get(category, 3)

    z_score_name = 'z_score_' + category
    category_error = category + "_z_score_error"
    # Boolean comparison against the threshold, stored as 0/1 integers
    data[category_error] = (data[z_score_name].abs() > z_threshold).astype(int)

    category_outliers = category + "_z_score_outliers"
    data[category_outliers] = data[category] * data[category_error]
    return

In [None]:
# Flag z-score outliers for every studied category, one house frame at a time
for house in houses:
    house_frame = house_datasets2[house]
    for category in anomalous_categories:
        find_outliers_z_score(house_frame, category)

print(house_datasets2)

In [None]:
# plot two (or more) categories from a list for each house
def plot_houses_categories_list(dataset, categories):
    """Overlay several categories per house, one subplot per house in a row.

    Parameters
    ----------
    dataset : mapping
        Indexed by house id; ``dataset[house][category]`` is the series drawn
        for each requested category.
    categories : iterable of str
        Categories drawn on every subplot, each labelled with its own name.

    Notes
    -----
    The list of houses is read from the module-level ``houses`` variable.
    """
    num_subplots = len(houses)

    subplot_width = 10
    subplot_height = 10
    total_width = num_subplots * subplot_width

    # squeeze=False keeps `axes` a 2-D array of shape (1, num_subplots) even
    # when there is a single house; with the default, one subplot comes back
    # as a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        1, num_subplots,
        figsize=(total_width, subplot_height),
        squeeze=False,
    )

    # One subplot per house, left to right
    for ax, house in zip(axes[0], houses):
        house_data = dataset[house]
        for category in categories:
            ax.plot(house_data[category], label=category)
        ax.set_title(house)
        ax.legend()
        #ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Plot categories for houses

#for category in anomalous_categories:
#    plot_houses_category(house_datasets2, category)

In [None]:
# Plot categories and z_score outliers for houses

#for category in anomalous_categories:
#    category_outliers = category + "_z_score_outliers"
#    categories = [category, category_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
def use_isolation_forest(data, category, contamination=0.01, random_state=42):
    """Flag outliers of ``data[category]`` with an Isolation Forest.

    Adds two columns to ``data`` in place:

    * ``'isolation_forest_error_<category>'``: 1 where the model predicts an
      outlier (label -1), else 0,
    * ``'<category>_isolation_forest_outliers'``: the original value where
      flagged, 0 elsewhere (value multiplied by the flag).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate; modified in place.
    category : str
        Column to analyse; it is the model's only feature.
    contamination : float, optional
        Passed to ``IsolationForest`` as ``contamination``. Defaults to 0.01.
    random_state : int or None, optional
        Seed passed to ``IsolationForest``. A fixed default makes the flagged
        rows reproducible across runs; pass ``None`` for unseeded behaviour.
    """
    # The estimator expects a 2-D array: one row per sample, a single feature
    category_data = data[category].values.reshape(-1, 1)

    isolation_forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )

    # Fit on the column, then label the same points
    isolation_forest.fit(category_data)
    outlier_preds = isolation_forest.predict(category_data)

    # Map predicted label -1 (outlier) to 1 (error) and everything else to 0
    category_error = 'isolation_forest_error_' + category
    data[category_error] = np.where(outlier_preds == -1, 1, 0)
    category_outliers = category + "_isolation_forest_outliers"
    data[category_outliers] = data[category] * data[category_error]
    return

In [None]:
# Run the Isolation Forest detector for every studied category of every house
for house in houses:
    house_frame = house_datasets2[house]
    for category in anomalous_categories:
        use_isolation_forest(house_frame, category)

print(house_datasets2)

In [None]:
# Plot categories and Isolation Forest outliers for houses

#for category in anomalous_categories:
#    category_outliers = category + "_isolation_forest_outliers"
#    categories = [category, category_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Plot categories and outliers of all methods for houses

#for category in anomalous_categories:
#    category_isolation_forest_outliers = category + "_isolation_forest_outliers"
#    category_z_score_outliers = category + "_z_score_outliers"
#    categories = [category, category_isolation_forest_outliers, category_z_score_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Merge the per-method error flags into a single flag per category
def get_final_errors(data, category):
    """Write ``'combined_error_<category>'`` and ``'<category>_combined_outliers'``.

    Combination rule per category:

    * ``'blr_mod_lvl'``: never flagged (column of 0),
    * ``'t_r_set'``: the z-score flag alone,
    * anything else: flagged only where both the z-score flag and the
      Isolation Forest flag are set.

    The outlier column holds the original value where the combined flag is 1
    and 0 elsewhere. ``data`` is modified in place; nothing is returned.
    """
    z_flags = data[category + "_z_score_error"]
    forest_flags = data['isolation_forest_error_' + category]
    both_methods = (z_flags & forest_flags).astype(int)

    flag_column = 'combined_error_' + category
    if category == 'blr_mod_lvl':
        data[flag_column] = 0
    elif category == 't_r_set':
        data[flag_column] = z_flags.astype(int)
    else:
        data[flag_column] = both_methods

    data[category + "_combined_outliers"] = data[category] * data[flag_column]

In [None]:
# Combine the detectors' flags for every studied category of every house
for house in houses:
    house_frame = house_datasets2[house]
    for category in anomalous_categories:
        get_final_errors(house_frame, category)

print(house_datasets2)

In [None]:
# Plot categories and combined outliers for houses

#for category in anomalous_categories:
#    combined_outliers = category + "_combined_outliers"
#    categories = [category, combined_outliers]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Simple (centered) moving-average variant of the normalization step.
# It does not perform adequately for normalizing error values, so it is kept
# only for reference: the definition sits inside a string literal and is
# therefore never defined or called.
# NOTE(review): `~` applied to an integer 0/1 flag column is a bitwise NOT
# (0 -> -1, 1 -> -2) and both results become True under astype(bool); if the
# flags are integers, `where` would keep every original value - confirm
# before reviving this code.
'''
def normalize_data(data, category):
    normalized_category = 'normalized_' + category
    combined_error = 'combined_error_' + category

    # Define window size for moving average
    window_size = 5

    data[normalized_category] = 0
    # Replace error values with the moving average of neighboring values
    data[normalized_category] = data[category].where((~data[combined_error]).astype(bool), data[category].rolling(window=window_size, min_periods=1, center=True).mean())

    return
'''

In [None]:
# Normalize with the average of the non-error previous and next values
def moving_average_imputation(data, category, half_window=5, min_neighbors=5):
    """Replace flagged values with the mean of nearby non-flagged values.

    Writes the result to ``'normalized_<category>'`` in ``data`` (in place);
    the original ``data[category]`` column is left untouched.

    For every row whose ``'combined_error_<category>'`` equals 1, the window
    of positions ``[i - half_window, i + half_window]`` (clipped to the frame)
    is inspected. Rows in that window whose flag equals 0 are the neighbours.
    If there are at least ``min_neighbors`` of them in total (counted over the
    whole window, not per side), the flagged value is replaced by their mean;
    otherwise it is kept as is.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``category`` and ``'combined_error_<category>'``.
    category : str
        Column to normalize.
    half_window : int, optional
        Number of positions inspected on each side of a flagged row.
    min_neighbors : int, optional
        Minimum number of non-flagged rows in the window required to impute.
    """
    normalized_category = 'normalized_' + category
    combined_error = 'combined_error_' + category

    # Work on a copy so the source column keeps its original values
    category_data = data[category].copy()

    # Positional masks, computed once instead of one .iloc lookup per row
    is_error = (data[combined_error] == 1).to_numpy()
    is_clean = (data[combined_error] == 0).to_numpy()
    n_rows = len(category_data)

    # Only flagged positions need any work
    for i in np.flatnonzero(is_error).tolist():
        start = max(0, i - half_window)
        stop = min(n_rows, i + half_window + 1)
        neighbors_indices = start + np.flatnonzero(is_clean[start:stop])

        if len(neighbors_indices) >= min_neighbors:
            # Neighbours are never flagged themselves, so none of them has
            # been overwritten by an earlier iteration
            category_data.iloc[i] = category_data.iloc[neighbors_indices].mean()

    data[normalized_category] = category_data
    return

In [None]:
# Impute the flagged values for every studied category of every house
for house in houses:
    house_frame = house_datasets2[house]
    for category in anomalous_categories:
        moving_average_imputation(house_frame, category)

print(house_datasets2)

In [None]:
print(anomalous_categories)

# Name of the normalized column that corresponds to each studied category
normalized_categories = ['normalized_' + category for category in anomalous_categories]

print(normalized_categories)


In [None]:
# plot normalized data

#for category in normalized_categories:
#    plot_houses_category(house_datasets2, category)

In [None]:
# comparatively plot original data, error values and normalized data

#for category in anomalous_categories:
#    combined_outliers = category + "_combined_outliers"
#    normalized_category = 'normalized_' + category
#    categories = [category, combined_outliers, normalized_category]
#    plot_houses_categories_list(house_datasets2, categories)

In [None]:
# Concatenate all sub-datasets into one dataset
normalized_dataset = pd.concat(list(house_datasets2.values()), ignore_index=True)

# Output columns: timestamp, the normalized categories, then the house id
final_categories = ['time'] + list(normalized_categories) + ['house_id']
print(final_categories)

# Dictionary mapping old category names to new category names
rename_mapping = {
    'normalized_blr_mod_lvl': 'blr_mod_lvl',
    'normalized_absorption': 'absorption',
    'normalized_insulation': 'insulation',
    'normalized_t_r_set': 't_r_set',
    'normalized_t_out': 't_out'
}

# Select the desired columns and rename them in one chained expression.
# `.rename` returns a new frame, which avoids renaming a column selection in
# place (that pattern makes pandas emit SettingWithCopyWarning).
final_dataset = normalized_dataset[final_categories].rename(columns=rename_mapping)

# Print the final dataset
print(final_dataset)

In [None]:
# Destination: 'normalized_df.csv' in the same parent directory the input was read from
output_csv_path = os.path.join(parent_dir, 'normalized_df.csv')

# Write the final dataset without the row index
final_dataset.to_csv(path_or_buf=output_csv_path, index=False)