In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pyarrow as pa
import pyarrow.parquet as pq



In [2]:
def read_home_data_auto_discover(base_path='UK-2-min_data_v1_1254mb/clean/', 
                                 start_id=1):
    """
    Reads home data from CSV files, automatically discovering files
    by incrementing property IDs until no more files are found.

    Args:
        base_path (str): The base directory where the clean data CSVs are located.
                         Defaults to 'UK-2-min_data/clean/'.
        start_id (int): The starting property ID number (e.g., 1 for EOH0001).

    Returns:
        dict: A dictionary where keys are property IDs (e.g., 'EOH0001')
              and values are pandas DataFrames containing the home data.
              Returns an empty dictionary if no data is found or read.
    """
    home_data_dfs = {}
    current_id = start_id
    files_found_in_a_row = 0 # To break the loop if a gap in IDs occurs
    MAX_CONSECUTIVE_MISSING = 100 # Define how many consecutive missing files
    # to tolerate

    # Define the expected column names
    expected_columns = [
        'Timestamp', 'Boiler_Energy_Output','Circulation_Pump_Energy_Consumed',
        'External_Air_Temperature', 'Heat_Pump_Energy_Output',
        'Heat_Pump_Heating_Flow_Temperature', 'Heat_Pump_Return_Temperature',
        'Hot_Water_Flow_Temperature', 'Immersion_Heater_Energy_Consumed',
        'Internal_Air_Temperature', 'Whole_System_Energy_Consumed'
    ]

    while True: # Loop indefinitely until we explicitly break
        # Format the property ID with leading zeros (e.g., 1 -> 0001, 10 -> 0010)
        property_id = f'EOH{current_id:04d}'
        file_name = f'Property_ID={property_id}.csv'
        file_path = os.path.join(base_path, file_name)

        try:
            df = pd.read_csv(file_path)

            # Reset consecutive missing counter if a file is found
            files_found_in_a_row = 0

            # Optional: Check if the columns match the expected ones
            if not all(col in df.columns for col in expected_columns):
                print(f"Warning: Columns in {file_name} do not fully match expected columns. Skipping this file for now.")

            home_data_dfs[property_id] = df
            print(f"Successfully loaded data for {property_id}")

        except FileNotFoundError:
            print(f"File not found for {property_id}. Checking for more files...")
            files_found_in_a_row += 1
            if files_found_in_a_row >= MAX_CONSECUTIVE_MISSING:
                print(f"Reached {MAX_CONSECUTIVE_MISSING} consecutive missing files. Assuming no more files exist. Stopping.")
                break # Exit the loop
        except pd.errors.EmptyDataError:
            print(f"File {file_name} is empty. Skipping...")
            files_found_in_a_row = 0 # An empty file is still a "found" file, reset counter
        except Exception as e:
            print(f"An error occurred while reading {file_name}: {e}. Skipping...")
            files_found_in_a_row = 0 # An error during read is also a "found" file, reset counter

        current_id += 1 # Increment ID for the next iteration

    return home_data_dfs

In [3]:
df_raw_1 = pd.read_csv("UK-2-min_data_v1_1254mb/clean/Property_ID=EOH0001.csv")
df_raw_2 = pd.read_csv("UK-2-min_data_v2_1263mb/Property_ID=EOH2403.csv")
df_raw_3 = pd.read_csv("UK-2-min_data_v3_1227mb/Property_ID=EOH1675.csv")
df_raw_4 = pd.read_csv("UK-2-min_data_v4_1245mb/Property_ID=EOH0737.csv")



In [4]:
df_raw = pd.read_csv('training_data/home_characteristics.csv')

# List of columns to check for missingness
cols = [
    "Bedrooms", "Floor_Height", "Habitable_Rooms", "House_Age",
    "House_Form", "No_Storeys", "No_Underfloor",
    "Total_Floor_Area", "Wall_Type"
]

df_detached = df_raw[df_raw["House_Form"] == 'Detached']

In [5]:
print(df_raw_1.columns)
print(df_raw_2.columns)
print(df_raw_3.columns)
print(df_raw_4.columns)

Index(['Timestamp', 'Circulation_Pump_Energy_Consumed',
       'External_Air_Temperature', 'Heat_Pump_Energy_Output',
       'Heat_Pump_Heating_Flow_Temperature', 'Heat_Pump_Return_Temperature',
       'Hot_Water_Flow_Temperature', 'Immersion_Heater_Energy_Consumed',
       'Internal_Air_Temperature', 'Whole_System_Energy_Consumed'],
      dtype='object')
Index(['Timestamp', 'Boiler_Energy_Output', 'Circulation_Pump_Energy_Consumed',
       'External_Air_Temperature', 'Heat_Pump_Energy_Output',
       'Heat_Pump_Heating_Flow_Temperature', 'Heat_Pump_Return_Temperature',
       'Internal_Air_Temperature', 'Whole_System_Energy_Consumed'],
      dtype='object')
Index(['Timestamp', 'Circulation_Pump_Energy_Consumed',
       'External_Air_Temperature', 'Heat_Pump_Energy_Output',
       'Heat_Pump_Heating_Flow_Temperature', 'Heat_Pump_Return_Temperature',
       'Hot_Water_Flow_Temperature', 'Internal_Air_Temperature',
       'Whole_System_Energy_Consumed'],
      dtype='object')
Index(['Tim

In [6]:
df_homes_1 = read_home_data_auto_discover()

Successfully loaded data for EOH0001
File not found for EOH0002. Checking for more files...
Successfully loaded data for EOH0003
File not found for EOH0004. Checking for more files...
Successfully loaded data for EOH0005
File not found for EOH0006. Checking for more files...
File not found for EOH0007. Checking for more files...
File not found for EOH0008. Checking for more files...
File not found for EOH0009. Checking for more files...
File not found for EOH0010. Checking for more files...
File not found for EOH0011. Checking for more files...
File not found for EOH0012. Checking for more files...
File not found for EOH0013. Checking for more files...
Successfully loaded data for EOH0014
File not found for EOH0015. Checking for more files...
File not found for EOH0016. Checking for more files...
File not found for EOH0017. Checking for more files...
Successfully loaded data for EOH0018
File not found for EOH0019. Checking for more files...
File not found for EOH0020. Checking for more

In [7]:
df_homes_2 = read_home_data_auto_discover("UK-2-min_data_v2_1263mb",2403)

Successfully loaded data for EOH2403
File not found for EOH2404. Checking for more files...
Successfully loaded data for EOH2405
Successfully loaded data for EOH2406
File not found for EOH2407. Checking for more files...
File not found for EOH2408. Checking for more files...
Successfully loaded data for EOH2409
Successfully loaded data for EOH2410
File not found for EOH2411. Checking for more files...
Successfully loaded data for EOH2412
File not found for EOH2413. Checking for more files...
Successfully loaded data for EOH2414
File not found for EOH2415. Checking for more files...
Successfully loaded data for EOH2416
File not found for EOH2417. Checking for more files...
Successfully loaded data for EOH2418
File not found for EOH2419. Checking for more files...
File not found for EOH2420. Checking for more files...
Successfully loaded data for EOH2421
Successfully loaded data for EOH2422
File not found for EOH2423. Checking for more files...
File not found for EOH2424. Checking for mo

In [8]:
df_homes_3 = read_home_data_auto_discover("UK-2-min_data_v3_1227mb", 1675)

Successfully loaded data for EOH1675
File not found for EOH1676. Checking for more files...
File not found for EOH1677. Checking for more files...
File not found for EOH1678. Checking for more files...
File not found for EOH1679. Checking for more files...
File not found for EOH1680. Checking for more files...
File not found for EOH1681. Checking for more files...
File not found for EOH1682. Checking for more files...
File not found for EOH1683. Checking for more files...
Successfully loaded data for EOH1684
Successfully loaded data for EOH1685
Successfully loaded data for EOH1686
Successfully loaded data for EOH1687
Successfully loaded data for EOH1688
File not found for EOH1689. Checking for more files...
File not found for EOH1690. Checking for more files...
File not found for EOH1691. Checking for more files...
File not found for EOH1692. Checking for more files...
File not found for EOH1693. Checking for more files...
Successfully loaded data for EOH1694
File not found for EOH1695

In [9]:
df_homes_4 = read_home_data_auto_discover("UK-2-min_data_v4_1245mb", 737)

Successfully loaded data for EOH0737
File not found for EOH0738. Checking for more files...
File not found for EOH0739. Checking for more files...
File not found for EOH0740. Checking for more files...
File not found for EOH0741. Checking for more files...
File not found for EOH0742. Checking for more files...
File not found for EOH0743. Checking for more files...
File not found for EOH0744. Checking for more files...
Successfully loaded data for EOH0745
Successfully loaded data for EOH0746
File not found for EOH0747. Checking for more files...
File not found for EOH0748. Checking for more files...
Successfully loaded data for EOH0749
File not found for EOH0750. Checking for more files...
File not found for EOH0751. Checking for more files...
File not found for EOH0752. Checking for more files...
File not found for EOH0753. Checking for more files...
Successfully loaded data for EOH0754
File not found for EOH0755. Checking for more files...
Successfully loaded data for EOH0756
File not

In [10]:
df_homes = {}
df_homes.update(df_homes_1)
df_homes.update(df_homes_2)
df_homes.update(df_homes_3)
df_homes.update(df_homes_4)


In [None]:
# 1) ensure each df is indexed by datetime
for pid, df in df_homes.items():
    if df.empty:
        continue
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df['Timestamp'])  # or your timestamp column

# 2) concat with keys = property IDs to create a MultiIndex
big = pd.concat(
    df_homes.values(),
    keys=df_homes.keys(),
    names=["Property_ID", "Timestamp"]
)

# 3) write to Parquet
table = pa.Table.from_pandas(big, preserve_index=True)
pq.write_table(table, "home_time_series_2min.parquet")

  big = pd.concat(


In [11]:
data_columns_to_check_for_plot = [
    'Circulation_Pump_Energy_Consumed',
    'External_Air_Temperature', 'Heat_Pump_Energy_Output',
    'Heat_Pump_Heating_Flow_Temperature', 'Heat_Pump_Return_Temperature',
    'Hot_Water_Flow_Temperature', 'Immersion_Heater_Energy_Consumed',
    'Internal_Air_Temperature', 'Whole_System_Energy_Consumed'
]

In [17]:
'''
missing_list = {}

for property_id, df in df_homes.items():
    print(property_id)
    if df.empty:
        print(f"Skipping empty DataFrame for {property_id} during plot data "
        f"preparation.")
        continue
    print(df.isnull().sum())
    missing_list[property_id] = df.isnull().sum()
      
'''
   


'\nmissing_list = {}\n\nfor property_id, df in df_homes.items():\n    print(property_id)\n    if df.empty:\n        print(f"Skipping empty DataFrame for {property_id} during plot data "\n        f"preparation.")\n        continue\n    print(df.isnull().sum())\n    missing_list[property_id] = df.isnull().sum()\n\n'

In [18]:
'''
sorted_missing_list_items = sorted(
    missing_list.items(),
    key=lambda item: item[1].get('Internal_Air_Temperature', -1))

sorted_missing_list_items 
'''

"\nsorted_missing_list_items = sorted(\n    missing_list.items(),\n    key=lambda item: item[1].get('Internal_Air_Temperature', -1))\n\nsorted_missing_list_items \n"

In [12]:
# turn the detached IDs into a set for fast membership tests
detached_ids = set(df_detached['Property_ID'].values)

# build a new dict containing only the homes whose key is in detached_ids
df_detached_timeseries = {
    pid: ts
    for pid, ts in df_homes.items()
    if pid in detached_ids
}

df_detached_timeseries

{'EOH0005':                   Timestamp  Circulation_Pump_Energy_Consumed  \
 0       2021-05-21 12:22:00                             0.000   
 1       2021-05-21 12:24:00                             0.001   
 2       2021-05-21 12:26:00                             0.002   
 3       2021-05-21 12:28:00                             0.003   
 4       2021-05-21 12:30:00                             0.005   
 ...                     ...                               ...   
 618097  2023-09-28 23:50:00                           108.960   
 618098  2023-09-28 23:52:00                           108.960   
 618099  2023-09-28 23:54:00                           108.960   
 618100  2023-09-28 23:56:00                           108.960   
 618101  2023-09-28 23:58:00                               NaN   
 
         External_Air_Temperature  Heat_Pump_Energy_Output  \
 0                            NaN                      NaN   
 1                           8.68                      NaN   
 2       

In [14]:
# 1) ensure each df is indexed by datetime
for pid, df in df_detached_timeseries.items():
    if df.empty:
        continue
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df['Timestamp'])  # or your timestamp column

# 2) concat with keys = property IDs to create a MultiIndex
big_detached = pd.concat(
    df_detached_timeseries.values(),
    keys=df_detached_timeseries.keys(),
    names=["Property_ID", "Timestamp"]
)

# 3) write to Parquet
table = pa.Table.from_pandas(big_detached, preserve_index=True)
pq.write_table(table, "detached_home_data.parquet")

  big_detached = pd.concat(


In [20]:
# 0) Parse & set Timestamp as index *in the dict itself*
for pid, df in df_detached_timeseries.copy().items():
    if df.empty:
        continue
    print(df.index)
    # make a copy so we don’t modify original‐in‐place unintentionally
    df2 = df.copy()
    df2['Timestamp'] = pd.to_datetime(df2['Timestamp'])
    df2 = df2.set_index('Timestamp')
    df_detached_timeseries[pid] = df2

# 1) columns of interest
cols = [
    'External_Air_Temperature',
    'Heat_Pump_Energy_Output',
    'Heat_Pump_Heating_Flow_Temperature',
    'Heat_Pump_Return_Temperature',
    'Hot_Water_Flow_Temperature',
    'Internal_Air_Temperature',
    'Whole_System_Energy_Consumed'
]

# 2) compute missing‐counts & spans for each property
stats = []
for pid, df in df_detached_timeseries.items():
    if df.empty:
        continue
    # reindex columns (missing cols → all‐NaN)
    sub = df.reindex(columns=cols)
    total_missing = int(sub.isnull().sum().sum())
    span = df.index.max() - df.index.min()
    stats.append((pid, total_missing, span))

# sort by (fewest missing, then longest span)
stats_sorted = sorted(stats, key=lambda x: (x[1], -x[2].total_seconds()))
ordered_pids = [t[0] for t in stats_sorted]

# 3) global time‐axis (union of all property indices)
all_times = pd.Index([])
for pid in ordered_pids:
    all_times = all_times.union(df_detached_timeseries[pid].index)
all_times = all_times.sort_values()

# 4) build presence matrix
matrix = np.zeros((len(ordered_pids), len(all_times)), dtype=bool)
for i, pid in enumerate(ordered_pids):
    df = df_detached_timeseries[pid]
    sub = df.reindex(index=all_times, columns=cols)
    missing_mask = sub.isnull().any(axis=1)
    matrix[i, :] = ~missing_mask


RangeIndex(start=0, stop=618102, step=1)
RangeIndex(start=0, stop=728321, step=1)
RangeIndex(start=0, stop=554650, step=1)
RangeIndex(start=0, stop=629570, step=1)
RangeIndex(start=0, stop=584083, step=1)
RangeIndex(start=0, stop=253058, step=1)
RangeIndex(start=0, stop=627902, step=1)
RangeIndex(start=0, stop=674060, step=1)
RangeIndex(start=0, stop=548228, step=1)
RangeIndex(start=0, stop=620223, step=1)
RangeIndex(start=0, stop=674269, step=1)
RangeIndex(start=0, stop=647669, step=1)
RangeIndex(start=0, stop=440137, step=1)
RangeIndex(start=0, stop=261651, step=1)
RangeIndex(start=0, stop=661219, step=1)
RangeIndex(start=0, stop=633324, step=1)
RangeIndex(start=0, stop=694598, step=1)
RangeIndex(start=0, stop=590059, step=1)
RangeIndex(start=0, stop=133570, step=1)
RangeIndex(start=0, stop=583450, step=1)
RangeIndex(start=0, stop=620579, step=1)
RangeIndex(start=0, stop=559117, step=1)
RangeIndex(start=0, stop=615895, step=1)
RangeIndex(start=0, stop=261630, step=1)
RangeIndex(start

In [23]:
matrix
stop

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]],
      shape=(300, 763923))

In [None]:
# 5) plot: white = all data‐present; black = any missing
cmap = ListedColormap(['black', 'white'])
fig, ax = plt.subplots(figsize=(12, max(4, len(ordered_pids)*0.3)))
ax.imshow(matrix, aspect='auto', interpolation='none', cmap=cmap)

# y-axis = property IDs
ax.set_yticks(np.arange(len(ordered_pids)))
ax.set_yticklabels(ordered_pids)

# x-axis = up to 10 date ticks
n = len(all_times)
locs = np.linspace(0, n-1, min(10, n)).round().astype(int)
ax.set_xticks(locs)
ax.set_xticklabels(
    [all_times[i].strftime('%Y-%m-%d') for i in locs],
    rotation=45, ha='right'
)

ax.set_xlabel('Time')
ax.set_ylabel('Property ID')
plt.tight_layout()
plt.show()

In [16]:
# 1) create a black/white colormap
cmap = ListedColormap(['black', 'white'])

# 2) plot
fig, ax = plt.subplots(figsize=(4, max(4, len(ordered_pids)*0.3)))
ax.imshow(
    matrix,
    aspect='auto',
    interpolation='nearest',
    cmap=cmap
)

# 3) y-axis = property IDs
ax.set_yticks(np.arange(len(ordered_pids)))
ax.set_yticklabels(ordered_pids)

# 4) x-axis = time labels (up to 10)
n = len(all_times)
locs = np.linspace(0, n-1, min(10, n)).round().astype(int)
ax.set_xticks(locs)
ax.set_xticklabels(
    [all_times[i].strftime('%Y-%m-%d') for i in locs],
    rotation=45, ha='right'
)

# 5) turn off any grid or spines for that “clean heatmap” look
ax.grid(False)
for spine in ax.spines.values():
    spine.set_visible(False)

ax.set_xlabel('Time')
ax.set_ylabel('Property ID')
plt.tight_layout()
plt.show()

KeyboardInterrupt: 

In [25]:
missing_list_detach = {}

for property_id, df in df_detached_timeseries.items():
    print(property_id)
    if df.empty:
        print(f"Skipping empty DataFrame for {property_id} during plot data "
              f"preparation.")
        continue
    print(df.isnull().sum())
    missing_list_detach[property_id] = df.isnull().sum()
    
sorted_missing_detach = sorted(
    missing_list_detach.items(),
    key=lambda item: item[1].get('Internal_Air_Temperature', -1))







EOH0005
Timestamp                                  0
Circulation_Pump_Energy_Consumed        2232
External_Air_Temperature                6444
Heat_Pump_Energy_Output               123226
Heat_Pump_Heating_Flow_Temperature    159997
Heat_Pump_Return_Temperature          131813
Hot_Water_Flow_Temperature            581450
Internal_Air_Temperature               25790
Whole_System_Energy_Consumed            2232
dtype: int64
EOH0018
Timestamp                                  0
Boiler_Energy_Output                  141667
Circulation_Pump_Energy_Consumed       32188
External_Air_Temperature                3379
Heat_Pump_Energy_Output                32193
Heat_Pump_Heating_Flow_Temperature     48421
Heat_Pump_Return_Temperature           62357
Internal_Air_Temperature               38522
Whole_System_Energy_Consumed           32229
dtype: int64
EOH0021
Timestamp                                  0
Circulation_Pump_Energy_Consumed       25134
External_Air_Temperature                5946
Heat_

[('EOH0590',
  Timestamp                            0
  Circulation_Pump_Energy_Consumed    56
  Heat_Pump_Energy_Output             55
  Whole_System_Energy_Consumed        56
  dtype: int64),
 ('EOH0728',
  Timestamp                                  0
  Circulation_Pump_Energy_Consumed         215
  External_Air_Temperature                1095
  Heat_Pump_Energy_Output                  244
  Heat_Pump_Heating_Flow_Temperature      7340
  Heat_Pump_Return_Temperature            6051
  Hot_Water_Flow_Temperature            229407
  Immersion_Heater_Energy_Consumed         247
  Internal_Air_Temperature                 372
  Whole_System_Energy_Consumed             215
  dtype: int64),
 ('EOH1884',
  Timestamp                                 0
  Boiler_Energy_Output                   3038
  Circulation_Pump_Energy_Consumed       3037
  External_Air_Temperature               1512
  Heat_Pump_Energy_Output                3041
  Heat_Pump_Heating_Flow_Temperature     5813
  Heat_Pump_Retur

In [26]:
sorted_missing_list_items

[('EOH0590',
  Timestamp                            0
  Circulation_Pump_Energy_Consumed    56
  Heat_Pump_Energy_Output             55
  Whole_System_Energy_Consumed        56
  dtype: int64),
 ('EOH0728',
  Timestamp                                  0
  Circulation_Pump_Energy_Consumed         215
  External_Air_Temperature                1095
  Heat_Pump_Energy_Output                  244
  Heat_Pump_Heating_Flow_Temperature      7340
  Heat_Pump_Return_Temperature            6051
  Hot_Water_Flow_Temperature            229407
  Immersion_Heater_Energy_Consumed         247
  Internal_Air_Temperature                 372
  Whole_System_Energy_Consumed             215
  dtype: int64),
 ('EOH1884',
  Timestamp                                 0
  Boiler_Energy_Output                   3038
  Circulation_Pump_Energy_Consumed       3037
  External_Air_Temperature               1512
  Heat_Pump_Energy_Output                3041
  Heat_Pump_Heating_Flow_Temperature     5813
  Heat_Pump_Retur

In [None]:
plot_data_list = []


for property_id, df in df_homes.items(): # <<< Using df_homes directly here
    if df.empty:
        print(f"Skipping empty DataFrame for {property_id} during plot data preparation.")
        continue

    cols_to_check_in_df = [col for col in data_columns_to_check_for_plot if col in df.columns]

    if not cols_to_check_in_df:
        print(f"No relevant data columns to check for missingness in {property_id}. Skipping.")
        continue

    missing_count_per_timestamp = df[cols_to_check_in_df].isnull().sum(axis=1)
    total_possible_missing = len(cols_to_check_in_df)

    normalized_missing = missing_count_per_timestamp / total_possible_missing

    temp_df = pd.DataFrame({
        'Property_ID': property_id,
        'Timestamp': normalized_missing.index,
        'Normalized_Missing': normalized_missing.values
    })
    plot_data_list.append(temp_df)

if not plot_data_list:
    print("No valid data available to plot after processing for missingness. Exiting.")
    # Clean up dummy directories
    for path in base_paths.values():
        if os.path.exists(path):
            shutil.rmtree(path)
    exit()

combined_missing_data = pd.concat(plot_data_list, ignore_index=True)

pivot_table = combined_missing_data.pivot_table(
    index='Property_ID',
    columns='Timestamp',
    values='Normalized_Missing'
)

pivot_table_sorted = pivot_table.sort_index(key=lambda x: x.str.extract(r'EOH(\d+)').astype(int).iloc[:, 0])

plot_matrix = pivot_table_sorted.fillna(0).values # Fill NaN with 0 (white for no missing)

# --- 5. Plotting the Missing Data Heatmap ---
plt.figure(figsize=(20, max(8, len(pivot_table_sorted) * 0.4))) # Dynamic height, min 8 inches

plt.imshow(plot_matrix, cmap='gray_r', aspect='auto', origin='lower', interpolation='nearest')

# --- Set X-axis (Timestamps) ---
timestamps_in_plot = pivot_table_sorted.columns
num_timestamps = len(timestamps_in_plot)

if num_timestamps > 0:
    tick_density = 12
    tick_interval = max(1, num_timestamps // tick_density)
    
    x_tick_locations = np.arange(0, num_timestamps, tick_interval)
    if num_timestamps > 0 and (num_timestamps - 1) not in x_tick_locations:
        if num_timestamps - 1 - x_tick_locations[-1] > tick_interval / 2:
            x_tick_locations = np.append(x_tick_locations, num_timestamps - 1)

    x_tick_labels = [timestamps_in_plot[i].strftime('%Y-%m-%d %H:%M') for i in x_tick_locations]

    plt.xticks(x_tick_locations, x_tick_labels, rotation=45, ha='right', fontsize=9)
else:
    plt.xticks([])

plt.xlabel('Timestamp', fontsize=12)

# --- Set Y-axis (Home IDs) ---
plt.yticks(np.arange(len(pivot_table_sorted)), pivot_table_sorted.index, fontsize=10)
plt.ylabel('Home ID (Property_ID)', fontsize=12)

# --- Add Title and Layout Adjustments ---
plt.title('Missing Data per Home and Timestep (Black = All Missing, White = No Missing)', fontsize=14)

cbar = plt.colorbar(plt.cm.ScalarMappable(cmap='gray_r', norm=plt.Normalize(vmin=0, vmax=1)),
                    ax=plt.gca(), orientation='vertical', fraction=0.02, pad=0.02)
cbar.set_label('Proportion of Missing Columns (0=None, 1=All)', fontsize=10)
cbar.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1])

plt.tight_layout(rect=[0, 0, 0.95, 1])
plt.show()

In [2]:
# --- Set Y-axis (Home IDs) ---
plt.yticks(np.arange(len(pivot_table_sorted)), pivot_table_sorted.index, fontsize=10)
plt.ylabel('Home ID (Property_ID)', fontsize=12)

# --- Add Title and Layout Adjustments ---
plt.title('Missing Data per Home and Timestep (Black = All Missing, White = No Missing)', fontsize=14)

cbar = plt.colorbar(plt.cm.ScalarMappable(cmap='gray_r', norm=plt.Normalize(vmin=0, vmax=1)),
                    ax=plt.gca(), orientation='vertical', fraction=0.02, pad=0.02)
cbar.set_label('Proportion of Missing Columns (0=None, 1=All)', fontsize=10)
cbar.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1])

plt.tight_layout(rect=[0, 0, 0.95, 1])
plt.show()

NameError: name 'plt' is not defined