In [None]:
import pandas as pd
from IPython.display import display
import os
import random
from tqdm import tqdm
import numpy as np
import statsmodels.api as sm
import glob
import matplotlib.pyplot as plt
import re

## Data preprocessing

In [None]:
import pandas as pd

def process_dataframe(file_path):
    """
    Clean one device's raw log and derive the on/off and setpoint-change columns.

    The steps below access the columns 'rawtime', 'ID', 'Operational_status',
    'Setpoint', 'Tin' and 'Tout', so the workbook must provide them.

    Parameters:
    - file_path (str): Path to the Excel file holding the raw records.

    Returns:
    - pd.DataFrame: One row per minute between the first and last timestamp,
      with gaps filled, temperatures inside shutdown cycles replaced by
      'no_records', and the extra columns 'On/OFF' and 'Setpoint adjustment'.
    """
    # Step 1: Read the Excel file
    df_cleaned = pd.read_excel(file_path)

    # Step 3: Fill missing values based on time
    def fill_missing_values_based_on_time(dataframe):
        """Fill each missing cell from whichever adjacent row is closer in time."""
        dataframe['rawtime'] = pd.to_datetime(dataframe['rawtime'])
        # `index` is used positionally with iloc below, which relies on the
        # default 0..n-1 index that read_excel produces.
        for index, row in dataframe.iterrows():
            if row.isnull().any():
                prev_row = dataframe.iloc[index - 1] if index - 1 >= 0 else None
                next_row = dataframe.iloc[index + 1] if index + 1 < len(dataframe) else None
                for col in dataframe.columns:
                    if pd.isnull(row[col]):
                        if prev_row is not None and next_row is not None:
                            prev_diff = abs(row['rawtime'] - prev_row['rawtime'])
                            next_diff = abs(row['rawtime'] - next_row['rawtime'])
                            # On a tie the following row wins.
                            if prev_diff < next_diff:
                                dataframe.at[index, col] = prev_row[col]
                            else:
                                dataframe.at[index, col] = next_row[col]
                        elif prev_row is not None:
                            dataframe.at[index, col] = prev_row[col]
                        elif next_row is not None:
                            dataframe.at[index, col] = next_row[col]
        return dataframe

    df_filled = fill_missing_values_based_on_time(df_cleaned)

    # Step 4: Convert rawtime to "YYYY-MM-DD HH:MM" format
    # (this drops the seconds, so rows within the same minute become duplicates)
    df_filled['rawtime'] = df_filled['rawtime'].dt.strftime('%Y-%m-%d %H:%M')

    # Step 5: Check for duplicate timestamps
    def check_duplicate_timestamps(dataframe):
        """Report how many rows share a timestamp with another row."""
        duplicate_counts = dataframe['rawtime'].value_counts()
        duplicates = duplicate_counts[duplicate_counts > 1]
        if not duplicates.empty:
            print(f"There are {duplicates.sum()} duplicate timestamps. The first duplicate value will be removed.")
        return duplicates

    # Called for its printed report only.
    check_duplicate_timestamps(df_filled)

    # Step 6: De-duplicate timestamps
    def remove_first_duplicate_and_recheck(dataframe):
        """Drop repeated timestamps and report whether any duplicates remain."""
        # duplicated(keep='first') flags every repeat AFTER the first row of a
        # timestamp, so the earliest row of each timestamp is the one kept.
        repeated = dataframe['rawtime'].duplicated(keep='first')
        dataframe_cleaned = dataframe.drop(dataframe.index[repeated])
        duplicate_counts = dataframe_cleaned['rawtime'].value_counts()
        duplicates_after_removal = duplicate_counts[duplicate_counts > 1]
        if duplicates_after_removal.empty:
            print("Processing complete, no duplicate timestamps remain.")
        else:
            print(f"There are still {duplicates_after_removal.sum()} duplicate timestamps remaining.")
        return dataframe_cleaned

    df_cleaned_final = remove_first_duplicate_and_recheck(df_filled)

    # Step 7: Insert missing rows with ID as the first column
    def insert_missing_rows_with_id_first(dataframe):
        """Left-join the records onto a gap-free one-minute time grid."""
        dataframe['rawtime'] = pd.to_datetime(dataframe['rawtime'])
        # 'min' replaces the deprecated 'T' alias for a one-minute frequency.
        full_time_range = pd.date_range(start=dataframe['rawtime'].min(), end=dataframe['rawtime'].max(), freq='min')
        df_full = pd.DataFrame({'rawtime': full_time_range})
        df_merged = pd.merge(df_full, dataframe, on='rawtime', how='left')
        # Every row, including the inserted ones, carries the first record's ID.
        df_merged['ID'] = dataframe['ID'].iloc[0]
        df_merged['rawtime'] = df_merged['rawtime'].dt.strftime('%Y-%m-%d %H:%M')
        columns_order = ['ID', 'rawtime'] + [col for col in df_merged.columns if col not in ['ID', 'rawtime']]
        return df_merged[columns_order]

    df_with_inserted_rows = insert_missing_rows_with_id_first(df_cleaned_final)

    # Step 8: Fill the 'Operational_status' column with the previous value
    def fill_previous_value_for_column(dataframe, column_name):
        """Forward-fill one column in place and return the frame."""
        # Series.ffill() replaces the deprecated fillna(method='ffill').
        dataframe[column_name] = dataframe[column_name].ffill()
        return dataframe

    df_filled_with_previous = fill_previous_value_for_column(df_with_inserted_rows, 'Operational_status')

    # Step 9: Interpolate 'Tin' and 'Tout' and round to 1 decimal place
    def fill_with_interpolation_and_rounding(dataframe, columns, decimal_places=1):
        """Linearly interpolate the given columns (both directions) and round them."""
        dataframe[columns] = dataframe[columns].interpolate(method='linear', limit_direction='both').round(decimal_places)
        return dataframe

    df_filled_with_rounded_interpolation = fill_with_interpolation_and_rounding(df_filled_with_previous, ['Tin', 'Tout'], decimal_places=1)

    # Step 10: Fill 'Setpoint' column with the previous value
    df_filled_with_final_adjustments = fill_previous_value_for_column(df_filled_with_rounded_interpolation, 'Setpoint')

    # Step 11: Function to count the number of shutdown cycles
    def count_shutdown_cycles(dataframe, column_name):
        """Count runs of status 0 that are later ended by a status 1."""
        shutdown_cycles = 0
        in_shutdown = False
        # Read the column once instead of building a row Series per iteration.
        for current_status in dataframe[column_name].tolist():
            if current_status == 0 and not in_shutdown:
                in_shutdown = True  # Start of a shutdown cycle
            elif current_status == 1 and in_shutdown:
                shutdown_cycles += 1  # End of a shutdown cycle
                in_shutdown = False  # Reset for the next cycle
        return shutdown_cycles

    # Apply the function to count shutdown cycles in 'Operational_status'
    shutdown_cycles_count = count_shutdown_cycles(df_filled_with_final_adjustments, 'Operational_status')

    # Display the result
    print(f"There are {shutdown_cycles_count} shutdown cycles.")

    # Step 12: Function to replace temperature values with 'no_records' for each shutdown cycle
    def replace_temperatures_in_shutdown_cycles(dataframe, column_name, temperature_columns):
        """Blank the temperature columns inside every completed shutdown cycle.

        The first status-0 row of a cycle keeps its values; the second through
        the last status-0 rows are overwritten. A shutdown that is still open
        at the end of the data is left untouched.
        """
        in_shutdown = False
        shutdown_start = None
        spans = []  # inclusive (first_label, last_label) pairs to overwrite

        for i, current_status in enumerate(dataframe[column_name].tolist()):
            if current_status == 0 and not in_shutdown:
                # Mark the start of the shutdown cycle
                in_shutdown = True
                shutdown_start = i
            elif current_status == 1 and in_shutdown:
                # End of the shutdown cycle
                if shutdown_start is not None and (i - shutdown_start) > 1:
                    spans.append((shutdown_start + 1, i - 1))
                # Reset for the next cycle
                in_shutdown = False
                shutdown_start = None

        if spans:
            # Writing a string into a float column relies on an implicit upcast
            # that newer pandas deprecates, so make the columns object first.
            dataframe[temperature_columns] = dataframe[temperature_columns].astype(object)
            for first_label, last_label in spans:
                dataframe.loc[first_label:last_label, temperature_columns] = 'no_records'

        return dataframe

    temperature_columns_to_replace = ['Setpoint', 'Tin', 'Tout']
    df_replaced_temperatures = replace_temperatures_in_shutdown_cycles(df_filled_with_final_adjustments, 'Operational_status', temperature_columns_to_replace)

    # Step 13: Add the "On/Off action" column
    def add_switch_action_column(dataframe, status_column):
        """Mark the row where the status flips: 'ON' for 0->1, 'OFF' for 1->0."""
        status = dataframe[status_column]
        previous_status = status.shift(1)
        dataframe['On/OFF'] = ''
        dataframe.loc[(previous_status == 0) & (status == 1), 'On/OFF'] = 'ON'
        dataframe.loc[(previous_status == 1) & (status == 0), 'On/OFF'] = 'OFF'
        return dataframe

    df_with_switch_action = add_switch_action_column(df_replaced_temperatures, 'Operational_status')

    # Step 14: Add the "Setpoint adjustment" column
    def add_temperature_action_column_with_handling(dataframe, temp_column):
        """Mark rows whose setpoint is higher ('Upper') or lower ('Lower') than the previous row's."""
        values = dataframe[temp_column].tolist()
        adjustments = [''] * len(values)
        for position in range(1, len(values)):
            previous_temp, current_temp = values[position - 1], values[position]
            # The column may hold 'no_records' strings after Step 12; only
            # compare when both neighbours are numbers.
            if pd.api.types.is_number(previous_temp) and pd.api.types.is_number(current_temp):
                if current_temp > previous_temp:
                    adjustments[position] = 'Upper'
                elif current_temp < previous_temp:
                    adjustments[position] = 'Lower'
        dataframe['Setpoint adjustment'] = adjustments
        return dataframe

    df_final = add_temperature_action_column_with_handling(df_with_switch_action, 'Setpoint')

    # Step 15: Display the final dataframe
    display(df_final)

    return df_final


## Comfort indoor air temperature model

In [None]:
import pandas as pd
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Function to mark stable and sleep periods
def mark_stable_and_sleep_periods(df, state_col='Operational_status', temp_col='Setpoint', time_col='rawtime', 
                                  stable_col='Stable_period', sleep_col='Sleep_period'):
    """
    Add 0/1 flag columns for stable operation and for sleep hours.

    A row counts as "unchanged" when the unit is on (state == 1) and its
    setpoint equals the previous row's setpoint. A row is flagged stable once
    it is at least the 15th consecutive unchanged row. A row is flagged as
    sleep when its time of day is at or after 22:00 or before 08:00.

    Parameters:
    - df (pd.DataFrame): Input data; modified in place and also returned.
    - state_col (str): Column holding the on/off state.
    - temp_col (str): Column holding the setpoint.
    - time_col (str): Timestamp column; converted to datetime in place.
    - stable_col (str): Name of the stable-period flag column to write.
    - sleep_col (str): Name of the sleep-period flag column to write.

    Returns:
    - pd.DataFrame: The same frame with the two flag columns added.
    """
    # Work on local Series rather than a scratch column: the scratch column
    # used to be named 'Stable_period' and dropping it deleted the result
    # whenever stable_col kept its default name.
    unchanged = (df[state_col] == 1) & (df[temp_col] == df[temp_col].shift(1))
    running_total = unchanged.astype(int).cumsum()
    # Value of the running total at the most recent row that broke the run;
    # subtracting it yields the length of the current consecutive run.
    run_baseline = running_total.where(~unchanged).ffill().fillna(0)
    run_length = running_total - run_baseline
    df[stable_col] = (run_length >= 15).astype(int)

    df[time_col] = pd.to_datetime(df[time_col])
    hour_of_day = df[time_col].dt.hour
    # Time >= 22:00 or time < 08:00, expressed on the hour component.
    df[sleep_col] = ((hour_of_day >= 22) | (hour_of_day < 8)).astype(int)
    return df

# Function to extract hourly averages during non-sleep periods
def extract_and_average_stable_data(df, stable_col='Stable_period', sleep_col='Sleep_period', time_col='rawtime', 
                                    target_cols=None):
    """
    Average the target columns per clock hour over stable, non-sleep rows.

    Parameters:
    - df (pd.DataFrame): Data carrying the stable and sleep flag columns.
    - stable_col (str): Flag column; rows with value 1 are kept.
    - sleep_col (str): Flag column; rows with value 0 are kept.
    - time_col (str): Timestamp column (parsed with pd.to_datetime).
    - target_cols (list[str] | None): Columns to average. When None, every
      float64/int64 column of df is used.

    Returns:
    - pd.DataFrame: One row per hour with an 'hour' column followed by the
      mean of each target column. The input frame is not modified.
    """
    if target_cols is None:
        target_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    keep = (df[stable_col] == 1) & (df[sleep_col] == 0)
    stable_non_sleep_data = df[keep].copy()
    stable_non_sleep_data[time_col] = pd.to_datetime(stable_non_sleep_data[time_col])
    # 'h' replaces the deprecated 'H' alias for an hourly frequency.
    stable_non_sleep_data['hour'] = stable_non_sleep_data[time_col].dt.floor('h')
    return stable_non_sleep_data.groupby('hour')[target_cols].mean().reset_index()

# Function to extract hourly averages during sleep periods
def sleepextract_and_average_stable_data(df, state_col='Operational_status', sleep_col='Sleep_period', 
                                         time_col='rawtime', target_cols=None):
    """
    Average the target columns per clock hour over rows that are on and in the sleep period.

    Parameters:
    - df (pd.DataFrame): Data carrying the state and sleep flag columns.
    - state_col (str): On/off column; rows with value 1 are kept.
    - sleep_col (str): Flag column; rows with value 1 are kept.
    - time_col (str): Timestamp column (parsed with pd.to_datetime).
    - target_cols (list[str] | None): Columns to average. When None, every
      float64/int64 column of df is used.

    Returns:
    - pd.DataFrame: One row per hour with an 'hour' column followed by the
      mean of each target column. The input frame is not modified.
    """
    if target_cols is None:
        target_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    keep = (df[state_col] == 1) & (df[sleep_col] == 1)
    stable_sleep_data = df[keep].copy()
    stable_sleep_data[time_col] = pd.to_datetime(stable_sleep_data[time_col])
    # 'h' replaces the deprecated 'H' alias for an hourly frequency.
    stable_sleep_data['hour'] = stable_sleep_data[time_col].dt.floor('h')
    return stable_sleep_data.groupby('hour')[target_cols].mean().reset_index()

# Function to perform linear regression and plot results
def perform_linear_regression_and_plot(df, save_path=None, indoor_temp_col='室内温度', outdoor_temp_col='室外温度', device_id='设备ID'):
    """
    Fit indoor temperature against outdoor temperature with OLS and plot the fit.

    Parameters:
    - df (pd.DataFrame): Dataframe containing data for regression.
    - save_path (str | None): Path to save the regression plot as a PNG file.
      When empty or None the plot is shown but not saved.
    - indoor_temp_col (str): Column name for indoor air temperature (dependent variable).
    - outdoor_temp_col (str): Column name for outdoor air temperature (independent variable).
    - device_id (str): Identifier for the device (used in results output).

    Returns:
    - pd.DataFrame: A one-row dataframe with the device ID, R², the F-test
      p-value and the fitted equation formatted as "y = a + b*x".
    """
    # Extract independent and dependent variables
    X = df[outdoor_temp_col]
    y = df[indoor_temp_col]

    # Add a constant for the intercept
    X = sm.add_constant(X)

    # Perform linear regression
    model = sm.OLS(y, X).fit()
    r_squared = model.rsquared
    p_value = model.f_pvalue

    # Get regression coefficients
    intercept = model.params['const']
    slope = model.params[outdoor_temp_col]
    equation = f"y = {intercept:.4f} + {slope:.4f}*x"

    # Create a results DataFrame
    results_df = pd.DataFrame({
        '设备ID': [device_id],
        'R²': [r_squared],
        'p值': [p_value],
        '回归方程': [equation]
    })

    # Plotting; keep a handle on the figure so it can be released afterwards.
    fig = plt.figure(figsize=(5, 5))
    plt.scatter(df[outdoor_temp_col], df[indoor_temp_col], label='Data Points', color='#A4D9D7', alpha=0.5)
    plt.plot(df[outdoor_temp_col], intercept + slope * df[outdoor_temp_col], color='#025F76', label='Regression Line')

    # Customize plot with Arial font and larger labels
    plt.xlabel('Outdoor air temperature (°C)', fontsize=16, fontname='Arial')
    plt.ylabel('Indoor air temperature (°C)', fontsize=16, fontname='Arial')
    plt.xticks(fontsize=15, fontname='Arial')
    plt.yticks(fontsize=15, fontname='Arial')
    plt.grid(True)

    # Display regression equation, R², and p-value on the plot
    plt.text(
        0.05, 0.95,
        f"Equation: {equation}\nR² = {r_squared:.4f}\nP-value = {p_value:.4e}",
        transform=plt.gca().transAxes,
        fontsize=12,
        fontname='Arial',
        verticalalignment='top',
        bbox=dict(facecolor='white', edgecolor='gray', alpha=0.8)
    )

    # Adjust layout and save
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, format='png', dpi=400)
        print(f"Saved plot to {save_path}")
    plt.show()
    # Release the figure so repeated calls (one per device) do not accumulate
    # open figures in memory.
    plt.close(fig)
    return results_df



# Main processing function
def _export_sleep_hourly_average(df, device_id, output_folder, state_col, sleep_col, time_col, target_cols):
    """Write the sleep-period hourly averages to '<device_id>_sleep_hourly_average.xlsx'."""
    hourly_avg_sleep = sleepextract_and_average_stable_data(df, state_col=state_col, sleep_col=sleep_col,
                                                            time_col=time_col, target_cols=target_cols)
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, f"{device_id}_sleep_hourly_average.xlsx")
    hourly_avg_sleep.to_excel(output_file, index=False)


def _export_wake_hourly_average(df, device_id, output_folder, stable_col, sleep_col, time_col,
                                indoor_temp_col, outdoor_temp_col):
    """Write the wake-period hourly averages and run the indoor/outdoor regression on them."""
    hourly_avg = extract_and_average_stable_data(df, stable_col=stable_col, sleep_col=sleep_col,
                                                 time_col=time_col,
                                                 target_cols=[indoor_temp_col, outdoor_temp_col])
    os.makedirs(output_folder, exist_ok=True)
    hourly_avg_file = os.path.join(output_folder, f"{device_id}_wake_hourly_average.xlsx")
    hourly_avg.to_excel(hourly_avg_file, index=False)

    # A line cannot be fitted through fewer than two points.
    if len(hourly_avg) < 2:
        print(f"Not enough wake-period data for a regression for ID {device_id}. Skipping regression...")
        return

    # perform_linear_regression_and_plot takes the PNG path as `save_path`;
    # it has no `output_path` or `period` parameters.
    plot_file = os.path.join(output_folder, f"{device_id}_wake_regression.png")
    perform_linear_regression_and_plot(hourly_avg, save_path=plot_file,
                                       indoor_temp_col=indoor_temp_col,
                                       outdoor_temp_col=outdoor_temp_col, device_id=device_id)


# Main processing function
def process_device_data(device_id, room_type, input_file, output_folder, 
                        time_col='rawtime', state_col='Operational_status', 
                        temp_col='Setpoint', sleep_col='Sleep_period', stable_col='Stable_period',
                        indoor_temp_col='Tin', outdoor_temp_col='Tout'):
    """
    Process a single device's data based on its room type.

    'Bedroom' exports the sleep-period hourly averages. A room type containing
    'Livingroom' exports the wake-period hourly averages and fits the
    indoor/outdoor regression. A room type containing 'Mixedroom' does both.
    Any other room type is reported and skipped.

    Parameters:
    - device_id (str): The device ID; used in messages and output file names.
    - room_type (str): Room type (e.g., Bedroom, Livingroom+non-work, Mixedroom+work).
    - input_file (str): Path to the input Excel file for the device.
    - output_folder (str): Output folder path; created when missing.
    - time_col (str): Timestamp column name.
    - state_col (str): On/off state column name.
    - temp_col (str): Setpoint column name.
    - sleep_col (str): Name of the sleep flag column to create and use.
    - stable_col (str): Name of the stable flag column to create and use.
    - indoor_temp_col (str): Indoor air temperature column name.
    - outdoor_temp_col (str): Outdoor air temperature column name.

    Returns:
    - None
    """
    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Error reading file {input_file}: {e}")
        return

    # Step 1: Mark stable and sleep periods
    df = mark_stable_and_sleep_periods(df, state_col=state_col, temp_col=temp_col, time_col=time_col,
                                       stable_col=stable_col, sleep_col=sleep_col)

    target_cols = [indoor_temp_col, outdoor_temp_col]

    # Step 2: Process based on room type
    if room_type == 'Bedroom':
        print(f"Processing Bedroom data for ID {device_id}...")
        _export_sleep_hourly_average(df, device_id, output_folder, state_col, sleep_col, time_col, target_cols)

    elif 'Livingroom' in room_type:
        print(f"Processing Livingroom data for ID {device_id}...")
        _export_wake_hourly_average(df, device_id, output_folder, stable_col, sleep_col, time_col,
                                    indoor_temp_col, outdoor_temp_col)

    elif 'Mixedroom' in room_type:
        print(f"Processing Mixedroom data for ID {device_id}...")
        # Non-sleep data processing
        _export_wake_hourly_average(df, device_id, output_folder, stable_col, sleep_col, time_col,
                                    indoor_temp_col, outdoor_temp_col)
        # Sleep data processing
        _export_sleep_hourly_average(df, device_id, output_folder, state_col, sleep_col, time_col, target_cols)

    else:
        print(f"Unknown room type for ID {device_id}. Skipping...")

# Example invocation. The device ID and both paths are placeholders: replace
# them with a real device workbook and output folder before running this cell.
process_device_data(
    device_id='Device123',
    room_type='Livingroom+non-work',
    input_file='/path/to/input/Device123.xlsx',
    output_folder='/path/to/output'
)


## Comfort indoor air temperature setting schedule

In [1]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline

# Function to process Bedroom schedules
def process_sleep_schedules(
    id_cluster_file_path,
    raw_data_folder_path,
    output_folder_path
):
    """
    Process sleep schedules for each device based on ID and save the results.

    For every ID in the cluster file, '<ID>_labeled_hourly_average.xlsx' is
    read from the raw data folder, the rows whose 'Hour' is one of
    22, 23, 0, ..., 7 are kept, each hour label is advanced by one
    (wrapping 23 -> 0), and the result is written to
    '<ID>_sleep_schedule.xlsx'. Devices with a missing or unreadable file, or
    without sleep-hour rows, are reported and skipped.

    Parameters:
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - raw_data_folder_path (str): Path to the folder containing raw data files.
    - output_folder_path (str): Path to the folder where the results will be saved.

    Returns:
    - None
    """
    # Read the ID and cluster information
    id_cluster_df = pd.read_excel(id_cluster_file_path)

    # IDs read as floats render as e.g. '123.0'. Strip only a TRAILING '.0';
    # a plain substring replace would also corrupt IDs such as '10.05'.
    device_ids = id_cluster_df['ID'].astype(str).str.replace(r'\.0$', '', regex=True)

    # Sleep hours (22:00 - 08:00), compared against the string form of 'Hour'
    sleep_hours = ['22', '23', '0', '1', '2', '3', '4', '5', '6', '7']

    for device_id in device_ids:
        raw_data_file = f"{device_id}_labeled_hourly_average.xlsx"
        raw_data_path = os.path.join(raw_data_folder_path, raw_data_file)

        # Check if the raw data file exists
        if not os.path.exists(raw_data_path):
            print(f"Raw data file for ID {device_id} not found. Skipping...")
            continue

        # Load the raw data file
        try:
            df = pd.read_excel(raw_data_path)
        except Exception as e:
            print(f"Error reading file {raw_data_path}: {e}")
            continue

        # Copy the filtered rows so the 'Hour' rewrite below acts on an
        # independent frame rather than a slice of df.
        sleep_data = df[df['Hour'].astype(str).isin(sleep_hours)].copy()

        if sleep_data.empty:
            print(f"No sleep data available for ID {device_id}. Skipping...")
            continue

        # Adjust the hour column to increment labels by 1 (stored as strings)
        sleep_data['Hour'] = sleep_data['Hour'].astype(str).apply(
            lambda h: str((int(h) + 1) % 24) if h.isdigit() else h
        )

        # Save the sleep schedule to an Excel file
        os.makedirs(output_folder_path, exist_ok=True)
        output_file_path = os.path.join(output_folder_path, f"{device_id}_sleep_schedule.xlsx")
        try:
            sleep_data.to_excel(output_file_path, index=False)
            print(f"Saved sleep schedule for ID {device_id} to {output_file_path}")
        except Exception as e:
            print(f"Error saving sleep schedule for ID {device_id}: {e}")

def _normalize_device_ids(ids):
    """Render an ID column as stripped strings without a float-style trailing '.0'."""
    return ids.astype(str).str.strip().str.replace(r'\.0$', '', regex=True)


def _parse_linear_equation(equation):
    """Return (intercept, slope) parsed from text shaped like 'y = a + b*x', or None if it does not match."""
    match = re.search(r"y\s*=\s*([\d\.\-]+)\s*\+\s*([\d\.\-]+)\s*\*\s*x", str(equation))
    if not match:
        return None
    try:
        return float(match.group(1)), float(match.group(2))
    except ValueError:
        # The character classes also admit non-numbers such as '1.2.3' or '-'.
        return None


def _write_living_room_comfort(id_cluster_file_path, regression_file_path, weather_data_file_path,
                               output_folder_path, cluster_label, on_hours, room_type):
    """Build and save the hourly comfort-temperature table for every device in one cluster.

    Shared implementation of the work / non-work living-room functions, which
    differ only in the cluster label, the hours the unit is 'on', and the
    room-type label written to the output.
    """
    # Step 1: Read the ID and cluster information, filter for the requested cluster
    try:
        id_cluster_df = pd.read_excel(id_cluster_file_path)
        in_cluster = id_cluster_df[id_cluster_df['re_cluster'] == cluster_label]
        device_ids = _normalize_device_ids(in_cluster['ID'])
    except Exception as e:
        print(f"Error reading ID cluster file: {e}")
        return

    # Step 2: Read the regression equations
    try:
        regression_df = pd.read_excel(regression_file_path)
        # Normalised the same way as the cluster IDs so both sides compare equal.
        regression_ids = _normalize_device_ids(regression_df['ID'])
    except Exception as e:
        print(f"Error reading regression file: {e}")
        return

    # Step 3: Read the hourly outdoor temperature data
    try:
        weather_data_df = pd.read_excel(weather_data_file_path)
        hourly_temperatures = weather_data_df['Dry']  # Use 'Dry' as the outdoor temperature column
        schedule_base = weather_data_df[['Month', 'Day', 'Hour']]
        is_on = weather_data_df['Hour'].isin(on_hours)
    except Exception as e:
        print(f"Error reading weather data file: {e}")
        return

    operational_status = np.where(is_on, 'on', 'off')

    # Step 4: Process each device in the cluster
    for device_id in device_ids:
        # Find the regression equation for the current device ID
        regression_row = regression_df[regression_ids == device_id]
        if regression_row.empty:
            print(f"No regression equation found for ID {device_id}. Skipping...")
            continue

        regression_equation = regression_row['regression equation'].values[0]

        # Extract coefficients from the regression equation
        coefficients = _parse_linear_equation(regression_equation)
        if coefficients is None:
            print(f"Invalid regression equation format for ID {device_id}: {regression_equation}. Skipping...")
            continue
        intercept, slope = coefficients

        # Step 5: Calculate comfort temperatures
        comfort_temperatures = (intercept + slope * hourly_temperatures).round(1)

        # Steps 6-7: Assemble the output on a fresh frame so the shared weather
        # data is not modified between devices. Comfort_Temperature is NaN
        # wherever the operational status is 'off'.
        comfort_df = schedule_base.copy()
        comfort_df['Comfort_Temperature'] = comfort_temperatures.where(is_on)
        comfort_df['operational_status'] = operational_status
        comfort_df['room_type'] = room_type

        # Step 8: Save the resulting DataFrame to an Excel file
        os.makedirs(output_folder_path, exist_ok=True)
        output_file_path = os.path.join(output_folder_path, f"{device_id}_comfort_temperature.xlsx")
        try:
            comfort_df.to_excel(output_file_path, index=False)
            print(f"Saved comfort temperature data for ID {device_id} to {output_file_path}")
        except Exception as e:
            print(f"Error saving comfort temperature data for ID {device_id}: {e}")


def process_work_living_room_comfort(
    id_cluster_file_path,
    regression_file_path,
    weather_data_file_path,
    output_folder_path
):
    """
    Process living room comfort temperature calculations for working samples based on ID.

    Devices whose 're_cluster' is 'work' are processed. The unit is 'on' during
    hours 13, 14, 19, 20, 21 and 22; the comfort temperature for those hours is
    intercept + slope * outdoor temperature, rounded to one decimal.

    Parameters:
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - regression_file_path (str): Path to the regression results Excel file.
    - weather_data_file_path (str): Path to the Excel file containing hourly outdoor temperatures.
    - output_folder_path (str): Path to the folder where the results will be saved.

    Returns:
    - None
    """
    _write_living_room_comfort(
        id_cluster_file_path,
        regression_file_path,
        weather_data_file_path,
        output_folder_path,
        cluster_label='work',
        on_hours=[13, 14, 19, 20, 21, 22],
        room_type='Livingroom+work',
    )


def process_nonwork_living_room_comfort(
    id_cluster_file_path,
    regression_file_path,
    weather_data_file_path,
    output_folder_path
):
    """
    Process comfort temperature schedules for Livingroom+non-work based on IDs.

    Devices whose 're_cluster' is 'non-work' are processed. The unit is 'on'
    during hours 9 through 22; the comfort temperature for those hours is
    intercept + slope * outdoor temperature, rounded to one decimal.

    Parameters:
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - regression_file_path (str): Path to the regression results Excel file.
    - weather_data_file_path (str): Path to the Excel file containing hourly outdoor temperatures.
    - output_folder_path (str): Path to the folder where the results will be saved.

    Returns:
    - None
    """
    _write_living_room_comfort(
        id_cluster_file_path,
        regression_file_path,
        weather_data_file_path,
        output_folder_path,
        cluster_label='non-work',
        on_hours=range(9, 23),  # 09:00 to 22:00
        room_type='Livingroom+non-work',
    )

def process_mixed_non_work_room_comfort(
    id_cluster_file_path,
    regression_file_path,
    weather_data_file_path,
    raw_data_folder_path,
    output_folder_path
):
    """
    Process comfort temperature schedules for Mixedroom+non-work based on ID.

    Daytime hours (8 through 21) use intercept + slope * outdoor temperature.
    Night hours (22 through 7) use the per-hour 'average_temperature' returned
    by sleep_schedule_generate for the device, repeated for every (Month, Day)
    present in the weather data. Both parts are concatenated, sorted by
    Month/Day/Hour and written to '<ID>_comfort_temperature.xlsx'.

    Parameters:
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - regression_file_path (str): Path to the regression results Excel file.
    - weather_data_file_path (str): Path to the hourly outdoor temperatures file.
    - raw_data_folder_path (str): Path to the folder containing raw sleep data for night-time calculations.
    - output_folder_path (str): Path to the folder where the results will be saved.

    Returns:
    - None
    """
    def normalize_ids(ids):
        # Strip whitespace and a float-style trailing '.0' so IDs match the
        # '<ID>_labeled_hourly_average.xlsx' file names.
        return ids.astype(str).str.strip().str.replace(r'\.0$', '', regex=True)

    # Step 1: Read ID and cluster information
    try:
        id_cluster_df = pd.read_excel(id_cluster_file_path)
        nonwork_samples = normalize_ids(id_cluster_df[id_cluster_df['re_cluster'] == 'non-work']['ID'])
    except Exception as e:
        print(f"Error reading ID cluster file: {e}")
        return

    # Step 2: Read regression equations
    try:
        regression_df = pd.read_excel(regression_file_path)
        regression_df['ID'] = normalize_ids(regression_df['ID'])
    except Exception as e:
        print(f"Error reading regression file: {e}")
        return

    # Step 3: Read hourly outdoor temperature data
    try:
        weather_data_df = pd.read_excel(weather_data_file_path)
        hourly_temperatures = weather_data_df['Dry']
    except Exception as e:
        print(f"Error reading weather data file: {e}")
        return

    daytime_hours = range(8, 22)  # Daytime hours: 08:00 to 21:00
    night_hours = ['22', '23', '0', '1', '2', '3', '4', '5', '6', '7']

    # Step 4: Process each device in the non-work cluster
    for device_id in nonwork_samples:
        # Find regression equation for the current device ID
        regression_row = regression_df[regression_df['ID'] == device_id]
        if regression_row.empty:
            print(f"No regression equation found for ID {device_id}. Skipping...")
            continue

        regression_equation = regression_row['regression equation'].values[0]

        # Extract coefficients from the regression equation
        match = re.search(r"y\s*=\s*([\d\.\-]+)\s*\+\s*([\d\.\-]+)\s*\*\s*x", str(regression_equation))
        if not match:
            print(f"Invalid regression equation format for ID {device_id}: {regression_equation}. Skipping...")
            continue

        intercept = float(match.group(1))
        slope = float(match.group(2))

        # Step 5: Calculate daytime comfort temperatures
        comfort_temperatures_day = intercept + slope * hourly_temperatures
        comfort_df_day = weather_data_df[weather_data_df['Hour'].isin(daytime_hours)].copy()
        # Index-aligned assignment: only the daytime rows receive a value.
        comfort_df_day['Comfort_Temperature'] = comfort_temperatures_day.round(1)

        # Step 6: Process nighttime comfort temperatures
        raw_data_file = f"{device_id}_labeled_hourly_average.xlsx"
        raw_data_path = os.path.join(raw_data_folder_path, raw_data_file)

        if not os.path.exists(raw_data_path):
            print(f"Raw data file for ID {device_id} not found. Skipping nighttime calculations...")
            continue

        try:
            night_data_df = pd.read_excel(raw_data_path)
            comfort_df_night = sleep_schedule_generate(night_data_df, weather_data_df)
        except Exception as e:
            print(f"Error processing night data for ID {device_id}: {e}")
            continue

        # Look up each night hour's temperature once per device instead of
        # re-filtering the frame for every (day, hour) pair.
        night_temperature_by_hour = {}
        missing_hours = []
        for hour_str in night_hours:
            matches = comfort_df_night.loc[
                comfort_df_night['hour'] == hour_str, 'average_temperature'
            ].values
            if len(matches) == 0:
                missing_hours.append(hour_str)
            else:
                night_temperature_by_hour[hour_str] = round(matches[0], 1)

        if missing_hours:
            # An absent hour previously raised an uncaught IndexError and
            # aborted the whole run; skip just this device instead.
            print(f"Night schedule for ID {device_id} has no data for hour(s) {missing_hours}. Skipping...")
            continue

        # Step 7: Expand nighttime data across all days in the weather period
        night_rows = []
        for month in weather_data_df['Month'].unique():
            days_in_month = weather_data_df[weather_data_df['Month'] == month]['Day'].unique()
            for day in days_in_month:
                for hour_str in night_hours:
                    night_rows.append({
                        'Month': month,
                        'Day': day,
                        'Hour': int(hour_str),
                        'Comfort_Temperature': night_temperature_by_hour[hour_str]
                    })

        comfort_df_night_expanded = pd.DataFrame(night_rows)

        # Step 8: Combine daytime and expanded nighttime data
        comfort_df = pd.concat([comfort_df_day, comfort_df_night_expanded], ignore_index=True)
        comfort_df = comfort_df.sort_values(by=['Month', 'Day', 'Hour']).reset_index(drop=True)

        # Step 9: Save the final comfort temperature data
        os.makedirs(output_folder_path, exist_ok=True)
        output_file_path = os.path.join(output_folder_path, f"{device_id}_comfort_temperature.xlsx")
        try:
            comfort_df.to_excel(output_file_path, index=False)
            print(f"Saved combined comfort temperature data for ID {device_id} to {output_file_path}")
        except Exception as e:
            print(f"Error saving comfort temperature data for ID {device_id}: {e}")
def process_mixed_work_room_comfort(
    id_cluster_file_path,
    regression_file_path,
    weather_data_file_path,
    raw_data_folder_path,
    output_folder_path
):
    """
    Process comfort temperature schedules for Mixedroom+work based on ID.

    For every ID whose 're_cluster' value is 'work', the routine:

    1. looks up the device's linear regression equation and applies it to the
       outdoor dry-bulb temperature ('Dry') for the occupied daytime hours
       (12, 13, 18, 19, 20, 21), marking them 'on';
    2. marks the remaining hours between 8 and 21 as 'off' with no setpoint;
    3. derives the night-time setpoints from the device's raw sleep data via
       ``sleep_schedule_generate`` and spreads them over every Month/Day
       present in the weather file;
    4. writes the combined schedule to
       ``<output_folder_path>/<ID>_comfort_temperature.xlsx``.

    Devices with a missing or unparsable equation, a missing raw-data file, or
    a failing night-time computation are reported on stdout and skipped.

    Parameters:
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - regression_file_path (str): Path to the regression results Excel file.
    - weather_data_file_path (str): Path to the Excel file containing hourly outdoor temperatures.
    - raw_data_folder_path (str): Path to the folder containing raw sleep data for night-time calculations.
    - output_folder_path (str): Path to the folder where the results will be saved.

    Returns:
    - None
    """
    # Step 1: Read ID and cluster information
    try:
        id_cluster_df = pd.read_excel(id_cluster_file_path)
        work_samples = id_cluster_df[id_cluster_df['re_cluster'] == 'work']['ID'].astype(str).str.strip()
    except Exception as e:
        print(f"Error reading ID cluster file: {e}")
        return

    # Step 2: Read regression equations
    try:
        regression_df = pd.read_excel(regression_file_path)
        regression_df['ID'] = regression_df['ID'].astype(str).str.strip()
    except Exception as e:
        print(f"Error reading regression file: {e}")
        return

    # Step 3: Read hourly outdoor temperature data
    try:
        weather_data_df = pd.read_excel(weather_data_file_path)
        hourly_temperatures = weather_data_df['Dry']
    except Exception as e:
        print(f"Error reading weather data file: {e}")
        return

    daytime_hours = [12, 13, 18, 19, 20, 21]  # Selected daytime hours
    other_daytime_hours = [h for h in range(8, 22) if h not in daytime_hours]
    columns_to_keep = ['Month', 'Day', 'Hour', 'Comfort_Temperature', 'operational_status', 'room_type']

    # Step 4: Process each device in the work cluster
    for device_id in work_samples:
        # Find regression equation for the current device ID
        regression_row = regression_df[regression_df['ID'] == device_id]
        if regression_row.empty:
            print(f"No regression equation found for ID {device_id}. Skipping...")
            continue

        regression_equation = regression_row['regression equation'].values[0]

        # Extract coefficients; a blank/NaN cell or malformed text skips the
        # device instead of aborting the whole run.
        coefficients = _parse_mixed_work_regression(regression_equation)
        if coefficients is None:
            print(f"Invalid regression equation format for ID {device_id}: {regression_equation}. Skipping...")
            continue
        intercept, slope = coefficients

        # Step 5: Calculate daytime comfort temperatures
        comfort_temperatures_day = intercept + slope * hourly_temperatures
        comfort_df_day = weather_data_df[weather_data_df['Hour'].isin(daytime_hours)].copy()
        # Assignment aligns on the index, so only the selected hours are filled.
        comfort_df_day['Comfort_Temperature'] = comfort_temperatures_day.round(1)
        comfort_df_day['operational_status'] = 'on'

        # Set operational status to 'off' and temperature to NaN for other hours
        other_df_day = weather_data_df[weather_data_df['Hour'].isin(other_daytime_hours)].copy()
        other_df_day['Comfort_Temperature'] = np.nan
        other_df_day['operational_status'] = 'off'

        # Combine both on and off hours
        comfort_df_day = pd.concat([comfort_df_day, other_df_day], ignore_index=True)

        # Step 6: Process nighttime comfort temperatures
        raw_data_file = f"{device_id}_labeled_hourly_average.xlsx"
        raw_data_path = os.path.join(raw_data_folder_path, raw_data_file)

        if not os.path.exists(raw_data_path):
            print(f"Raw data file for ID {device_id} not found. Skipping nighttime calculations...")
            continue

        try:
            night_data_df = pd.read_excel(raw_data_path)
            comfort_df_night = sleep_schedule_generate(night_data_df, weather_data_df)
            # Turn the hour-of-day profile into one row per Month/Day/Hour so it
            # can be merged with the daytime rows.
            comfort_df_night = _mixed_work_night_rows(comfort_df_night, weather_data_df)
        except Exception as e:
            print(f"Error processing night data for ID {device_id}: {e}")
            continue

        # Step 7: Combine day and night data
        comfort_df = pd.concat([comfort_df_day, comfort_df_night], ignore_index=True)
        comfort_df['room_type'] = 'Mixedroom+work'
        # Daytime rows come first, so they win if an hour appears in both sets.
        comfort_df = (
            comfort_df
            .drop_duplicates(subset=['Month', 'Day', 'Hour'])
            .sort_values(by=['Month', 'Day', 'Hour'])
            .reset_index(drop=True)
        )

        # Keep only necessary columns
        comfort_df = comfort_df[columns_to_keep]

        # Step 8: Save the final comfort temperature data
        os.makedirs(output_folder_path, exist_ok=True)
        output_file_path = os.path.join(output_folder_path, f"{device_id}_comfort_temperature.xlsx")
        try:
            comfort_df.to_excel(output_file_path, index=False)
            print(f"Saved combined comfort temperature data for ID {device_id} to {output_file_path}")
        except Exception as e:
            print(f"Error saving comfort temperature data for ID {device_id}: {e}")


# Signed decimal with optional exponent, e.g. "20.5", "-0.3", ".5", "1e-3".
_MIXED_WORK_NUMBER = r"[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?"
# "y = <intercept> (+|-) <slope> * x"; the joining sign is captured so that
# "y = 20.5 - 0.3 * x" is understood as a slope of -0.3.
_MIXED_WORK_EQUATION_RE = re.compile(
    rf"y\s*=\s*({_MIXED_WORK_NUMBER})\s*([+-])\s*({_MIXED_WORK_NUMBER})\s*\*\s*x"
)


def _parse_mixed_work_regression(equation):
    """Return (intercept, slope) parsed from 'y = a + b * x', or None if unparsable."""
    if not isinstance(equation, str):
        # Empty Excel cells arrive as NaN (a float), which re.search rejects.
        return None
    match = _MIXED_WORK_EQUATION_RE.search(equation)
    if match is None:
        return None
    intercept = float(match.group(1))
    slope = float(match.group(3))
    if match.group(2) == '-':
        slope = -slope
    return intercept, slope


def _mixed_work_night_rows(comfort_df_night, weather_data_df):
    """Expand an hour-of-day night profile to one row per Month/Day/Hour.

    If the frame already carries 'Month', 'Day' and 'Hour', or does not have
    the 'hour' / 'average_temperature' columns, it is returned untouched.
    Otherwise each profile hour is repeated for every distinct (Month, Day) in
    the weather data, with the temperature rounded to one decimal.
    """
    if not isinstance(comfort_df_night, pd.DataFrame):
        return comfort_df_night

    available = set(comfort_df_night.columns)
    already_calendar = {'Month', 'Day', 'Hour'} <= available
    has_profile = {'hour', 'average_temperature'} <= available
    if already_calendar or not has_profile:
        return comfort_df_night

    profile = pd.DataFrame({
        # The preceding schedule routine in this file matches 'hour' against
        # strings such as '22', so convert to integers for the 'Hour' column.
        'Hour': pd.to_numeric(comfort_df_night['hour'], errors='coerce'),
        'Comfort_Temperature': pd.to_numeric(
            comfort_df_night['average_temperature'], errors='coerce'
        ).round(1),
    }).dropna(subset=['Hour'])
    profile['Hour'] = profile['Hour'].astype(int)
    profile = profile.drop_duplicates(subset='Hour')
    # NOTE(review): night hours with a setpoint are marked 'on', mirroring the
    # daytime convention above — confirm this is the intended status.
    profile['operational_status'] = np.where(
        profile['Comfort_Temperature'].notna(), 'on', 'off'
    )

    calendar_days = weather_data_df[['Month', 'Day']].drop_duplicates()
    # Constant-key merge = cross join of every day with every profile hour.
    return (
        calendar_days.assign(_join_key=1)
        .merge(profile.assign(_join_key=1), on='_join_key')
        .drop(columns='_join_key')
    )


In [None]:
def generate_room_schedules(
    room_type,
    id_cluster_file_path,
    regression_file_path,
    weather_data_file_path,
    raw_data_folder_path,
    output_folder_path
):
    """
    Dispatch schedule generation to the routine that handles ``room_type``.

    Parameters:
    - room_type (str): One of 'Bedroom', 'Livingroom+work', 'Livingroom+non-work',
      'Mixedroom+work', 'Mixedroom+non-work'. Any other value prints a message
      and nothing is generated.
    - id_cluster_file_path (str): Path to the Excel file containing IDs and clusters.
    - regression_file_path (str): Path to the regression results Excel file
      (not used for 'Bedroom').
    - weather_data_file_path (str): Path to the Excel file containing hourly
      outdoor temperatures (not used for 'Bedroom').
    - raw_data_folder_path (str): Path to the folder containing raw sleep data
      (used for 'Bedroom' and both 'Mixedroom' types).
    - output_folder_path (str): Path to the folder where the results will be saved.
    """
    # Keyword bundles shared by several routines.
    base_kwargs = {
        'id_cluster_file_path': id_cluster_file_path,
        'output_folder_path': output_folder_path,
    }
    regression_kwargs = dict(
        base_kwargs,
        regression_file_path=regression_file_path,
        weather_data_file_path=weather_data_file_path,
    )
    mixed_kwargs = dict(regression_kwargs, raw_data_folder_path=raw_data_folder_path)

    # Ordered (label, thunk) pairs; the thunks defer name lookup until the
    # matching routine is actually needed.
    routes = (
        ('Bedroom', lambda: process_sleep_schedules(
            raw_data_folder_path=raw_data_folder_path, **base_kwargs)),
        ('Livingroom+work', lambda: process_work_living_room_comfort(**regression_kwargs)),
        ('Livingroom+non-work', lambda: process_nonwork_living_room_comfort(**regression_kwargs)),
        ('Mixedroom+non-work', lambda: process_mixed_non_work_room_comfort(**mixed_kwargs)),
        ('Mixedroom+work', lambda: process_mixed_work_room_comfort(**mixed_kwargs)),
    )

    for label, run in routes:
        if room_type == label:
            run()
            return

    print(f"Invalid room type: {room_type}. Please choose from 'Bedroom', 'Livingroom+work', 'Livingroom+non-work', 'Mixedroom+work', or 'Mixedroom+non-work'.")
