In [1]:
import pandas as pd
import numpy as np

# Read the sorted sensor log from disk into the raw frame
original_df = pd.read_csv("../data/sorted_data.csv")

# Preview the top rows of the freshly loaded data (header line, then the table)
print("Original DataFrame:", original_df.head(), sep="\n")

# Independent working copy; this is the frame that gets extended with features
full_df = original_df.copy(deep=True)

Original DataFrame:
   window-event  window-open  indoor-temp    time  outdoor-temp
0          True         True        77.18  147918         77.54
1         False         True        77.18  147919         77.54
2         False         True        77.18  147920         77.54
3         False         True        77.18  147921         77.54
4         False         True        77.18  147922         77.54


Add and extract features

In [2]:
import sys

# Make the parent directory importable so the `src` package resolves without
# relative imports; the guard avoids appending a duplicate entry on re-runs.
if '../' not in sys.path:
    sys.path.append('../')

# NOTE(review): the wildcard import is kept because cells outside this view may
# rely on other names it provides; prefer explicit imports once usage is known.
from src.data.prepare_data import *

# Run the feature pipeline on a copy so the loaded frame stays untouched even
# if a helper modifies its argument in place (helper internals are not visible
# here). This also keeps the cell safe to re-run against the same raw data.
# Each step receives the frame returned by the previous one.
full_df = (
    original_df.copy()
    .pipe(add_time_features)
    .pipe(add_temp_diff)
    .pipe(add_rolling_features)
    .pipe(add_lagged_features)
)

# Display the first few rows of the DataFrame after feature extraction
print("DataFrame after feature extraction:")
print(full_df.head())

DataFrame after feature extraction:
   window-event  window-open  indoor-temp  outdoor-temp  sin_minutes_of_day  \
0          True         True        77.18         77.54           -0.983255   
1         False         True        77.18         77.54           -0.984041   
2         False         True        77.18         77.54           -0.984808   
3         False         True        77.18         77.54           -0.985556   
4         False         True        77.18         77.54           -0.986286   

   cos_minutes_of_day  sin_day_of_year  cos_day_of_year           date_time  \
0           -0.182236         0.982927        -0.183998 2025-04-13 17:18:00   
1           -0.177944         0.982927        -0.183998 2025-04-13 17:19:00   
2           -0.173648         0.982927        -0.183998 2025-04-13 17:20:00   
3           -0.169350         0.982927        -0.183998 2025-04-13 17:21:00   
4           -0.165048         0.982927        -0.183998 2025-04-13 17:22:00   

   temp_diff  

In [3]:
# Standardize the features
# TODO: not implemented yet; the draft below is kept commented out.
# NOTE(review): the names in `features_to_standardize` look like placeholders --
# the feature-importance output in this notebook lists columns such as
# 'rolling_mean_indoor_temp' and 'lagged_indoor_temp', and shows no humidity
# column. Update the list to real column names before enabling this cell.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# # Select the features to standardize
# features_to_standardize = [
#     'temp_diff', 'rolling_mean', 'rolling_std', 'lagged_temp', 'lagged_humidity'
# ]
# # Standardize the selected features
# full_df[features_to_standardize] = scaler.fit_transform(full_df[features_to_standardize])

In [4]:
# Score every feature against both targets with a univariate F-test
from sklearn.feature_selection import SelectKBest, f_classif

# Discard rows that contain NaN values before scoring
full_df = full_df.dropna()

# Targets first, then the feature matrix with both targets and the timestamp removed
y_window_open = full_df['window-open']
y_window_event = full_df['window-event']
X = full_df.drop(columns=['window-event', 'window-open', 'date_time'])
feature_names = X.columns

# k='all' keeps every column: the selectors are used only to obtain scores
window_open_selector = SelectKBest(score_func=f_classif, k='all').fit(X, y_window_open)
window_event_selector = SelectKBest(score_func=f_classif, k='all').fit(X, y_window_event)

# Raw per-feature scores, in the same order as `feature_names`
window_open_feature_scores = window_open_selector.scores_
window_event_feature_scores = window_event_selector.scores_

# One table per target, highest score first
window_open_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Score': window_open_feature_scores}
).sort_values(by='Score', ascending=False)
window_event_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Score': window_event_feature_scores}
).sort_values(by='Score', ascending=False)

# Print each ranking under its heading
print("Window Open Feature Importance:", window_open_importance_df, sep="\n")
print("Window Event Feature Importance:", window_event_importance_df, sep="\n")

Window Open Feature Importance:
                      Feature        Score
5             cos_day_of_year  4379.985616
4             sin_day_of_year  4019.287856
3          cos_minutes_of_day   149.267617
2          sin_minutes_of_day    66.831764
7    rolling_mean_indoor_temp    21.468065
1                outdoor-temp    20.925093
13         lagged_indoor_temp    20.921567
14        lagged_outdoor_temp    20.849978
9   rolling_mean_outdoor_temp    20.809087
0                 indoor-temp    20.553227
8     rolling_std_indoor_temp    12.294847
12      rolling_std_temp_diff    11.734846
6                   temp_diff     9.276251
11     rolling_mean_temp_diff     9.029224
10   rolling_std_outdoor_temp     1.857394
Window Event Feature Importance:
                      Feature      Score
14        lagged_outdoor_temp  16.830665
9   rolling_mean_outdoor_temp  16.383167
1                outdoor-temp  15.394137
5             cos_day_of_year  13.083169
11     rolling_mean_temp_diff  12.975989
6

In [5]:
# Drop least important features
# NOTE(review): `feature_importance_df` is not defined in the preceding cells; the score
# tables built there are `window_open_importance_df` and `window_event_importance_df`.
# Choose one (or combine them) before enabling the line below.
# full_df = full_df.drop(columns=feature_importance_df[feature_importance_df['Score'] < 0.1]['Feature'])

Visualize feature correlations and perform dimensionality reduction