In [1]:
import pandas as pd
import sys
if '../' not in sys.path:
    # necessary to access the src folder without relative imports
    sys.path.append('../')
from src.data.prepare_data import *

# Load the CSV file
df = pd.read_csv("../data/sorted_data.csv")

df = extract_features(df).dropna()
print(df.head())

X = df.drop(columns=['window-event','window-open'])
y_window_open = df['window-open']
y_window_event = df['window-event']

   window-event  window-open  indoor-temp  outdoor-temp  sin_minutes_of_day  \
4         False         True        77.18         77.54           -0.986286   
5         False         True        77.18         77.54           -0.986996   
6         False         True        77.18         77.54           -0.987688   
7         False         True        77.18         77.54           -0.989016   
8         False         True        77.18         77.54           -0.990268   

   cos_minutes_of_day  sin_day_of_year  cos_day_of_year  temp_diff  \
4           -0.165048         0.982927        -0.183998      -0.36   
5           -0.160743         0.982927        -0.183998      -0.36   
6           -0.156434         0.982927        -0.183998      -0.36   
7           -0.147809         0.982927        -0.183998      -0.36   
8           -0.139173         0.982927        -0.183998      -0.36   

   rolling_mean_indoor_temp  rolling_std_indoor_temp  \
4                     77.18                     

# Train models
For now we will train against classifying window status and and window events to see which is more useful.

### Models:
- Random Forest
- Gradient Boosting
- Logistic Regression
- RNN
- LSTM
- TCN

We'll start by testing the first models. Depending on the results we will add more complex models.

### Evaluation:
Metrics to evaluate the models:
- Accuracy
- Precision
- Recall
- F1 Score

Training splits:
- k-fold cross-validation with n iterations
- leave one episode out with 50% overlap between episodes

In [3]:
# k-fold cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
random_forest_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}
logistic_regression_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}

for train_index, test_index in skf.split(X, y_window_open):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_window_open, y_test_window_open = y_window_open.iloc[train_index], y_window_open.iloc[test_index]
    y_train_window_event, y_test_window_event = y_window_event.iloc[train_index], y_window_event.iloc[test_index]

    # Train and evaluate the model for window-open
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_window_open)
    y_pred_window_open = model.predict(X_test)
    random_forest_results['window-open']['accuracy'].append(accuracy_score(y_test_window_open, y_pred_window_open))
    random_forest_results['window-open']['f1'].append(f1_score(y_test_window_open, y_pred_window_open))

    # Train and evaluate the model for window-event
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train_window_event)
    y_pred_window_event = model.predict(X_test)
    random_forest_results['window-event']['accuracy'].append(accuracy_score(y_test_window_event, y_pred_window_event))
    random_forest_results['window-event']['f1'].append(f1_score(y_test_window_event, y_pred_window_event))
# Calculate the average accuracy and F1 score for each model
# Random Forest
avg_accuracy_window_open = sum(random_forest_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(random_forest_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(random_forest_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(random_forest_results['window-event']['f1']) / n_folds
# Print the results
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

# Logistic Regression
avg_accuracy_window_open = sum(logistic_regression_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(logistic_regression_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(logistic_regression_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(logistic_regression_results['window-event']['f1']) / n_folds
# Print the results
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Window Open - Average Accuracy: 0.9991, Average F1 Score: 0.9991
Window Event - Average Accuracy: 0.9991, Average F1 Score: 0.0000
Window Open - Average Accuracy: 0.0000, Average F1 Score: 0.0000
Window Event - Average Accuracy: 0.0000, Average F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
