In [3]:
import pandas as pd
import sys
if '../' not in sys.path:
    # necessary to access the src folder without relative imports
    sys.path.append('../')
from src.data.prepare_data import *

# Load the CSV file
df = pd.read_csv("../data/expanded_data.csv")

In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut
 
# Split data

# Train and test data
X = df.drop(columns=['window-event','window-open', 'date_time'])
y_window_open = df['window-open']
y_window_event = df['window-event']

# N-fold cross-validation
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Leave one episode (day) out
logo = LeaveOneGroupOut()
groups = df['date_time'].apply(lambda x: pd.to_datetime(x).date())

# Train models
For now we will train against classifying window status and and window events to see which is more useful.

### Models:
- Random Forest
- Gradient Boosting
- Logistic Regression
- RNN
- LSTM
- TCN

We'll start by testing the first models. Depending on the results we will add more complex models.

### Evaluation:
Metrics to evaluate the models:
- Accuracy
- Precision
- Recall
- F1 Score

Training splits:
- k-fold cross-validation with n iterations
- leave one episode out with 50% overlap between episodes

In [5]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Get metrics for Random Forest and Logistic Regression using Stratified K-Folds

random_forest_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}
logistic_regression_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}

for train_index, test_index in skf.split(X, y_window_open):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_window_open, y_test_window_open = y_window_open.iloc[train_index], y_window_open.iloc[test_index]
    y_train_window_event, y_test_window_event = y_window_event.iloc[train_index], y_window_event.iloc[test_index]

    # Train and evaluate Random Forest model for window-open
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_window_open)
    y_pred_window_open = model.predict(X_test)
    random_forest_results['window-open']['accuracy'].append(accuracy_score(y_test_window_open, y_pred_window_open))
    random_forest_results['window-open']['f1'].append(f1_score(y_test_window_open, y_pred_window_open, zero_division=1))

    # Train and evaluate Random Forest model for window-event
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_window_event)
    y_pred_window_event = model.predict(X_test)
    random_forest_results['window-event']['accuracy'].append(accuracy_score(y_test_window_event, y_pred_window_event))
    random_forest_results['window-event']['f1'].append(f1_score(y_test_window_event, y_pred_window_event, zero_division=1))

    # Train and evaluate Logistic Regression model for window-open
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train_window_open)
    y_pred_window_open = model.predict(X_test)
    logistic_regression_results['window-open']['accuracy'].append(accuracy_score(y_test_window_open, y_pred_window_open))
    logistic_regression_results['window-open']['f1'].append(f1_score(y_test_window_open, y_pred_window_open, zero_division=1))
    
    # Train and evaluate Logistic Regression model for window-event
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train_window_event)
    y_pred_window_event = model.predict(X_test)
    logistic_regression_results['window-event']['accuracy'].append(accuracy_score(y_test_window_event, y_pred_window_event))
    logistic_regression_results['window-event']['f1'].append(f1_score(y_test_window_event, y_pred_window_event, zero_division=1))

# Calculate the average accuracy and F1 score for each model
# Random Forest
avg_accuracy_window_open = sum(random_forest_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(random_forest_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(random_forest_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(random_forest_results['window-event']['f1']) / n_folds
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

# Logistic Regression
avg_accuracy_window_open = sum(logistic_regression_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(logistic_regression_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(logistic_regression_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(logistic_regression_results['window-event']['f1']) / n_folds
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

Window Open - Average Accuracy: 0.9991, Average F1 Score: 0.9991
Window Event - Average Accuracy: 0.9990, Average F1 Score: 0.3000
Window Open - Average Accuracy: 0.7432, Average F1 Score: 0.7477
Window Event - Average Accuracy: 0.9991, Average F1 Score: 0.4000


In [6]:
# Get metrics for Random Forest and Logistic Regression using Leave One Episode Out by Day

random_forest_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}

logistic_regression_results = {
    'window-open': {
        'accuracy': [],
        'f1': []
    },
    'window-event': {
        'accuracy': [],
        'f1': []
    }
}

# For n days, use n-1 for training and 1 for testing
for i, (train_index, test_index) in enumerate(logo.split(X, y_window_open, groups)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_window_open, y_test_window_open = y_window_open.iloc[train_index], y_window_open.iloc[test_index]
    y_train_window_event, y_test_window_event = y_window_event.iloc[train_index], y_window_event.iloc[test_index]

    # Train and evaluate Random Forest model for window-open
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_window_open)
    y_pred_window_open = model.predict(X_test)
    random_forest_results['window-open']['accuracy'].append(accuracy_score(y_test_window_open, y_pred_window_open))
    random_forest_results['window-open']['f1'].append(f1_score(y_test_window_open, y_pred_window_open, zero_division=1))

    # Train and evaluate Random Forest model for window-event
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_window_event)
    y_pred_window_event = model.predict(X_test)
    random_forest_results['window-event']['accuracy'].append(accuracy_score(y_test_window_event, y_pred_window_event))
    random_forest_results['window-event']['f1'].append(f1_score(y_test_window_event, y_pred_window_event, zero_division=1))

    # Train and evaluate Logistic Regression model for window-open
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train_window_open)
    y_pred_window_open = model.predict(X_test)
    logistic_regression_results['window-open']['accuracy'].append(accuracy_score(y_test_window_open, y_pred_window_open))
    logistic_regression_results['window-open']['f1'].append(f1_score(y_test_window_open, y_pred_window_open, zero_division=1))
    
    # Train and evaluate Logistic Regression model for window-event
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train_window_event)
    y_pred_window_event = model.predict(X_test)
    logistic_regression_results['window-event']['accuracy'].append(accuracy_score(y_test_window_event, y_pred_window_event))
    logistic_regression_results['window-event']['f1'].append(f1_score(y_test_window_event, y_pred_window_event, zero_division=1))
    
# Calculate the average accuracy and F1 score for each model
# Random Forest
avg_accuracy_window_open = sum(random_forest_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(random_forest_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(random_forest_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(random_forest_results['window-event']['f1']) / n_folds
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

# Logistic Regression
avg_accuracy_window_open = sum(logistic_regression_results['window-open']['accuracy']) / n_folds
avg_f1_window_open = sum(logistic_regression_results['window-open']['f1']) / n_folds
avg_accuracy_window_event = sum(logistic_regression_results['window-event']['accuracy']) / n_folds
avg_f1_window_event = sum(logistic_regression_results['window-event']['f1']) / n_folds
print(f"Window Open - Average Accuracy: {avg_accuracy_window_open:.4f}, Average F1 Score: {avg_f1_window_open:.4f}")
print(f"Window Event - Average Accuracy: {avg_accuracy_window_event:.4f}, Average F1 Score: {avg_f1_window_event:.4f}")

Window Open - Average Accuracy: 0.6421, Average F1 Score: 0.3617
Window Event - Average Accuracy: 1.2979, Average F1 Score: 0.5000
Window Open - Average Accuracy: 0.7950, Average F1 Score: 0.4799
Window Event - Average Accuracy: 1.2975, Average F1 Score: 0.4000
