In [None]:
import warnings
# Suppress every warning whose message starts with "X has feature names".
# NOTE(review): presumably scikit-learn's notice raised when a model fitted
# without feature names is given a DataFrame at predict time — confirm.
warnings.filterwarnings("ignore", message="X has feature names")

In [None]:
import pandas as pd

# First batch of corrected CSV exports (one frame per file)
df_temp_1, df_temp_2, df_temp_3 = map(
    pd.read_csv,
    (
        "df_temp_1_corrected.csv",
        "df_temp_2_corrected.csv",
        "df_temp_3_corrected.csv",
    ),
)

# Second batch of corrected CSV exports; the "_2" suffix marks the batch
df_temp_1_2, df_temp_2_2, df_temp_3_2 = map(
    pd.read_csv,
    (
        "df_2temp_1_corrected.csv",
        "df_2temp_2_corrected.csv",
        "df_2temp_3_corrected.csv",
    ),
)

In [None]:
# Remove every column whose name starts with 'dT' from all six frames
# (both batches); the remaining columns keep their original order.
(
    df_temp_1, df_temp_2, df_temp_3,
    df_temp_1_2, df_temp_2_2, df_temp_3_2,
) = (
    frame.loc[:, ~frame.columns.str.startswith('dT')]
    for frame in (
        df_temp_1, df_temp_2, df_temp_3,
        df_temp_1_2, df_temp_2_2, df_temp_3_2,
    )
)

In [None]:
# Sanity check: column labels of the three first-batch frames.
df_temp_1.columns, df_temp_2.columns, df_temp_3.columns

In [None]:
# Sanity check: column labels of all six frames (first batch, then second batch).
df_temp_1.columns, df_temp_2.columns, df_temp_3.columns, df_temp_1_2.columns, df_temp_2_2.columns, df_temp_3_2.columns

In [None]:
# Sanity check: (rows, columns) of all six frames (first batch, then second batch).
df_temp_1.shape, df_temp_2.shape, df_temp_3.shape, df_temp_1_2.shape, df_temp_2_2.shape, df_temp_3_2.shape

In [None]:
import pandas as pd

# Expects the first-batch frames (df_temp_1, df_temp_2, df_temp_3) and the
# second-batch frames (df_temp_1_2, df_temp_2_2, df_temp_3_2) to be loaded.
# Each first-batch frame is extended with the rows of its second-batch
# counterpart that are newer than anything it already contains.

# 1) Latest "Time" value currently present in each first-batch frame
max_time_1, max_time_2, max_time_3 = (
    frame["Time"].max() for frame in (df_temp_1, df_temp_2, df_temp_3)
)

# 2) Second-batch rows strictly later than that cut-off, copied so they are
#    independent of the frames they were selected from
new_rows_1 = df_temp_1_2.loc[df_temp_1_2["Time"] > max_time_1].copy()
new_rows_2 = df_temp_2_2.loc[df_temp_2_2["Time"] > max_time_2].copy()
new_rows_3 = df_temp_3_2.loc[df_temp_3_2["Time"] > max_time_3].copy()

# 3) Append the new rows; ignore_index=True rebuilds a contiguous 0..n-1 index
df_temp_1 = pd.concat([df_temp_1, new_rows_1], ignore_index=True)
df_temp_2 = pd.concat([df_temp_2, new_rows_2], ignore_index=True)
df_temp_3 = pd.concat([df_temp_3, new_rows_3], ignore_index=True)

# Result: df_temp_1, df_temp_2, df_temp_3 hold all first-batch rows followed by
# only the “new” (strictly later) rows of the second-batch DataFrames.

In [None]:
# Helper shared by all frames: epoch column rename + derived timestamp column
def rename_and_add_datetime(df):
    """Return a copy of ``df`` with "Time" renamed to "Epochs" and a "Datetime" column added.

    "Datetime" is built from "Epochs" with ``pd.to_datetime(..., unit="s")``,
    i.e. the values are interpreted as seconds. The input frame is not modified.
    """
    return df.rename(columns={"Time": "Epochs"}).assign(
        Datetime=lambda frame: pd.to_datetime(frame["Epochs"], unit="s")
    )


# Run the helper over the three combined frames
df_temp_1, df_temp_2, df_temp_3 = (
    rename_and_add_datetime(frame)
    for frame in (df_temp_1, df_temp_2, df_temp_3)
)

In [None]:
def resample_hourly(df):
    """Average ``df`` into 1-hour bins keyed on its "Datetime" column.

    The frame is indexed by "Datetime", resampled with rule "1h" using the
    mean, and "Datetime" is restored as an ordinary column in the result.
    """
    hourly_means = df.set_index("Datetime").resample("1h").mean()
    return hourly_means.reset_index()


# Hourly-average each of the three frames
df_temp_1, df_temp_2, df_temp_3 = map(
    resample_hourly, (df_temp_1, df_temp_2, df_temp_3)
)

In [None]:
# Per frame: discard the (averaged) "Epochs" column and give "Voltage" a
# frame-specific name (Volt_H1/H2/H3) so the columns stay distinct once the
# frames are merged.
df_temp_1 = df_temp_1.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H1"})
df_temp_2 = df_temp_2.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H2"})
df_temp_3 = df_temp_3.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H3"})

In [None]:
# Sanity check: column labels of all three frames after the drop/rename step.
df_temp_1.columns, df_temp_2.columns, df_temp_3.columns

In [None]:
# Outer-join the three hourly frames on their shared "Datetime" column, then
# order the result chronologically and rebuild a 0..n-1 index.
df_merged = (
    df_temp_1
    .merge(df_temp_2, on="Datetime", how="outer")
    .merge(df_temp_3, on="Datetime", how="outer")
    .sort_values("Datetime")
    .reset_index(drop=True)
)

In [None]:
# Sanity check: column labels of the merged frame.
df_merged.columns

In [None]:
# Keep only rows with no missing value in any column, then rebuild a 0..n-1 index.
df_merged_clean = df_merged.dropna().reset_index(drop=True)

In [None]:
# Report how many rows the dropna() step removed.
print(f"Original rows: {len(df_merged)}")
print(f"Cleaned rows:  {len(df_merged_clean)}")

In [None]:
# Add the hour-of-day (0-23) taken from "Datetime" as a new "Hour" column.
# Note: this mutates df_merged_clean in place.
df_merged_clean["Hour"] = df_merged_clean["Datetime"].dt.hour

In [None]:
# Sanity check: column labels of the cleaned frame, including the new "Hour".
df_merged_clean.columns

### Correlation matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the frame to the temperature channels (column names starting with "T_")
temp_columns = list(
    filter(lambda name: name.startswith("T_"), df_merged_clean.columns)
)
df_t_only = df_merged_clean[temp_columns]

# Pairwise correlation between those columns (pandas' default method)
corr_matrix_t = df_t_only.corr()

In [None]:
# Annotated heatmap of the T_* correlation matrix, drawn on an explicit Axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix_t,
    ax=heatmap_ax,
    cmap="coolwarm",
    center=0,                      # diverging palette centred on zero correlation
    annot=True,                    # write the value into each cell
    fmt=".2f",                     # two decimals
    annot_kws={"size": 8},
    cbar_kws={"label": "Correlation"},
    alpha=0.99,                    # near-opaque cells
)
heatmap_ax.set_title("Correlation Matrix of Temperature Sensors (T_*)", fontsize=14)
heatmap_fig.tight_layout()
plt.show()

### Basic Statistics

In [None]:
import numpy as np
import pandas as pd

# Temperature channels only (column names starting with "T_")
temp_columns = list(
    filter(lambda name: name.startswith("T_"), df_merged_clean.columns)
)
df_t_only = df_merged_clean[temp_columns]

# Pairwise correlation matrix of those channels
corr_matrix = df_t_only.corr()

# Mask that is True strictly above the diagonal, so each sensor pair is kept
# once and the self-correlations on the diagonal are excluded
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(above_diagonal)

# One value per kept pair; the masked-out cells are NaN and get dropped
correlations = upper_tri.unstack().dropna()

# Summary statistics of the pairwise correlations
print("📊 Descriptive statistics for pairwise T-sensor correlations:\n")
print(correlations.describe().round(3))

In [None]:
# Cell 1: Initialization — working copy of the cleaned data, hub groups, and per-sensor feature lists

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Independent working copy of the cleaned frame; the model-evaluation cells
# read features and targets from `df`.
df = df_merged_clean.copy()

# Sensor column names grouped by hub
hub_1 = [
    'T_15_pv1', 'T_15_pv2', 'T_17_pv1', 'T_17_pv2',
    'T_19_pv1', 'T_19_pv2', 'T_14_pv1', 'T_14_pv2',
]
hub_2 = [
    'T_16_pv1', 'T_16_pv2', 'T_13_pv1', 'T_13_pv2',
    'T_18_pv1', 'T_18_pv2', 'T_20_pv1', 'T_20_pv2',
]
hub_3 = [
    'T_11_pv1', 'T_11_pv2', 'T_12_pv1',
    'T_12_pv2', 'T_27_pv1', 'T_27_pv2',
]

# Each sensor is predicted from the sensors of the *other* two hubs, listed in
# hub order; a fresh list is built for every sensor.
sensor_to_features = {
    sensor: [
        feature
        for other_hub in (hub_1, hub_2, hub_3)
        if other_hub is not own_hub
        for feature in other_hub
    ]
    for own_hub in (hub_1, hub_2, hub_3)
    for sensor in own_hub
}

### Linear

In [None]:
# Cell: Load the pickled models back into Python

import pickle

# File holding the pickled linear models (same filename used when saving)
model_filename = "linear_models.pkl"

# Load the dictionary of models (iterated later as target-sensor -> model).
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open(model_filename, "rb") as f:
    loaded_linear_models = pickle.load(f)

print(f"Loaded {len(loaded_linear_models)} models from {model_filename}")

# Example: use one loaded model to predict
# sensor = list(loaded_linear_models.keys())[0]
# preds = loaded_linear_models[sensor].predict(df_merged_clean[sensor_to_features[sensor]])

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Metrics records, one per loaded LinearRegression model
records_linear = []

sensor_models = tqdm(
    loaded_linear_models.items(), desc="LinearRegression (Loaded)", unit="sensor"
)
for target, model in sensor_models:
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    # (random_state is irrelevant because shuffle=False)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted model on the hold-out rows only
    y_pred = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)

    # "Parameters" stays empty: nothing is recorded for LinearRegression
    records_linear.append(
        {"Target": target, "Parameters": {}, "RMSE": rmse, "R²": r2}
    )

# One row per target sensor
df_linear = pd.DataFrame(records_linear).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the linear models, followed by their summary statistics.
(df_linear[["RMSE", "R²"]]), df_linear[["RMSE", "R²"]].describe()

### Ridge

In [None]:
import pickle

# Load Ridge models from the saved pickle file.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open("ridge_models.pkl", "rb") as f:
    loaded_ridge_models = pickle.load(f)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Output list for metrics (one record per target sensor)
records = []

# Recompute metrics for every loaded Ridge model on the hold-out rows
for target, model in loaded_ridge_models.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X, y = df[feats], df[target]

    # Unshuffled 80/20 split: the last 20% of rows form the hold-out set
    # (random_state has no effect because shuffle=False)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # The loaded model is used as-is; it is not refitted here
    y_pred = model.predict(X_te)

    records.append({
        "Target": target,
        "Parameters": {"alpha": model.alpha},  # regularisation strength of the loaded model
        "RMSE": np.sqrt(mean_squared_error(y_te, y_pred)),
        "R²": r2_score(y_te, y_pred)
    })

# Convert to DataFrame, one row per target sensor
df_ridge = pd.DataFrame(records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the Ridge models, followed by their summary statistics.
(df_ridge[["RMSE", "R²"]]), df_ridge[["RMSE", "R²"]].describe()

### Ridge + Poly2 + TSCV

In [None]:
import pickle

# Load the Ridge + degree-2 polynomial models from their .pkl file.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open("ridge_poly2_tscv_models.pkl", "rb") as f:
    loaded_ridge_poly2_tscv_models = pickle.load(f)

In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# 1) Containers
records = []
tscv = TimeSeriesSplit(n_splits=5)

# 2) Evaluate the Ridge+Poly2 pipelines with time-series cross-validation.
#    Each fold fits a fresh clone, so the loaded (pickled) models stay exactly
#    as saved and can still be used for prediction in later cells.
for target, model in tqdm(loaded_ridge_poly2_tscv_models.items(), desc="Recompute Ridge+Poly2+TSCV", unit="sensor"):
    feats = sensor_to_features[target]
    X, y = df[feats].values, df[target].values

    # Out-of-fold predictions; rows never used as a test fold stay NaN
    y_preds = np.full_like(y, np.nan, dtype=float)

    for train_idx, test_idx in tscv.split(X):
        X_tr, X_te = X[train_idx], X[test_idx]
        y_tr = y[train_idx]

        # Unfitted copy with identical hyperparameters
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)
        y_preds[test_idx] = fold_model.predict(X_te)

    # Score only the rows that received an out-of-fold prediction
    valid_mask = ~np.isnan(y_preds)
    rmse = np.sqrt(mean_squared_error(y[valid_mask], y_preds[valid_mask]))
    r2 = r2_score(y[valid_mask], y_preds[valid_mask])

    records.append({
        "Target":     target,
        "Parameters": {"alpha": model.named_steps["ridge"].alpha},
        "RMSE":       rmse,
        "R²":         r2
    })

# 3) Wrap into a DataFrame, one row per target sensor
df_ridge_poly2_tscv = pd.DataFrame(records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the Ridge+Poly2 models, followed by their summary statistics.
(df_ridge_poly2_tscv[["RMSE", "R²"]]
 ), df_ridge_poly2_tscv[["RMSE", "R²"]].describe()

### KNeighborsRegressor

In [None]:
import pickle

# File holding the pickled KNN models
knn_model_filename = "models_knn.pkl"

# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open(knn_model_filename, "rb") as f:
    loaded_models_knn = pickle.load(f)

print(f"Loaded {len(loaded_models_knn)} KNN models from '{knn_model_filename}'")

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Metrics records, one per loaded KNN pipeline
recomputed_knn_records = []

for target, model in loaded_models_knn.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted pipeline on the hold-out rows
    y_pred = model.predict(X_te)

    # Hyperparameters are read from the pipeline step named "knn"
    knn_step = model.named_steps["knn"]
    recomputed_knn_records.append({
        "Target": target,
        "Parameters": {"k": knn_step.n_neighbors, "weights": knn_step.weights},
        "RMSE": np.sqrt(mean_squared_error(y_te, y_pred)),
        "R²": r2_score(y_te, y_pred),
    })

# One row per target sensor
df_knn = pd.DataFrame(recomputed_knn_records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the KNN models, followed by their summary statistics.
(df_knn[["RMSE", "R²"]]
 ), df_knn[["RMSE", "R²"]].describe()

### Random Forest

In [None]:
# Cell 2: Unpickle the RandomForest models in a new notebook
import pickle

# File holding the pickled RandomForest models
rf_model_filename = "models_rf.pkl"

# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open(rf_model_filename, "rb") as f:
    loaded_models_rf = pickle.load(f)

print(
    f"Loaded {len(loaded_models_rf)} RandomForest models from '{rf_model_filename}'")

In [None]:
# Cell 3: Re-evaluate loaded RF models on 80/20 time-ordered split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Metrics records, one per loaded RandomForest model
recomputed_rf_records = []

for target, model in loaded_models_rf.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted forest on the hold-out rows
    y_pred = model.predict(X_te)

    # Hyperparameters are read straight off the estimator
    rf_params = {
        "n_estimators": model.n_estimators,
        "max_depth": model.max_depth,
        "max_features": model.max_features,
    }
    recomputed_rf_records.append({
        "Target": target,
        "Parameters": rf_params,
        "RMSE": np.sqrt(mean_squared_error(y_te, y_pred)),
        "R²": r2_score(y_te, y_pred),
    })

# One row per target sensor
df_rf = pd.DataFrame(recomputed_rf_records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the RandomForest models, followed by their summary statistics.
(df_rf[["RMSE", "R²"]]
 ), df_rf[["RMSE", "R²"]].describe()

### MLP

In [None]:
# Cell 2: Load the pickled MLP models in a new notebook

import pickle

# File holding the pickled MLP models.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
mlp_model_filename = "mlp_models.pkl"
with open(mlp_model_filename, "rb") as f:
    loaded_mlp_models = pickle.load(f)

print(f"Loaded {len(loaded_mlp_models)} MLP models from '{mlp_model_filename}'")

In [None]:
# Cell 3: Recompute metrics (RMSE and R²) on the same 80/20 split

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Metrics records, one per loaded MLP model
records_mlp_recomputed = []

for target, model in loaded_mlp_models.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted model on the hold-out rows
    y_pred = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)

    # The model's full get_params() dict is stored as its parameters
    records_mlp_recomputed.append(
        {"Target": target, "Parameters": model.get_params(), "RMSE": rmse, "R²": r2}
    )

# One row per target sensor
df_mlp = pd.DataFrame(records_mlp_recomputed).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the MLP models, followed by their summary statistics.
(df_mlp[["RMSE", "R²"]]
 ), df_mlp[["RMSE", "R²"]].describe()

In [None]:
# Cell 2: Load the pickled GradientBoosting models in a new notebook

import pickle

# File holding the pickled GradientBoosting models.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
gb_model_filename = "models_gb.pkl"
with open(gb_model_filename, "rb") as f:
    loaded_models_gb = pickle.load(f)

print(
    f"Loaded {len(loaded_models_gb)} GradientBoosting models from '{gb_model_filename}'")

In [None]:
# Cell 3: Recompute metrics (RMSE and R²) on the same 80/20 split

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Metrics records, one per loaded GradientBoosting pipeline
recomputed_gb_records = []

for target, model in loaded_models_gb.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted pipeline on the hold-out rows
    y_pred = model.predict(X_te)

    # Hyperparameters are read from the pipeline step named "gbrt"
    gbrt_step = model.named_steps["gbrt"]
    recomputed_gb_records.append({
        "Target": target,
        "Parameters": {
            "n_estimators": gbrt_step.n_estimators,
            "max_depth": gbrt_step.max_depth,
            "learning_rate": gbrt_step.learning_rate,
        },
        "RMSE": np.sqrt(mean_squared_error(y_te, y_pred)),
        "R²": r2_score(y_te, y_pred),
    })

# One row per target sensor
df_gb = pd.DataFrame(recomputed_gb_records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the GradientBoosting models, followed by their summary statistics.
(df_gb[["RMSE", "R²"]]
 ), df_gb[["RMSE", "R²"]].describe()

### SVR

In [None]:
# Cell 2: Load the pickled SVR models in a new notebook

import pickle

# File holding the pickled SVR models.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
svr_model_filename = "models_svr.pkl"
with open(svr_model_filename, "rb") as f:
    loaded_models_svr = pickle.load(f)

print(f"Loaded {len(loaded_models_svr)} SVR models from '{svr_model_filename}'")

In [None]:
# Cell 3: Recompute metrics (RMSE and R²) on the same 80/20 split

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Metrics records, one per loaded SVR pipeline
recomputed_svr_records = []

for target, model in loaded_models_svr.items():
    # Feature columns for this target come from sensor_to_features
    feats = sensor_to_features[target]
    X = df[feats]
    y = df[target]

    # Unshuffled split: the last 20% of rows form the hold-out set
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # Score the already-fitted pipeline on the hold-out rows
    y_pred = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)

    # Hyperparameters are read from the pipeline step named "svr"
    svr_step = model.named_steps["svr"]
    svr_params = {"C": svr_step.C, "epsilon": svr_step.epsilon, "gamma": svr_step.gamma}

    recomputed_svr_records.append(
        {"Target": target, "Parameters": svr_params, "RMSE": rmse, "R²": r2}
    )

# One row per target sensor
df_svr = pd.DataFrame(recomputed_svr_records).set_index("Target")

In [None]:
# Per-sensor RMSE/R² of the SVR models, followed by their summary statistics.
(df_svr[["RMSE", "R²"]]
 ), df_svr[["RMSE", "R²"]].describe()

## Compare

In [None]:
import pandas as pd

# Select only the RMSE and R² columns from each results DataFrame
df_linear_metrics = df_linear[["RMSE", "R²"]]
df_ridge_metrics = df_ridge[["RMSE", "R²"]]
df_ridge_poly2_metrics = df_ridge_poly2_tscv[["RMSE", "R²"]]
df_knn_metrics = df_knn[["RMSE", "R²"]]
df_rf_metrics = df_rf[["RMSE", "R²"]]
df_mlp_metrics = df_mlp[["RMSE", "R²"]]
df_gb_metrics = df_gb[["RMSE", "R²"]]
df_svr_metrics = df_svr[["RMSE", "R²"]]

# Concatenate them side-by-side with a nested (MultiIndex) column header:
# level 0 = model name, level 1 = metric.
# The model names must match the keys of color_scheme, model_pred_dfs and
# model_stats_dfs exactly, because the best-model names derived from this
# table are used to index those dictionaries.
# NOTE(review): the RidgePoly2 metrics come from TimeSeriesSplit
# cross-validation, while the other models are scored on the last 20% of
# rows, so the columns are not strictly like-for-like.
df_comparison = pd.concat(
    {
        "Linear":           df_linear_metrics,
        "Ridge":            df_ridge_metrics,
        "RidgePoly2":       df_ridge_poly2_metrics,
        "KNN":              df_knn_metrics,
        "RandomForest":     df_rf_metrics,
        "MLP":              df_mlp_metrics,
        "GradientBoosting": df_gb_metrics,
        "SVR":              df_svr_metrics
    },
    axis=1
)

In [None]:
# Spot check: every model's RMSE/R² for the first sensor row of the table.
print(df_comparison.iloc[0])

In [None]:
# Model names = first level of df_comparison's column MultiIndex
model_names = df_comparison.columns.levels[0].tolist()

# For every sensor pick the model with the smallest RMSE; on ties the model
# appearing first in model_names wins.
best_by_rmse = {
    sensor: min(model_names, key=lambda name: row[(name, "RMSE")])
    for sensor, row in df_comparison.iterrows()
}

# One-column table (sensor -> "BestModel"), ordered by sensor name
df_best_models = pd.DataFrame.from_dict(
    best_by_rmse, orient="index", columns=["BestModel"]
).sort_index()


In [None]:
# Best (lowest-RMSE) model for every sensor.
print(df_best_models)

In [None]:
# Number of sensors for which each model is the best one.
print(df_best_models.value_counts())

In [None]:
# Model names = first level of df_comparison's column MultiIndex
model_names = df_comparison.columns.levels[0].tolist()

# For every sensor, order the models from lowest to highest RMSE and label the
# positions Rank_1, Rank_2, ...
ranked_models = {}
for sensor, row in df_comparison.iterrows():
    # sorted() is stable, so tied models keep their model_names order
    ordered = sorted(model_names, key=lambda name: row[(name, "RMSE")])
    ranked_models[sensor] = {
        f"Rank_{position}": name
        for position, name in enumerate(ordered, start=1)
    }

# Rows = sensors (sorted by name), columns = Rank_1 ... Rank_N
df_model_ranks = pd.DataFrame.from_dict(ranked_models, orient="index").sort_index()

In [None]:
# df_model_ranks has one column per rank position (Rank_1 ... Rank_N)
rank_columns = df_model_ranks.columns

# How often each model occupies each rank position
per_rank_counts = [df_model_ranks[rank].value_counts() for rank in rank_columns]

# Side-by-side table; a model that never reaches a given rank gets 0
rank_summary = pd.concat(per_rank_counts, axis=1)
rank_summary = rank_summary.fillna(0).astype(int)

# Label the columns with the rank names
rank_summary.columns = rank_columns

In [None]:
# Rows = models, columns = rank positions, values = number of sensors.
rank_summary

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Non-sensor metadata columns, copied unchanged into every predictions frame
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
# Independent copy so prediction frames never alias df_merged_clean
base_df = df_merged_clean[cols_non_sensors].copy()

# Template to reduce redundancy


def generate_predictions(model_dict, name):
    """Predict every sensor in ``model_dict`` over all rows of ``df_merged_clean``.

    Parameters:
    - model_dict: mapping sensor name -> fitted model exposing ``predict``
    - name:       label used only in the progress-bar description

    Returns a new DataFrame with the same columns, in the same order, as
    ``df_merged_clean``: the metadata columns are copied from ``base_df`` and
    each sensor column holds that sensor's model predictions. Selecting
    ``df_merged_clean.columns`` raises KeyError if one of those columns has
    neither a model in ``model_dict`` nor a column in ``base_df``.

    Reads the notebook globals ``base_df``, ``df_merged_clean`` and
    ``sensor_to_features``.
    """
    predictions = {
        s: model.predict(df_merged_clean[sensor_to_features[s]])
        for s, model in tqdm(model_dict.items(), desc=f"Predicting {name}", unit="sensor")
    }
    # assign() works on a copy of base_df, so base_df itself is not modified
    df_pred = base_df.assign(**predictions)
    return df_pred[df_merged_clean.columns].copy()


# Generate one full-length predictions frame per model family; the second
# argument is only the progress-bar label.
df_linear_merged_clean = generate_predictions(loaded_linear_models, "Linear")
df_ridge_merged_clean = generate_predictions(loaded_ridge_models, "Ridge")
df_ridge_poly2_tscv_merged_clean = generate_predictions(
    loaded_ridge_poly2_tscv_models, "RidgePoly2")
df_knn_merged_clean = generate_predictions(loaded_models_knn, "KNN")
df_rf_merged_clean = generate_predictions(loaded_models_rf, "RandomForest")
df_mlp_merged_clean = generate_predictions(loaded_mlp_models, "MLP")
df_gb_merged_clean = generate_predictions(loaded_models_gb, "GradientBoosting")
df_svr_merged_clean = generate_predictions(loaded_models_svr, "SVR")

In [None]:
# Fixed hex colour per trace/model name, shared by the plotly and matplotlib plots
color_scheme = dict(
    Measured="#000000",
    Linear="#1f77b4",
    Ridge="#2ca02c",
    RidgePoly2="#ff7f0e",
    KNN="#9467bd",
    RandomForest="#17becf",
    MLP="#e377c2",
    GradientBoosting="#d62728",
    SVR="#8c564b",
)

In [None]:
import plotly.graph_objects as go

# Sensor whose measured and predicted series are compared in this figure
sensor = "T_14_pv1"

# Measured values are drawn as semi-transparent markers
measured_trace = go.Scatter(
    x=df_merged_clean["Datetime"],
    y=df_merged_clean[sensor],
    mode="markers",
    name="Measured",
    marker={"size": 6, "color": color_scheme["Measured"], "opacity": 0.6},
)

# New figure, starting with the measured trace
fig = go.Figure()
fig.add_trace(measured_trace)

# Helper function to add prediction line


def add_prediction_trace(df_pred, model_name):
    """Add one model's prediction line to the notebook-global figure ``fig``.

    Parameters:
    - df_pred:    predictions DataFrame with a "Datetime" column and a column
                  for the notebook-global ``sensor``
    - model_name: legend label; must also be a key of ``color_scheme``
    """
    prediction_line = go.Scatter(
        x=df_pred["Datetime"],
        y=df_pred[sensor],
        mode="lines",
        name=model_name,
        line={"width": 2, "color": color_scheme[model_name]},
    )
    fig.add_trace(prediction_line)


# One prediction line per model, added in a fixed order
for pred_frame, label in (
    (df_linear_merged_clean, "Linear"),
    (df_ridge_merged_clean, "Ridge"),
    (df_ridge_poly2_tscv_merged_clean, "RidgePoly2"),
    (df_knn_merged_clean, "KNN"),
    (df_rf_merged_clean, "RandomForest"),
    (df_mlp_merged_clean, "MLP"),
    (df_gb_merged_clean, "GradientBoosting"),
    (df_svr_merged_clean, "SVR"),
):
    add_prediction_trace(pred_frame, label)

# Titles, theme, a horizontal legend above the plot area, and a fixed size
fig.update_layout(
    title=f"Predictions vs Measured: {sensor}",
    xaxis_title="Datetime",
    yaxis_title="Temperature [°C]",
    template="plotly_white",
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    width=1100,
    height=600,
)

fig.show()

In [None]:
import os
import matplotlib.pyplot as plt

# ──────────────────────────────────────────────────────────────────────────────
# Matplotlib version of plot_and_save
# ──────────────────────────────────────────────────────────────────────────────


def plot_and_save_matplotlib(sensor, model_name, df_measured, df_predicted, df_stats, out_dir="images"):
    """
    Creates a scatter of measured vs predicted for a given sensor and model using matplotlib,
    annotates RMSE/R², and saves the figure as a PNG.

    The file is written to ``<out_dir>/<model_name>_<sensor>.png``. The RMSE/R²
    values shown are read from ``df_stats``; they are not recomputed from the
    plotted points.

    Parameters:
    - sensor:        String (e.g. "T_14_pv1")
    - model_name:    String matching a key in color_scheme (e.g. "Ridge")
    - df_measured:   DataFrame with actual sensor values (must contain the sensor column)
    - df_predicted:  DataFrame with predictions from this model (same sensor columns)
    - df_stats:      DataFrame indexed by sensor, with columns ["RMSE","R²"] for this model
    - out_dir:       Subfolder where images are saved (default "images")

    Returns:
    - None. The figure is always closed, even if drawing or saving fails.

    Raises:
    - KeyError if ``sensor`` is missing from any of the frames or ``model_name``
      is not a key of ``color_scheme``.
    """
    # Ensure output directory exists
    os.makedirs(out_dir, exist_ok=True)

    measured = df_measured[sensor].values
    predicted = df_predicted[sensor].values

    rmse_val = df_stats.loc[sensor, "RMSE"]
    r2_val = df_stats.loc[sensor, "R²"]
    annotation_text = f"Model: {model_name}\nR² = {r2_val:.3f}\nRMSE = {rmse_val:.3f}"

    # Create figure and axes
    fig, ax = plt.subplots(figsize=(4.0, 4.0))
    try:
        # Scatter: measured vs predicted
        ax.scatter(
            measured,
            predicted,
            color=color_scheme[model_name],
            s=16,
            alpha=0.6,
            label="Data"
        )

        # y = x reference line spanning the joint range of both series
        min_val = min(measured.min(), predicted.min())
        max_val = max(measured.max(), predicted.max())
        ax.plot([min_val, max_val], [min_val, max_val],
                color="gray", linestyle="--", label="y = x")

        # Annotation of RMSE and R² in the lower-right corner (axes coordinates)
        ax.text(
            0.95, 0.05,
            annotation_text,
            transform=ax.transAxes,
            fontsize=12,
            ha="right",
            va="bottom",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white",
                      edgecolor="black", alpha=0.8)
        )

        # Labels and title
        ax.set_title(f"{model_name}: Comparison for {sensor}")
        ax.set_xlabel("Measured")
        ax.set_ylabel("Predicted")

        # Legend
        ax.legend(loc="upper left")

        # Save as PNG; lay out this specific figure rather than whichever
        # figure pyplot currently considers active
        outfile = os.path.join(out_dir, f"{model_name}_{sensor}.png")
        fig.tight_layout()
        fig.savefig(outfile, dpi=150)
    finally:
        # Release the figure even on failure so repeated calls do not
        # accumulate open figures
        plt.close(fig)

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Single-call test with matplotlib version (for example)
# ──────────────────────────────────────────────────────────────────────────────

# Sensor/model pair used for a one-off check of the plotting helper
test_sensor = "T_14_pv1"
test_model = "Ridge"

# Single call: measured frame, the Ridge predictions and the Ridge metrics
plot_and_save_matplotlib(
    test_sensor,
    test_model,
    df_merged_clean,
    df_ridge_merged_clean,
    df_ridge,
    out_dir="images",
)

print(f"Saved image for {test_model}_{test_sensor}.png to './images'")

In [None]:
# Model name -> (full-length predictions frame, per-sensor metrics frame).
# The keys are the same names used in color_scheme.
model_data = {
   "Linear":           (df_linear_merged_clean, df_linear),
   "Ridge":            (df_ridge_merged_clean, df_ridge),
   "RidgePoly2":       (df_ridge_poly2_tscv_merged_clean, df_ridge_poly2_tscv),
   "KNN":              (df_knn_merged_clean, df_knn),
   "RandomForest":     (df_rf_merged_clean, df_rf),
   "MLP":              (df_mlp_merged_clean, df_mlp),
   "GradientBoosting": (df_gb_merged_clean, df_gb),
   "SVR":              (df_svr_merged_clean, df_svr)
}


In [None]:
import os

# ──────────────────────────────────────────────────────────────────────────────
# Split model_data's (predictions, stats) pairs into two per-model lookups
# ──────────────────────────────────────────────────────────────────────────────
model_pred_dfs = {
    model_name: pair[0] for model_name, pair in model_data.items()
}
model_stats_dfs = {
    model_name: pair[1] for model_name, pair in model_data.items()
}

# ──────────────────────────────────────────────────────────────────────────────
# Create the output folder once, up front
# ──────────────────────────────────────────────────────────────────────────────
os.makedirs("images", exist_ok=True)

# ──────────────────────────────────────────────────────────────────────────────
# For every sensor, save the measured-vs-predicted plot of its best model only
# ──────────────────────────────────────────────────────────────────────────────
for sensor, best_model in df_best_models["BestModel"].items():
    # Predictions and metrics belonging to this sensor's best model
    df_predicted = model_pred_dfs[best_model]
    df_stats = model_stats_dfs[best_model]

    plot_and_save_matplotlib(
        sensor,
        best_model,
        df_merged_clean,
        df_predicted,
        df_stats,
        out_dir="images",
    )

In [None]:

# for model_name, (df_predicted, df_stats) in model_data.items():
#     for sensor in df_stats.index:
#         plot_and_save_matplotlib(
#             sensor=sensor,
#             model_name=model_name,
#             df_measured=df_merged_clean,
#             df_predicted=df_predicted,
#             df_stats=df_stats,
#             out_dir="images"
#         )