In [None]:
import pandas as pd

# First set of corrected CSV exports (one DataFrame per file)
df_temp_1, df_temp_2, df_temp_3 = map(
    pd.read_csv,
    (
        "df_temp_1_corrected.csv",
        "df_temp_2_corrected.csv",
        "df_temp_3_corrected.csv",
    ),
)

# Second set of corrected CSV exports (files carry the "df_2temp_" prefix)
df_temp_1_2, df_temp_2_2, df_temp_3_2 = map(
    pd.read_csv,
    (
        "df_2temp_1_corrected.csv",
        "df_2temp_2_corrected.csv",
        "df_2temp_3_corrected.csv",
    ),
)

In [None]:
def _drop_dt_columns(frame):
    """Return `frame` restricted to the columns whose name does not start with 'dT'."""
    keep = ~frame.columns.str.startswith('dT')
    return frame.loc[:, keep]


# First set of frames
df_temp_1, df_temp_2, df_temp_3 = (
    _drop_dt_columns(frame) for frame in (df_temp_1, df_temp_2, df_temp_3)
)

# Second set of frames
df_temp_1_2, df_temp_2_2, df_temp_3_2 = (
    _drop_dt_columns(frame) for frame in (df_temp_1_2, df_temp_2_2, df_temp_3_2)
)

In [None]:
# Remaining column labels of the three first-set frames
tuple(frame.columns for frame in (df_temp_1, df_temp_2, df_temp_3))

In [None]:
# Column labels of all six frames (first set, then second set)
tuple(
    frame.columns
    for frame in (df_temp_1, df_temp_2, df_temp_3, df_temp_1_2, df_temp_2_2, df_temp_3_2)
)

In [None]:
# (rows, columns) of all six frames (first set, then second set)
tuple(
    frame.shape
    for frame in (df_temp_1, df_temp_2, df_temp_3, df_temp_1_2, df_temp_2_2, df_temp_3_2)
)

In [None]:
import pandas as pd

# Requires df_temp_1, df_temp_2, df_temp_3 (batch 1) and
# df_temp_1_2, df_temp_2_2, df_temp_3_2 (batch 2) to be loaded already.


def append_newer_rows(base, extra, time_col="Time"):
    """Return `base` followed by the rows of `extra` that are newer than `base`.

    Parameters
    ----------
    base : pd.DataFrame
        Earlier batch; all of its rows are kept.
    extra : pd.DataFrame
        Later batch; only rows whose `time_col` is strictly greater than the
        maximum `time_col` of `base` are appended.
    time_col : str, default "Time"
        Column holding the comparable time value in both frames.

    Returns
    -------
    pd.DataFrame
        New frame with a fresh 0..n-1 index (inputs are not modified).
    """
    cutoff = base[time_col].max()
    if pd.isna(cutoff):
        # `base` has no usable time value (e.g. it is empty). Any comparison
        # against NaN is False, which would silently discard every row of
        # `extra`, so take the whole second batch instead.
        newer = extra.copy()
    else:
        newer = extra[extra[time_col] > cutoff].copy()
    # ignore_index=True gives the combined frame a new contiguous index
    return pd.concat([base, newer], ignore_index=True)


df_temp_1 = append_newer_rows(df_temp_1, df_temp_1_2)
df_temp_2 = append_newer_rows(df_temp_2, df_temp_2_2)
df_temp_3 = append_newer_rows(df_temp_3, df_temp_3_2)

# After running this, df_temp_1, df_temp_2, df_temp_3 each hold their batch-1
# rows followed by the strictly newer rows of the matching batch-2 frame.

In [None]:
# Helper shared by all per-hub frames
def rename_and_add_datetime(df):
    """Rename "Time" to "Epochs" and add a "Datetime" column parsed from it.

    "Epochs" is interpreted as seconds (unit="s"). The input frame is left
    untouched; a new frame is returned.
    """
    renamed = df.rename(columns={"Time": "Epochs"})
    return renamed.assign(Datetime=pd.to_datetime(renamed["Epochs"], unit="s"))


# Run the helper over the three combined frames
df_temp_1, df_temp_2, df_temp_3 = map(
    rename_and_add_datetime, (df_temp_1, df_temp_2, df_temp_3)
)

In [None]:
def resample_hourly(df):
    """Average every column of `df` into 1-hour bins keyed on "Datetime".

    "Datetime" is used as the resampling index and then restored as a regular
    column, so the result has one row per hourly bin.
    """
    hourly_means = df.set_index("Datetime").resample("1h").mean()
    return hourly_means.reset_index()


# Hourly means for each of the three frames
df_temp_1, df_temp_2, df_temp_3 = map(
    resample_hourly, (df_temp_1, df_temp_2, df_temp_3)
)

In [None]:
# Drop the (averaged) epoch column and give each frame's "Voltage" column a
# hub-specific name so the three frames can be merged without clashes.
df_temp_1 = df_temp_1.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H1"})
df_temp_2 = df_temp_2.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H2"})
df_temp_3 = df_temp_3.drop(columns=["Epochs"]).rename(columns={"Voltage": "Volt_H3"})

In [None]:
# Column labels of all three frames after the drop/rename step
# (the third entry previously repeated df_temp_2, so df_temp_3 was never shown)
df_temp_1.columns, df_temp_2.columns, df_temp_3.columns

In [None]:
# Outer-join the three hourly frames on Datetime (hours present in any frame
# are kept), then order chronologically with a fresh index.
df_merged = (
    df_temp_1
    .merge(df_temp_2, on="Datetime", how="outer")
    .merge(df_temp_3, on="Datetime", how="outer")
    .sort_values("Datetime")
    .reset_index(drop=True)
)

In [None]:
# Column labels of the merged frame
df_merged.columns

In [None]:
# Rows of the merged frame that have a missing value in at least one column
has_nan = df_merged.isna().any(axis=1)
rows_with_nans = df_merged.loc[has_nan]
print(rows_with_nans.head())

In [None]:
# How many merged rows contain at least one NaN, out of the total row count
print(f"Rows with any NaNs: {rows_with_nans.shape[0]} of {df_merged.shape[0]}")

In [None]:
import plotly.graph_objects as go

# One shared figure for every temperature series
fig = go.Figure()

# (label, frame) pairs, one per hub
hubs = [('Hub 1', df_temp_1), ('Hub 2', df_temp_2), ('Hub 3', df_temp_3)]

# One line trace per "T_"-prefixed column, plotted against the Datetime column.
# (A name starting with "T_" can never start with "dT_", so no extra check is needed.)
for hub_name, df in hubs:
    temperature_cols = [name for name in df.columns if name.startswith("T_")]
    for col in temperature_cols:
        trace = go.Scatter(
            x=df['Datetime'],
            y=df[col],
            mode='lines',
            name=f"{hub_name} - {col}"
        )
        fig.add_trace(trace)

# Titles, theme and size
layout_options = dict(
    title="All Temperature Series by Datetime",
    xaxis_title="Datetime",
    yaxis_title="Temperature (°C)",
    template="plotly_white",
    height=800
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Keep only the rows with no missing value in any column; renumber the index from 0
df_merged_clean = df_merged.dropna().reset_index(drop=True)

In [None]:
# Row counts before and after dropping rows that contain NaNs
print(f"Original rows: {len(df_merged)}")
print(f"Cleaned rows:  {len(df_merged_clean)}")

In [None]:
# Hour of day (0-23) taken from each row's Datetime; used later as a model input
df_merged_clean["Hour"] = df_merged_clean["Datetime"].dt.hour

In [None]:
# Column labels of the cleaned frame (now including "Hour")
df_merged_clean.columns

In [None]:
# (rows, columns) of the cleaned frame
df_merged_clean.shape

### Correlation matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the columns whose name starts with "T_" (the temperature sensors)
df_t_only = df_merged_clean.filter(regex=r"^T_")
temp_columns = list(df_t_only.columns)

# Pairwise correlation between those columns (pandas default method)
corr_matrix_t = df_t_only.corr()

In [None]:
# Display the sensor-by-sensor correlation table
corr_matrix_t

In [None]:
# Annotated heatmap of the temperature-sensor correlation matrix
heatmap_options = dict(
    cmap="coolwarm",
    center=0,                           # colour scale centred on zero correlation
    annot=True,                         # print the value in every cell
    fmt=".2f",                          # two decimals
    annot_kws={"size": 8},
    cbar_kws={"label": "Correlation"},
    alpha=0.99                          # opacity
)

plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix_t, **heatmap_options)
plt.title("Correlation Matrix of Temperature Sensors (T_*)", fontsize=14)
plt.tight_layout()
plt.show()

### Basic Statistics

In [None]:
import numpy as np
import pandas as pd

# Temperature-sensor columns only
temp_columns = [name for name in df_merged_clean.columns if name.startswith("T_")]
df_t_only = df_merged_clean[temp_columns]

# Full correlation matrix between sensors
corr_matrix = df_t_only.corr()

# Boolean mask that is True strictly above the diagonal, so each sensor pair
# is counted once and the self-correlations are left out
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(above_diagonal)

# One value per sensor pair (masked cells are NaN and get dropped)
correlations = upper_tri.unstack().dropna()

# Summary statistics of the pairwise correlations
print("📊 Descriptive statistics for pairwise T-sensor correlations:\n")
print(correlations.describe().round(3))

In [None]:
# Cell 1: Initialization with predefined MultiIndex columns

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Work on a copy so the modelling cells never touch df_merged_clean itself
df = df_merged_clean.copy()

# Sensor columns grouped by hub
hub_1 = [
    'T_15_pv1', 'T_15_pv2', 'T_17_pv1', 'T_17_pv2',
    'T_19_pv1', 'T_19_pv2', 'T_14_pv1', 'T_14_pv2',
]
hub_2 = [
    'T_16_pv1', 'T_16_pv2', 'T_13_pv1', 'T_13_pv2',
    'T_18_pv1', 'T_18_pv2', 'T_20_pv1', 'T_20_pv2',
]
hub_3 = [
    'T_11_pv1', 'T_11_pv2', 'T_12_pv1',
    'T_12_pv2', 'T_27_pv1', 'T_27_pv2',
]

# Each sensor is predicted from the sensors of the two OTHER hubs only.
# Insertion order is hub_1, hub_2, hub_3; every sensor gets its own list object.
sensor_to_features = {}
for own_hub, other_hubs in (
    (hub_1, hub_2 + hub_3),
    (hub_2, hub_1 + hub_3),
    (hub_3, hub_1 + hub_2),
):
    sensor_to_features.update({s: list(other_hubs) for s in own_hub})

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fitted LinearRegression per target sensor
linear_models = {}

# One metrics row per target sensor (same layout as the other model families)
records_linear = []

for target, feats in tqdm(sensor_to_features.items(), desc="LinearRegression", unit="sensor"):
    X = df[feats]
    y = df[target]

    # shuffle=False keeps row order: first 80% of rows train, last 20% test
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    lr = LinearRegression()
    lr.fit(X_tr, y_tr)
    linear_models[target] = lr

    # Metrics are computed on the held-out 20% only
    y_pred = lr.predict(X_te)
    test_rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    test_r2 = r2_score(y_te, y_pred)

    records_linear.append({
        "Target":     target,
        "Parameters": {},  # plain LinearRegression has nothing to tune here
        "RMSE":       test_rmse,
        "R²":         test_r2
    })

# Metrics table indexed by sensor name
df_linear = pd.DataFrame(records_linear)
df_linear = df_linear.set_index("Target")

In [None]:
# Cell: Save the trained models to disk using pickle

import pickle

# Output file for the pickled dict of LinearRegression models
# (relative path, so it is written to the current working directory)
model_filename = "linear_models.pkl"

# Serialize the whole {sensor: model} dict into that single file
with open(model_filename, "wb") as f:
    pickle.dump(linear_models, f)

print(f"Saved {len(linear_models)} models to {model_filename}")

In [None]:
# Per-sensor hold-out metrics, followed by their summary statistics
linear_metrics = df_linear[["RMSE", "R²"]]
linear_metrics, linear_metrics.describe()

In [None]:
from tqdm.auto import tqdm
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# 1000 regularisation strengths, log-spaced from 1e-5 to 10**3.5
alphas = np.logspace(-5, 3.5, 1000)

records = []        # one metrics row per sensor
ridge_models = {}   # best Ridge model per sensor

for target, feats in tqdm(sensor_to_features.items(), desc="Ridge", unit="sensor"):
    X = df[feats]
    y = df[target]
    # Time-ordered split: first 80% of rows train, last 20% test
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False)

    # NOTE(review): alpha is selected on the same 20% hold-out whose RMSE/R² are
    # reported below, so the reported metrics are optimistic for this model.
    best = dict(alpha=None, rmse=np.inf, r2=None, model=None)
    for a in tqdm(alphas, desc=f"  α-loop for {target}", leave=False, unit="α"):
        model = Ridge(alpha=a)
        model.fit(X_tr, y_tr)
        pred = model.predict(X_te)
        rmse = np.sqrt(mean_squared_error(y_te, pred))
        if not rmse < best["rmse"]:
            continue  # strictly-better only: the first alpha wins ties
        best = {"alpha": a, "rmse": rmse, "r2": r2_score(y_te, pred), "model": model}

    # Keep the winning model for this sensor
    ridge_models[target] = best["model"]

    # And its metrics
    records.append({
        "Target": target,
        "Parameters": {"alpha": best["alpha"]},
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

# Metrics table indexed by sensor name
df_ridge = pd.DataFrame(records)
df_ridge = df_ridge.set_index("Target")

In [None]:
import pickle

# Serialize the whole {sensor: Ridge model} dict into one pickle file
# (relative path, so it is written to the current working directory)
with open("ridge_models.pkl", "wb") as f:
    pickle.dump(ridge_models, f)

In [None]:
# Per-sensor Ridge results, followed by summary statistics of the numeric columns
df_ridge, df_ridge.describe()

In [None]:
from tqdm.auto import tqdm
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# 100 regularisation strengths, log-spaced from 1e-5 to 10**3.5
alphas = np.logspace(-5, 3.5, 100)

records = []                   # one metrics row per sensor
ridge_poly2_tscv_models = {}   # best pipeline per sensor, refit on all rows

# Five expanding-window folds; each test fold lies after its training rows
tscv = TimeSeriesSplit(n_splits=5)

for target, feats in tqdm(sensor_to_features.items(), desc="Ridge + Poly2 + TSCV", unit="sensor"):
    # Plain NumPy arrays: these pipelines are fitted WITHOUT feature names
    X = df[feats].values
    y = df[target].values

    best = dict(alpha=None, rmse=np.inf, r2=None, model=None)

    for alpha in alphas:
        pipe = Pipeline(steps=[
            ("poly", PolynomialFeatures(degree=2, include_bias=False)),
            ("scale", StandardScaler()),
            ("ridge", Ridge(alpha=alpha)),
        ])

        # Out-of-fold predictions; rows never used as a test fold stay NaN
        y_preds = np.full(y.shape, np.nan, dtype=float)
        for train_idx, test_idx in tscv.split(X):
            pipe.fit(X[train_idx], y[train_idx])
            y_preds[test_idx] = pipe.predict(X[test_idx])

        # Score only the rows that actually received a prediction
        valid_mask = ~np.isnan(y_preds)
        y_true_cv = y[valid_mask]
        y_pred_cv = y_preds[valid_mask]
        rmse = np.sqrt(mean_squared_error(y_true_cv, y_pred_cv))
        r2 = r2_score(y_true_cv, y_pred_cv)

        if rmse < best["rmse"]:
            best = {"alpha": alpha, "rmse": rmse, "r2": r2, "model": pipe}

    # The stored pipeline was last fitted on the final fold; refit on every row
    best["model"].fit(X, y)
    ridge_poly2_tscv_models[target] = best["model"]

    # RMSE / R² here are the cross-validated (out-of-fold) scores
    records.append({
        "Target": target,
        "Parameters": {"alpha": best["alpha"]},
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

df_ridge_poly2_tscv = pd.DataFrame(records)
df_ridge_poly2_tscv = df_ridge_poly2_tscv.set_index("Target")

In [None]:
import pickle

# Serialize the whole {sensor: pipeline} dict into one pickle file
# (relative path, so it is written to the current working directory)
with open("ridge_poly2_tscv_models.pkl", "wb") as f:
    pickle.dump(ridge_poly2_tscv_models, f)

In [None]:
# Per-sensor cross-validated results, followed by summary statistics of the numeric columns
df_ridge_poly2_tscv, df_ridge_poly2_tscv.describe()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

records_knn = []   # one metrics row per sensor
models_knn = {}    # best fitted pipeline per sensor

neighbors = range(1, 11)
weights = ["uniform", "distance"]
# Every (k, weighting) candidate, k-major, i.e. the order nested loops would give
knn_grid = [(k, w) for k in neighbors for w in weights]

for target, feats in tqdm(sensor_to_features.items(), desc="KNN", unit="sensor"):
    X = df[feats]
    y = df[target]
    # Time-ordered split: first 80% of rows train, last 20% test
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False)

    # NOTE(review): the winner is chosen on the same hold-out that is reported
    best = dict(k=None, w=None, rmse=np.inf, r2=None, model=None)

    for k, w in knn_grid:
        # Features are standardised before the distance-based regressor
        pipe = Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("knn", KNeighborsRegressor(n_neighbors=k, weights=w)),
        ])
        pred = pipe.fit(X_tr, y_tr).predict(X_te)
        rmse = np.sqrt(mean_squared_error(y_te, pred))
        r2 = r2_score(y_te, pred)

        if rmse < best["rmse"]:
            best = {"k": k, "w": w, "rmse": rmse, "r2": r2, "model": pipe}

    models_knn[target] = best["model"]
    records_knn.append({
        "Target": target,
        "Parameters": {"k": best["k"], "weights": best["w"]},
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

df_knn = pd.DataFrame(records_knn)
df_knn = df_knn.set_index("Target")

In [None]:
import pickle

# Output file for the pickled {sensor: best KNN pipeline} dict
# (relative path, so it is written to the current working directory)
knn_model_filename = "models_knn.pkl"

# Serialize the whole dict into that single file
with open(knn_model_filename, "wb") as f:
    pickle.dump(models_knn, f)

print(f"Saved {len(models_knn)} KNN models to '{knn_model_filename}'")

In [None]:
# Per-sensor KNN results, followed by summary statistics of the numeric columns
df_knn, df_knn.describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

records = []         # one metrics row per sensor
model_dict_rf = {}   # best fitted forest per sensor

ests = [100, 200]
depths = [10, 20]
feats_opts = [0.65, 0.85]
# Every (n_estimators, max_depth, max_features) candidate, in nested-loop order
rf_grid = [(n, d, f) for n in ests for d in depths for f in feats_opts]

for target, feats in tqdm(sensor_to_features.items(), desc="RandomForest", unit="sensor"):
    X = df[feats]
    y = df[target]
    # Time-ordered split: first 80% of rows train, last 20% test
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, shuffle=False, random_state=42)

    # NOTE(review): the winner is chosen on the same hold-out that is reported
    best = dict(params=None, rmse=np.inf, r2=None, model=None)

    for n, d, f in rf_grid:
        model = RandomForestRegressor(
            n_estimators=n,
            max_depth=d,
            max_features=f,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_tr, y_tr)

        pred = model.predict(X_te)
        rmse = np.sqrt(mean_squared_error(y_te, pred))
        r2 = r2_score(y_te, pred)

        if rmse < best["rmse"]:
            best = {
                "params": {"n_estimators": n, "max_depth": d, "max_features": f},
                "rmse": rmse,
                "r2": r2,
                "model": model,
            }

    model_dict_rf[target] = best["model"]

    records.append({
        "Target": target,
        "Parameters": best["params"],
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

df_rf = pd.DataFrame(records)
df_rf = df_rf.set_index("Target")

In [None]:
# Cell 1: Pickle the trained RandomForest models
import pickle

# Output file for the pickled {sensor: best RandomForest} dict
# (relative path, so it is written to the current working directory)
rf_model_filename = "models_rf.pkl"

# Serialize the whole dict into that single file
with open(rf_model_filename, "wb") as f:
    pickle.dump(model_dict_rf, f)

print(f"Saved {len(model_dict_rf)} RandomForest models to '{rf_model_filename}'")

In [None]:
# Turn the column of parameter dicts into one column per hyperparameter,
# carrying over the sensor-name index of df_rf
rf_param_dicts = df_rf["Parameters"]
df_rf_params = pd.json_normalize(rf_param_dicts)
df_rf_params = df_rf_params.set_index(df_rf.index)

# Metrics without the dict column, joined with the expanded parameters
rf_metrics_only = df_rf.drop(columns="Parameters")
df_rf_expanded = rf_metrics_only.join(df_rf_params)

print(df_rf_expanded)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# Containers
# NOTE: the next MLP cell rebinds records_mlp, mlp_models, mlp_config and df_mlp,
# so on a top-to-bottom run the results of this cell are replaced before any
# later cell reads them.
records_mlp = []
mlp_models = {}

# You can tweak these MLP hyperparameters as needed:
mlp_config = {
    "hidden_layer_sizes": (100,),  # one hidden layer of 100 neurons
    "activation": "relu",
    "alpha": 0.0001,               # L2 penalty
    "learning_rate_init": 0.001,
    "max_iter": 1000,
    # Fixed seed: without it the weight initialisation and batch shuffling
    # change on every run and the recorded RMSE / R² are not reproducible.
    "random_state": 42
}

for target, feats in tqdm(sensor_to_features.items(), desc="MLP Regressor", unit="sensor"):
    # 1) Split train/test (80/20 time-ordered; shuffle=False keeps row order)
    X, y = df[feats], df[target]
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # 2) Build pipeline: scale → MLP
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(**mlp_config))
    ])

    # 3) Fit on the 80% training portion
    pipe.fit(X_tr, y_tr)

    # 4) Evaluate on 20% hold-out
    y_pred = pipe.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)

    # 5) Save the fitted pipeline and metrics
    mlp_models[target] = pipe
    records_mlp.append({
        "Target":     target,
        "Parameters": mlp_config.copy(),  # copy so later edits to mlp_config don't alter the record
        "RMSE":       rmse,
        "R²":         r2
    })

# 6) Create a DataFrame of metrics, indexed by sensor name
df_mlp = pd.DataFrame(records_mlp).set_index("Target")

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# ── 1) Sensor → voltage column of the hub that sensor belongs to ─────────────
volt_map = {}
for volt_name, hub_sensors in (("Volt_H1", hub_1), ("Volt_H2", hub_2), ("Volt_H3", hub_3)):
    volt_map.update(dict.fromkeys(hub_sensors, volt_name))

# ── 2) Containers and MLP settings (two hidden layers + early stopping) ──────
records_mlp = []
mlp_models = {}

mlp_config = dict(
    hidden_layer_sizes=(100, 50),    # two layers: 100 → 50
    activation="relu",
    alpha=0.0001,
    learning_rate_init=0.001,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42,
)

for target, feats in tqdm(sensor_to_features.items(), desc="MLP Regressor", unit="sensor"):
    # 3) Inputs, in this column order: other-hub sensors, the target's own
    #    hub voltage, then the hour of day
    volt_col = volt_map[target]
    X = df[feats + [volt_col, "Hour"]]
    y = df[target]

    # 4) Time-ordered 80/20 split (shuffle=False keeps row order)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # 5) Standardise, then MLP; 6) fit on the training 80%
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(**mlp_config)),
    ])
    pipe.fit(X_tr, y_tr)

    # 7) Score on the held-out 20%
    y_pred = pipe.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)

    # 8) Keep the fitted pipeline and its metrics
    mlp_models[target] = pipe
    records_mlp.append({
        "Target":     target,
        "Parameters": {"Volt": volt_col, "Hour": True, **mlp_config},
        "RMSE":       rmse,
        "R²":         r2
    })

# 9) Metrics table indexed by sensor name
df_mlp = pd.DataFrame(records_mlp)
df_mlp = df_mlp.set_index("Target")

In [None]:
# Per-sensor hold-out RMSE and R² of the MLP pipelines
print(df_mlp[['RMSE', 'R²']])

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# Search space
Cs = np.logspace(0, 4, 10)        # 10 values from 10^0 to 10^4
epsilons = [0.01, 0.05, 0.1]
gammas = ["scale"]                # "auto" could be added here as well
# Every (C, ε, γ) candidate, in nested-loop order
svr_grid = [(C, epsilon, gamma) for C in Cs for epsilon in epsilons for gamma in gammas]

records_svr = []   # one metrics row per sensor
svr_models = {}    # best fitted pipeline per sensor

for target, feats in tqdm(sensor_to_features.items(), desc="SVR", unit="sensor"):
    X = df[feats]
    y = df[target]
    # Time-ordered split: first 80% of rows train, last 20% test
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # NOTE(review): the winner is chosen on the same hold-out that is reported
    best = dict(C=None, epsilon=None, gamma=None, rmse=np.inf, r2=None, model=None)

    for C, epsilon, gamma in svr_grid:
        pipe = Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("svr", SVR(kernel="rbf", C=C, epsilon=epsilon, gamma=gamma)),
        ])
        y_pred = pipe.fit(X_tr, y_tr).predict(X_te)
        rmse_val = np.sqrt(mean_squared_error(y_te, y_pred))
        r2_val = r2_score(y_te, y_pred)

        if rmse_val < best["rmse"]:
            best = {
                "C": C, "epsilon": epsilon, "gamma": gamma,
                "rmse": rmse_val, "r2": r2_val, "model": pipe,
            }

    # Keep the winning pipeline for this sensor
    svr_models[target] = best["model"]

    # And its metrics
    records_svr.append({
        "Target": target,
        "Parameters": {"C": best["C"], "epsilon": best["epsilon"], "gamma": best["gamma"]},
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

# Metrics table indexed by sensor name
df_svr = pd.DataFrame(records_svr)
df_svr = df_svr.set_index("Target")

In [None]:
# Cell 1: Save the trained SVR models to disk using pickle

import pickle

# Output file for the pickled {sensor: best SVR pipeline} dict
# (relative path, so it is written to the current working directory)
svr_model_filename = "models_svr.pkl"
with open(svr_model_filename, "wb") as f:
    pickle.dump(svr_models, f)

print(f"Saved {len(svr_models)} SVR models to '{svr_model_filename}'")

In [None]:
# Per-sensor hold-out metrics, then their summary statistics.
# Two separate statements: joining the print() calls with a comma builds a
# tuple of their return values, so the cell would also display "(None, None)".
svr_metrics = df_svr[["RMSE", "R²"]]
print(svr_metrics)
print(svr_metrics.describe())

In [None]:
# Cell 1: Train GradientBoostingRegressor over a small hyperparameter grid and save the best per sensor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# Search space
n_estimators_options = [100, 200, 400]
max_depth_options = [3, 5, 8]
learning_rate_options = [0.01, 0.05, 0.1]
# Every (n_estimators, max_depth, learning_rate) candidate, in nested-loop order
gb_grid = [
    (n_est, md, rate)
    for n_est in n_estimators_options
    for md in max_depth_options
    for rate in learning_rate_options
]

records_gb = []   # one metrics row per sensor
gb_models = {}    # best fitted pipeline per sensor

for target, feats in tqdm(sensor_to_features.items(), desc="GBRT Training", unit="sensor"):
    # 1) Inputs/target and the time-ordered 80/20 split (shuffle=False keeps row order)
    X = df[feats]
    y = df[target]
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False
    )

    # NOTE(review): the winner is chosen on the same hold-out that is reported
    best = dict(n_estimators=None, max_depth=None,
                learning_rate=None, rmse=np.inf, r2=None, model=None)

    # 2) Try every candidate; a strictly lower hold-out RMSE replaces the best
    for n_est, md, lr in gb_grid:
        booster = GradientBoostingRegressor(
            n_estimators=n_est,
            max_depth=md,
            learning_rate=lr,
            random_state=42
        )
        pipe = Pipeline(steps=[("scaler", StandardScaler()), ("gbrt", booster)])
        y_pred = pipe.fit(X_tr, y_tr).predict(X_te)
        rmse_val = np.sqrt(mean_squared_error(y_te, y_pred))
        r2_val = r2_score(y_te, y_pred)

        if rmse_val < best["rmse"]:
            best = {
                "n_estimators": n_est,
                "max_depth": md,
                "learning_rate": lr,
                "rmse": rmse_val,
                "r2": r2_val,
                "model": pipe,
            }

    # 3) Keep the winning pipeline and its metrics for this sensor
    gb_models[target] = best["model"]
    records_gb.append({
        "Target": target,
        "Parameters": {
            "n_estimators": best["n_estimators"],
            "max_depth": best["max_depth"],
            "learning_rate": best["learning_rate"]
        },
        "RMSE": best["rmse"],
        "R²": best["r2"]
    })

# 4) Metrics table indexed by sensor name
df_gb = pd.DataFrame(records_gb)
df_gb = df_gb.set_index("Target")

### Predictions

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# ──────────────────────────────────────────────────────────────────────────────
# PREREQUISITES (all defined in earlier cells of this notebook):
# • `linear_models`: dict mapping each sensor name to its fitted
#   LinearRegression (trained on DataFrame inputs, so feature names are known).
# • `df_merged_clean`, `sensor_to_features`, `hub_1`, `hub_2`, `hub_3`.
# ──────────────────────────────────────────────────────────────────────────────

predictions = {}

for target, feats in tqdm(sensor_to_features.items(), desc="Predict from Saved Models", unit="sensor"):
    # Keep the inputs as a DataFrame so the column names match those seen at fit time
    X_full_df = df_merged_clean[feats]

    # The trained models live in `linear_models`; the previously referenced
    # name `models` is never defined and raised NameError on a fresh kernel.
    lr_model = linear_models[target]

    # Predict the sensor over every row (training and hold-out rows alike)
    y_pred = lr_model.predict(X_full_df)

    predictions[target] = y_pred

# Start from the non-sensor columns, then add one predicted column per sensor
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_linear_merged_clean = df_merged_clean[cols_non_sensors].copy()

for s in hub_1 + hub_2 + hub_3:
    df_linear_merged_clean[s] = predictions[s]

# Reorder columns to match the original
df_linear_merged_clean = df_linear_merged_clean[df_merged_clean.columns].copy()

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Assumes:
# - df_merged_clean is defined and cleaned
# - ridge_models contains trained Ridge models per sensor
# - sensor_to_features, hub_1, hub_2, hub_3 are defined

# 1) + 2) Full-series prediction for every sensor from its stored Ridge model
#    (inputs stay a DataFrame so the column names match those used at fit time)
ridge_predictions = {
    target: ridge_models[target].predict(df_merged_clean[feats])
    for target, feats in tqdm(sensor_to_features.items(), desc="Predicting (Ridge)", unit="sensor")
}

# 3) + 4) Non-sensor columns first, then the predicted sensors hub by hub
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_ridge_merged_clean = df_merged_clean[cols_non_sensors].assign(
    **{s: ridge_predictions[s] for s in hub_1 + hub_2 + hub_3}
)

# 5) Same column order as the measured frame
desired_order = list(df_merged_clean.columns)
df_ridge_merged_clean = df_ridge_merged_clean[desired_order].copy()

In [None]:
# Start from the non-sensor columns of the measured frame
df_ridge_poly2_tscv_merged_clean = df_merged_clean[[
    "Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]].copy()

for group in [hub_1, hub_2, hub_3]:
    for sensor in group:
        feats = sensor_to_features[sensor]
        # These pipelines were fitted on plain NumPy arrays (`.values`), so
        # feed arrays here too; a DataFrame would trigger sklearn's
        # "X has feature names" warning at predict time.
        X_full = df_merged_clean[feats].values
        # The trained pipelines are stored in `ridge_poly2_tscv_models`; the
        # previously referenced name `models_ridge_poly2_tscv` is never defined.
        model = ridge_poly2_tscv_models[sensor]
        df_ridge_poly2_tscv_merged_clean[sensor] = model.predict(X_full)

# Reorder columns to match original df_merged_clean
df_ridge_poly2_tscv_merged_clean = df_ridge_poly2_tscv_merged_clean[df_merged_clean.columns]

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Predicted counterpart of df_merged_clean, built from the trained KNN pipelines
# ─────────────────────────────────────────────────────────────────────────────

# Full-series prediction per sensor, each from its own feature columns
predictions_knn = {
    sensor: models_knn[sensor].predict(df_merged_clean[sensor_to_features[sensor]])
    for sensor in tqdm(models_knn.keys(), desc="Predict (KNN)", unit="sensor")
}

# Non-sensor columns first, then the predicted sensors hub by hub
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_knn_merged_clean = df_merged_clean[cols_non_sensors].assign(
    **{s: predictions_knn[s] for s in hub_1 + hub_2 + hub_3}
)

# Same column order as the measured frame
df_knn_merged_clean = df_knn_merged_clean[df_merged_clean.columns]

In [None]:
# Full-series prediction per sensor from its best fitted forest
# (this rebinds `predictions`, which an earlier cell also uses)
predictions = {}

for sensor in model_dict_rf:
    rf_inputs = df_merged_clean[sensor_to_features[sensor]]
    predictions[sensor] = model_dict_rf[sensor].predict(rf_inputs)

# Non-sensor columns form the base
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_rf_merged_clean = df_merged_clean[cols_non_sensors].copy()

# Predicted sensors are added hub by hub
for s in hub_1 + hub_2 + hub_3:
    df_rf_merged_clean[s] = predictions[s]

# Same column order as the measured frame
df_rf_merged_clean = df_rf_merged_clean[df_merged_clean.columns]

In [None]:
# 1) Start from the non-sensor columns of the measured frame
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_mlp_merged_clean = df_merged_clean[cols_non_sensors].copy()

# 2) Predict each sensor's full series using the fitted MLP pipelines
for sensor in tqdm(mlp_models.keys(), desc="Predicting (MLP)", unit="sensor"):
    # The pipelines in `mlp_models` come from the last MLP training cell, which
    # fits on: other-hub sensors + the sensor's own Volt_H# column + "Hour".
    # Passing only the sensor columns gives the scaler a different feature set
    # than it was fitted on, so rebuild the same columns in the same order.
    feats = sensor_to_features[sensor] + [volt_map[sensor], "Hour"]
    X_full = df_merged_clean[feats]

    # Use DataFrame directly so scaler sees column names
    pipe = mlp_models[sensor]
    df_mlp_merged_clean[sensor] = pipe.predict(X_full)

# 3) Re-order columns to match original df_merged_clean
df_mlp_merged_clean = df_mlp_merged_clean[df_merged_clean.columns]

In [None]:
# Cell 2: Use saved SVR pipelines to predict full‐series and build df_svr_merged_clean

import pandas as pd
from tqdm.auto import tqdm

# 1) Non-sensor columns carried over from the measured frame
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]

# 2) Full-series prediction per sensor from its SVR pipeline
#    (inputs stay a DataFrame so the column names match those used at fit time)
svr_predictions = {
    sensor: model.predict(df_merged_clean[sensor_to_features[sensor]])
    for sensor, model in tqdm(svr_models.items(), desc="Predicting (SVR)", unit="sensor")
}
df_svr_merged_clean = df_merged_clean[cols_non_sensors].assign(**svr_predictions)

# 3) Same column order as the measured frame
df_svr_merged_clean = df_svr_merged_clean[df_merged_clean.columns].copy()

In [None]:
# Cell 2: Use saved GBRT pipelines to predict full‐series and build df_gb_merged_clean

import pandas as pd
from tqdm.auto import tqdm

# 1) Begin with a copy of the columns that are not sensor readings
cols_non_sensors = ["Datetime", "Volt_H1", "Volt_H2", "Volt_H3", "Hour"]
df_gb_merged_clean = df_merged_clean.loc[:, cols_non_sensors].copy()

# 2) Fill in every sensor column with its GBRT pipeline's full-series prediction
gb_progress = tqdm(gb_models.items(), desc="Predicting (GBRT)", unit="sensor")
for sensor, model in gb_progress:
    feats = sensor_to_features[sensor]
    # Slice as a DataFrame so the feature names travel with the data
    X_full = df_merged_clean[feats]
    df_gb_merged_clean[sensor] = model.predict(X_full)

# 3) Line the columns up with df_merged_clean and keep an independent copy
df_gb_merged_clean = df_gb_merged_clean.loc[:, df_merged_clean.columns].copy()

### Plots

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go


def plot_prediction_vs_measured_all(df_measured, df_predicted, sensors, df_stats):
    """
    Show one measured-vs-predicted scatter figure per sensor.

    Each figure contains the data points, a dashed y = x reference line that
    spans the measured range, and an annotation with the sensor's R² and RMSE.

    Parameters:
    - df_measured: DataFrame with actual sensor values.
    - df_predicted: DataFrame with predicted sensor values. Rows are paired
      with df_measured by position (the index is discarded via `.values`).
    - sensors: iterable of column names to compare.
    - df_stats: DataFrame with "RMSE" and "R²" columns (e.g., df_linear),
      indexed by sensor.

    Returns:
    - None. Every figure is rendered with `fig.show()`.

    Raises:
    - KeyError if a sensor is missing from any of the three DataFrames.
    """
    for sensor in sensors:
        measured_series = df_measured[sensor]
        measured = measured_series.values
        predicted = df_predicted[sensor].values

        # Pull RMSE and R² from stats DataFrame
        rmse_val = df_stats.loc[sensor, "RMSE"]
        r2_val = df_stats.loc[sensor, "R²"]

        # Annotation text with only R² (Model) and RMSE
        annotation_text = (
            f"R² = {r2_val:.3f}<br>RMSE = {rmse_val:.3f}"
        )

        # Create scatter plot
        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x=measured,
            y=predicted,
            mode="markers",
            name="Data",
            marker=dict(size=4, color="#196EE6", opacity=0.6)
        ))

        # Add y = x line. A straight line only needs its two endpoints; feeding
        # it every sample would double the figure payload and leave gaps where
        # the measured series is NaN. Series.min()/max() skip NaN by default.
        lo, hi = measured_series.min(), measured_series.max()
        if pd.notna(lo) and pd.notna(hi):
            fig.add_trace(go.Scatter(
                x=[lo, hi],
                y=[lo, hi],
                mode="lines",
                name="y = x",
                line=dict(color="gray", dash="dash")
            ))

        # Annotate stats
        fig.add_annotation(
            text=annotation_text,
            xref="paper", yref="paper",
            x=0.98, y=0.02,
            showarrow=False,
            font=dict(size=13),
            align="right"
        )

        fig.update_layout(
            title=f"Measured vs Predicted for {sensor}",
            xaxis_title="Measured",
            yaxis_title="Predicted",
            width=650,
            height=650,
            template="plotly_white",
            legend=dict(orientation="h", yanchor="bottom",
                        y=1.02, xanchor="right", x=1)
        )

        fig.show()

In [None]:
# Find sensor with best R², lowest RMSE, worst R², and highest RMSE (linear model)
best_r2_sensor = df_linear["R²"].idxmax()
lowest_rmse_sensor = df_linear["RMSE"].idxmin()
worst_r2_sensor = df_linear["R²"].idxmin()
highest_rmse_sensor = df_linear["RMSE"].idxmax()

# De-duplicate (one sensor can satisfy several criteria) while keeping a fixed
# order. A set gives no ordering guarantee, and string hashes are randomized per
# interpreter session, so list({...}) could reorder the sensors between runs.
selected_sensors = list(dict.fromkeys(
    [best_r2_sensor, lowest_rmse_sensor, worst_r2_sensor, highest_rmse_sensor]
))

# # Plot them using existing function
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_linear_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_linear
# )

In [None]:
# Find sensor with best R², lowest RMSE, worst R², and highest RMSE for Ridge
best_r2_sensor = df_ridge["R²"].idxmax()
lowest_rmse_sensor = df_ridge["RMSE"].idxmin()
worst_r2_sensor = df_ridge["R²"].idxmin()
highest_rmse_sensor = df_ridge["RMSE"].idxmax()

# Unique list of sensors to plot, in a fixed order. dict.fromkeys drops
# duplicates but keeps first-seen order; a set would iterate in hash order,
# which is not stable between interpreter sessions for string labels.
selected_sensors = list(dict.fromkeys([
    best_r2_sensor,
    lowest_rmse_sensor,
    worst_r2_sensor,
    highest_rmse_sensor,
]))

# # Plot them using the same plot function
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_ridge_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_ridge
# )

In [None]:
# Identify extremes in df_ridge_poly2_tscv: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_ridge_poly2_tscv["R²"].idxmax()
lowest_rmse_sensor = df_ridge_poly2_tscv["RMSE"].idxmin()
worst_r2_sensor = df_ridge_poly2_tscv["R²"].idxmin()
highest_rmse_sensor = df_ridge_poly2_tscv["RMSE"].idxmax()

# Order-preserving de-duplication: unlike list({...}), whose order follows the
# (per-session randomized) string hashes, this always yields the same sequence.
selected_sensors = list(dict.fromkeys([
    best_r2_sensor,
    lowest_rmse_sensor,
    worst_r2_sensor,
    highest_rmse_sensor,
]))

# # Plot
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_ridge_poly2_tscv_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_ridge_poly2_tscv
# )

In [None]:
# Find extreme sensors based on df_knn: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_knn["R²"].idxmax()
lowest_rmse_sensor = df_knn["RMSE"].idxmin()
worst_r2_sensor = df_knn["R²"].idxmin()
highest_rmse_sensor = df_knn["RMSE"].idxmax()

# Remove duplicates but keep the order above; a set would hand the sensors back
# in hash order, which can change from one interpreter session to the next.
selected_sensors = list(dict.fromkeys(
    [best_r2_sensor, lowest_rmse_sensor, worst_r2_sensor, highest_rmse_sensor]
))

# # Use your existing function to visualize
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_knn_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_knn
# )

In [None]:
# Best/worst selectors from df_rf: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_rf["R²"].idxmax()
lowest_rmse_sensor = df_rf["RMSE"].idxmin()
worst_r2_sensor = df_rf["R²"].idxmin()
highest_rmse_sensor = df_rf["RMSE"].idxmax()

# dict.fromkeys de-duplicates while preserving insertion order, so the list is
# reproducible; list({...}) would follow set (hash) order, which is not.
selected_sensors = list(dict.fromkeys(
    [best_r2_sensor, lowest_rmse_sensor, worst_r2_sensor, highest_rmse_sensor]
))

# # Use previously defined plot function (assumed to be `plot_prediction_vs_measured_all`)
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_rf_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_rf
# )

In [None]:
# 1) Identify extremes in df_mlp: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_mlp["R²"].idxmax()
lowest_rmse_sensor = df_mlp["RMSE"].idxmin()
worst_r2_sensor = df_mlp["R²"].idxmin()
highest_rmse_sensor = df_mlp["RMSE"].idxmax()

# Keep each sensor once, in the order listed. A set literal would de-duplicate
# too, but its iteration order depends on string hashes that are randomized per
# interpreter session, so the resulting list would not be reproducible.
selected_sensors = list(dict.fromkeys([
    best_r2_sensor,
    lowest_rmse_sensor,
    worst_r2_sensor,
    highest_rmse_sensor,
]))

# # 2) Plot them
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_mlp_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_mlp
# )

In [None]:
# Select the best/worst-performing sensors for SVR

# 1) Identify extremes in df_svr: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_svr["R²"].idxmax()
lowest_rmse_sensor = df_svr["RMSE"].idxmin()
worst_r2_sensor = df_svr["R²"].idxmin()
highest_rmse_sensor = df_svr["RMSE"].idxmax()

# Order-preserving de-duplication. list({...}) would return the sensors in set
# (hash) order, which is not stable across interpreter sessions for strings.
selected_sensors = list(dict.fromkeys([
    best_r2_sensor,
    lowest_rmse_sensor,
    worst_r2_sensor,
    highest_rmse_sensor,
]))

# # 2) Plot measured vs predicted for those sensors
# plot_prediction_vs_measured_all(
#     df_measured=df_merged_clean,
#     df_predicted=df_svr_merged_clean,
#     sensors=selected_sensors,
#     df_stats=df_svr
# )

In [None]:
# Plot the best/worst-performing sensors for GBRT using the existing plot function

# 1) Identify extremes in df_gb: best/worst R² and lowest/highest RMSE
best_r2_sensor = df_gb["R²"].idxmax()
lowest_rmse_sensor = df_gb["RMSE"].idxmin()
worst_r2_sensor = df_gb["R²"].idxmin()
highest_rmse_sensor = df_gb["RMSE"].idxmax()

# De-duplicate while keeping the order above so the figures always appear in the
# same sequence. list({...}) would follow set (hash) order, and string hashes
# are randomized per interpreter session, shuffling the plots between runs.
selected_sensors = list(dict.fromkeys([
    best_r2_sensor,
    lowest_rmse_sensor,
    worst_r2_sensor,
    highest_rmse_sensor,
]))

# 2) Plot measured vs predicted for those sensors
plot_prediction_vs_measured_all(
    df_measured=df_merged_clean,
    df_predicted=df_gb_merged_clean,
    sensors=selected_sensors,
    df_stats=df_gb
)

In [None]:
import plotly.graph_objects as go
import numpy as np


def plot_residual_histogram(sensor, df_measured, df_predicted, nbins=40):
    """
    Plot a histogram of residuals (Predicted − Measured) for a given sensor.

    Non-finite residuals (NaN or ±inf in either input) are dropped before the
    statistics and the histogram are computed, so gaps in the data do not turn
    the annotated mean/σ into NaN.

    Parameters:
      • sensor: string, column name of the sensor to analyze
      • df_measured: DataFrame containing the true 'Measured' series
      • df_predicted: DataFrame containing the model 'Predicted' series;
        rows are paired with df_measured by position
      • nbins: integer, passed to Plotly as `nbinsx` (default: 40)

    Returns:
      None. The figure is rendered with `fig.show()`.

    Raises:
      KeyError if `sensor` is not a column of both DataFrames.
    """
    # 1) Compute residuals on plain float arrays and keep only finite values
    measured = df_measured[sensor].to_numpy(dtype=float, na_value=np.nan)
    predicted = df_predicted[sensor].to_numpy(dtype=float, na_value=np.nan)
    resid = predicted - measured
    resid = resid[np.isfinite(resid)]

    # 2) Compute statistics (sample std needs at least two residuals)
    mu = resid.mean() if resid.size else float("nan")
    sigma = resid.std(ddof=1) if resid.size > 1 else float("nan")

    # 3) Build histogram
    fig = go.Figure()
    fig.add_trace(go.Histogram(
        x=resid,
        nbinsx=nbins,
        name="Residuals",
        marker_color="#196EE6",
        opacity=0.75
    ))

    # 4) Add a vertical line at zero. Plotly treats nbinsx as an upper bound
    #    when it auto-bins, so the tallest bar cannot be predicted with
    #    np.histogram(bins=nbins); add_vline spans the full plot height instead.
    fig.add_vline(
        x=0,
        line_color="red",
        line_dash="dash",
        annotation_text="Zero",
        annotation_position="top left"
    )

    # 5) Annotate mean ± std
    annotation_text = (
        f"mean = {mu:.3f}<br>σ = {sigma:.3f}"
    )
    fig.add_annotation(
        text=annotation_text,
        xref="paper", yref="paper",
        x=0.95, y=0.95,
        showarrow=False,
        font=dict(size=12),
        bordercolor="black",
        borderwidth=1,
        bgcolor="white",
        align="right"
    )

    # 6) Layout
    fig.update_layout(
        title=f"{sensor}: Residuals (Predicted − Measured)",
        xaxis_title="Residual value",
        yaxis_title="Count",
        template="plotly_white",
        width=700,
        height=500,
        legend=dict(orientation="h", yanchor="bottom",
                    y=1.02, xanchor="right", x=1)
    )

    fig.show()

In [None]:
# Sanity check: display the (rows, columns) shape of the SVR prediction frame
df_svr_merged_clean.shape

In [None]:
# Residual distribution for the sensor on which SVR reaches its highest R²
best_svr = df_svr["R²"].idxmax()
plot_residual_histogram(best_svr, df_merged_clean, df_svr_merged_clean, nbins=500)

In [None]:
# Draw the SVR residual histogram of every sensor listed in df_svr's index
all_sensors = df_svr.index.tolist()

# Arguments shared by every call; only the sensor changes per iteration
svr_hist_kwargs = dict(
    df_measured=df_merged_clean,
    df_predicted=df_svr_merged_clean,
    nbins=480,
)
for sensor in all_sensors:
    plot_residual_histogram(sensor=sensor, **svr_hist_kwargs)