# MVM case study for assessing operational modes of energy production in relation to CO2 emissions
##### by: Dr. Györk Fülöp - 19.01.2026

## 1. Import modules

In [1]:
import random

import nbformat
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from IPython.display import HTML
from nbconvert import HTMLExporter
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 2. Read in dataset - from silver schema

In [2]:
# Load the silver-layer CSV and show its column names as the cell output.
silver_csv_path = "00_data/mvm_silver.csv"
df = pd.read_csv(silver_csv_path)
df.columns

Index(['measurement_time', 'net_teny_rendszerterheles', 'net_hazai_termeles',
       'nuklearis', 'barnakoszen', 'gaz', 'olaj', 'szel', 'biomassza',
       'szemet', 'folyo', 'viztarozos', 'egyeb_megujulo', 'geotermikus',
       'egyeb_primer', 'feketekoszen', 'ipari_pv', 'hmke_pv', 'scte_pv',
       'imp_ex', 'co2_becsult', 'co2_teny', 'day_of_week', 'hour_of_day'],
      dtype='object')

## 3. Definition of operational modes 
Assess and cluster the hourly measurements of national energy production and CO2 emissions.

### 3.1 Cluster analysis

In [3]:
# Clustering configuration (kept together so the tuned values are easy to find).
DBSCAN_EPS = 0.222        # eps passed to DBSCAN; distances are measured on the standardised features
DBSCAN_MIN_SAMPLES = 15   # min_samples passed to DBSCAN
COLOR_SEED = 42           # fixed seed so the trace colours are the same on every run

# Cluster on log-production versus measured CO2.
df["net_hazai_termeles_log"] = np.log(df["net_hazai_termeles"])
features = ["net_hazai_termeles_log", "co2_teny"]
# np.log yields -inf for a zero input; dropna() alone would keep such rows and
# StandardScaler rejects infinities, so treat them as missing as well.
X = df[features].replace([np.inf, -np.inf], np.nan).dropna()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

dbscan = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
clusters = dbscan.fit_predict(X_scaled)

# Keep the original index of X so the labels can be aligned back onto df later.
df_clustered = X.copy()
df_clustered["cluster"] = clusters

# Per-cluster mean and row count of every clustered feature.
cluster_profile_df = df_clustered.groupby("cluster").agg({
    col: ["mean", "count"] for col in df_clustered.columns if col != "cluster"
}).reset_index()

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names;
# filter(None, ...) drops the empty second level of the "cluster" column.
cluster_profile_df.columns = ["_".join(filter(None, col)) for col in cluster_profile_df.columns]

X_scaled_df = pd.DataFrame(X_scaled, columns=features)

unique_clusters = sorted(set(clusters))
# A private, seeded generator gives reproducible colours without touching the
# global `random` state.
color_rng = random.Random(COLOR_SEED)
colors = ["#%06x" % color_rng.randint(0, 0xFFFFFF) for _ in unique_clusters]

fig = go.Figure()
for i, cluster_id in enumerate(unique_clusters):
    mask = clusters == cluster_id
    fig.add_trace(
        go.Scatter(
            x=X_scaled_df.loc[mask, "net_hazai_termeles_log"],
            y=X_scaled_df.loc[mask, "co2_teny"],
            mode="markers",
            # DBSCAN labels unassigned points -1; show them as "Noise" in the legend.
            name=f"Cluster {cluster_id}" if cluster_id != -1 else "Noise",
            marker={"color": colors[i]}))
fig.update_layout(title="DBSCAN Clustering", xaxis_title=features[0], yaxis_title=features[-1], showlegend=True)

HTML(fig.to_html(include_plotlyjs="cdn"))

In [4]:
# Convert the log-scale production statistics back to the original scale.
# NOTE: exp(mean(log x)) is the geometric mean of production per cluster, not
# the arithmetic mean; the column keeps its "_mean" name so the exported
# schema is unchanged.
log_mean_col = "net_hazai_termeles_log_mean"
log_count_col = "net_hazai_termeles_log_count"

# The log columns are removed below, so only transform while they still exist.
# This makes the cell safe to re-run instead of failing with a KeyError.
if log_mean_col in cluster_profile_df.columns:
    cluster_profile_df = (
        cluster_profile_df
        .assign(
            net_hazai_termeles_mean=lambda profile: np.exp(profile[log_mean_col]),
            net_hazai_termeles_count=lambda profile: profile[log_count_col],
        )
        .drop(columns=[log_mean_col, log_count_col])
    )

cluster_profile_df


Unnamed: 0,cluster,co2_teny_mean,co2_teny_count,net_hazai_termeles_mean,net_hazai_termeles_count
0,-1,480.553477,172,4939.553998,172
1,0,312.728594,1483,3447.883523,1483
2,1,908.566544,373,4039.258128,373
3,2,572.820112,143,3519.770292,143
4,3,319.320533,30,6871.467081,30


### 3.2 Output clustered data and cluster profiles to gold schema

In [5]:
# Attach the DBSCAN label to every measurement and remove the helper log column.
# assign() aligns the labels on the row index (rows that were left out of the
# clustering get NaN) and overwrites an existing "cluster" column rather than
# appending a duplicate; errors="ignore" tolerates an already-removed helper
# column. Together they make this cell safe to re-run.
df = (
    df
    .assign(cluster=df_clustered["cluster"])
    .drop(columns=["net_hazai_termeles_log"], errors="ignore")
)

df.to_csv("00_data/mvm_with_clusters_gold.csv", index=False)

cluster_profile_df.to_csv("00_data/mvm_cluster_profiles_gold.csv", index=False)

## 4. Correlations of energy production and CO2 emissions in clusters 

### 4.1 Cluster_0

In [6]:
# Correlation matrix (pandas .corr() defaults) between measured CO2 and the
# individual generation sources, restricted to the rows labelled cluster 0.
correlation_columns = [
    "co2_teny",
    "gaz",
    "barnakoszen",
    "biomassza",
    "feketekoszen",
    "szemet",
    "olaj",
    "nuklearis",
    "ipari_pv",
    "hmke_pv",
    "folyo",
    "viztarozos",
    "geotermikus",
    "scte_pv",
]
df_sub_0 = df.loc[df["cluster"] == 0]
correlation_features_0 = df_sub_0[correlation_columns].corr()
correlation_features_0

Unnamed: 0,co2_teny,gaz,barnakoszen,biomassza,feketekoszen,szemet,olaj,nuklearis,ipari_pv,hmke_pv,folyo,viztarozos,geotermikus,scte_pv
co2_teny,1.0,0.68786,0.835023,0.116257,-0.096189,0.075989,0.060271,-0.210881,-0.039924,-0.063355,0.113513,0.20171,0.015207,0.04648
gaz,0.68786,1.0,0.369062,0.123662,0.071575,0.182336,0.047621,-0.239497,-0.1272,-0.163541,0.17883,0.172106,0.016746,0.074276
barnakoszen,0.835023,0.369062,1.0,0.102662,-0.213147,-0.093959,0.04426,-0.089455,-0.015372,-0.027016,-0.015847,0.178759,0.034106,0.041891
biomassza,0.116257,0.123662,0.102662,1.0,0.092865,0.025388,0.005728,-0.156679,0.011727,0.022564,0.116085,-0.011952,0.08959,-0.163748
feketekoszen,-0.096189,0.071575,-0.213147,0.092865,1.0,0.293656,-0.031005,-0.086392,-0.088084,-0.086195,0.387096,0.031568,-0.102461,0.152978
szemet,0.075989,0.182336,-0.093959,0.025388,0.293656,1.0,-0.129351,-0.303363,-0.152932,-0.142952,0.321342,0.328755,-0.065971,-0.014863
olaj,0.060271,0.047621,0.04426,0.005728,-0.031005,-0.129351,1.0,0.047851,-0.008694,-0.007994,-0.021281,0.010858,-0.005852,0.002579
nuklearis,-0.210881,-0.239497,-0.089455,-0.156679,-0.086392,-0.303363,0.047851,1.0,0.078631,0.09665,-0.622973,-0.418473,0.122972,0.077114
ipari_pv,-0.039924,-0.1272,-0.015372,0.011727,-0.088084,-0.152932,-0.008694,0.078631,1.0,0.974493,-0.120004,-0.092638,0.011661,0.399615
hmke_pv,-0.063355,-0.163541,-0.027016,0.022564,-0.086195,-0.142952,-0.007994,0.09665,0.974493,1.0,-0.133817,-0.104275,0.024533,0.399365


### 4.2 Cluster_1

In [7]:
# Correlation matrix (pandas .corr() defaults) between measured CO2 and the
# individual generation sources, restricted to the rows labelled cluster 1.
correlation_columns = [
    "co2_teny",
    "gaz",
    "barnakoszen",
    "biomassza",
    "feketekoszen",
    "szemet",
    "olaj",
    "nuklearis",
    "ipari_pv",
    "hmke_pv",
    "folyo",
    "viztarozos",
    "geotermikus",
    "scte_pv",
]
df_sub_1 = df.loc[df["cluster"] == 1]
correlation_features_1 = df_sub_1[correlation_columns].corr()
correlation_features_1

Unnamed: 0,co2_teny,gaz,barnakoszen,biomassza,feketekoszen,szemet,olaj,nuklearis,ipari_pv,hmke_pv,folyo,viztarozos,geotermikus,scte_pv
co2_teny,1.0,0.683587,0.583065,0.02329,-0.229322,-0.033112,0.136875,0.249342,-0.237907,-0.277508,-0.183152,-0.133936,0.073972,0.344492
gaz,0.683587,1.0,0.147697,0.20588,-0.139085,0.004861,0.000813,0.236458,-0.304162,-0.362936,-0.204646,-0.298292,-0.001265,0.460547
barnakoszen,0.583065,0.147697,1.0,-0.05708,-0.103857,-0.087293,0.18195,0.269519,-0.197193,-0.177878,-0.234811,-0.067712,0.186272,0.070843
biomassza,0.02329,0.20588,-0.05708,1.0,0.235949,-0.277113,-0.03803,0.189817,-0.042377,-0.001263,-0.090095,-0.392717,0.207818,-0.198714
feketekoszen,-0.229322,-0.139085,-0.103857,0.235949,1.0,0.335978,-0.085115,-0.547366,0.140328,0.173767,0.396903,0.089332,-0.214166,-0.061104
szemet,-0.033112,0.004861,-0.087293,-0.277113,0.335978,1.0,-0.093314,-0.469131,-0.037965,-0.009618,0.458049,0.449666,-0.305256,0.061155
olaj,0.136875,0.000813,0.18195,-0.03803,-0.085115,-0.093314,1.0,0.0901,0.024024,0.038978,-0.073972,-0.069235,0.218573,0.143864
nuklearis,0.249342,0.236458,0.269519,0.189817,-0.547366,-0.469131,0.0901,1.0,-0.210243,-0.229095,-0.811057,-0.61244,0.3963,0.163994
ipari_pv,-0.237907,-0.304162,-0.197193,-0.042377,0.140328,-0.037965,0.024024,-0.210243,1.0,0.958522,0.135987,0.019413,-0.073494,-0.006026
hmke_pv,-0.277508,-0.362936,-0.177878,-0.001263,0.173767,-0.009618,0.038978,-0.229095,0.958522,1.0,0.171635,0.072806,-0.047749,-0.100655


### 4.3 Cluster_2

In [8]:
# Correlation matrix (pandas .corr() defaults) between measured CO2 and the
# individual generation sources, restricted to the rows labelled cluster 2.
correlation_columns = [
    "co2_teny",
    "gaz",
    "barnakoszen",
    "biomassza",
    "feketekoszen",
    "szemet",
    "olaj",
    "nuklearis",
    "ipari_pv",
    "hmke_pv",
    "folyo",
    "viztarozos",
    "geotermikus",
    "scte_pv",
]
df_sub_2 = df.loc[df["cluster"] == 2]
correlation_features_2 = df_sub_2[correlation_columns].corr()
correlation_features_2

Unnamed: 0,co2_teny,gaz,barnakoszen,biomassza,feketekoszen,szemet,olaj,nuklearis,ipari_pv,hmke_pv,folyo,viztarozos,geotermikus,scte_pv
co2_teny,1.0,0.700216,0.253436,-0.087789,-0.23822,-0.054782,-0.000976,0.110103,0.113127,0.083842,-0.093949,-0.016977,0.035634,0.208589
gaz,0.700216,1.0,0.330648,-0.021132,-0.120814,0.092886,0.111548,0.083879,0.019554,-0.018317,0.057904,0.083774,-0.101871,0.307869
barnakoszen,0.253436,0.330648,1.0,-0.404622,-0.223066,0.203934,0.617311,0.21899,0.086121,0.112304,-0.077211,0.059495,-0.037673,0.429433
biomassza,-0.087789,-0.021132,-0.404622,1.0,0.139335,-0.509414,-0.217161,0.224964,-0.09458,-0.087366,-0.229105,-0.284314,0.354603,-0.370405
feketekoszen,-0.23822,-0.120814,-0.223066,0.139335,1.0,0.321115,-0.096749,-0.701058,0.084512,0.122622,0.627619,0.251942,-0.271205,-0.016464
szemet,-0.054782,0.092886,0.203934,-0.509414,0.321115,1.0,0.160203,-0.455486,0.146482,0.143046,0.602035,0.581253,-0.381139,0.205995
olaj,-0.000976,0.111548,0.617311,-0.217161,-0.096749,0.160203,1.0,0.092142,-0.006469,0.017205,0.029107,0.145128,-0.042427,0.30195
nuklearis,0.110103,0.083879,0.21899,0.224964,-0.701058,-0.455486,0.092142,1.0,-0.126641,-0.135051,-0.829827,-0.542868,0.42304,-0.090903
ipari_pv,0.113127,0.019554,0.086121,-0.09458,0.084512,0.146482,-0.006469,-0.126641,1.0,0.97105,0.038686,-0.007793,-0.061754,0.170837
hmke_pv,0.083842,-0.018317,0.112304,-0.087366,0.122622,0.143046,0.017205,-0.135051,0.97105,1.0,0.058438,-0.009337,-0.055139,0.175434


### 4.4 Cluster_3

In [9]:
# Correlation matrix (pandas .corr() defaults) between measured CO2 and the
# individual generation sources, restricted to the rows labelled cluster 3.
correlation_columns = [
    "co2_teny",
    "gaz",
    "barnakoszen",
    "biomassza",
    "feketekoszen",
    "szemet",
    "olaj",
    "nuklearis",
    "ipari_pv",
    "hmke_pv",
    "folyo",
    "viztarozos",
    "geotermikus",
    "scte_pv",
]
df_sub_3 = df.loc[df["cluster"] == 3]
correlation_features_3 = df_sub_3[correlation_columns].corr()
correlation_features_3

Unnamed: 0,co2_teny,gaz,barnakoszen,biomassza,feketekoszen,szemet,olaj,nuklearis,ipari_pv,hmke_pv,folyo,viztarozos,geotermikus,scte_pv
co2_teny,1.0,0.343061,0.410532,0.319559,0.079829,0.181108,0.019166,0.213696,-0.404178,-0.349172,-0.39503,-0.025626,,-0.422916
gaz,0.343061,1.0,0.473653,-0.064673,-0.431174,0.41344,0.308207,-0.016504,-0.644939,-0.765973,0.093603,0.351173,,-0.550159
barnakoszen,0.410532,0.473653,1.0,0.175956,0.03244,-0.20012,0.193864,0.18843,-0.443794,-0.641316,-0.030587,-0.083177,,-0.545356
biomassza,0.319559,-0.064673,0.175956,1.0,0.262172,-0.121132,0.199847,0.868697,-0.311953,-0.240516,-0.740444,-0.175602,,0.221049
feketekoszen,0.079829,-0.431174,0.03244,0.262172,1.0,-0.367776,-0.15242,0.305134,0.127665,0.170842,-0.199474,-0.181041,,-0.000157
szemet,0.181108,0.41344,-0.20012,-0.121132,-0.367776,1.0,-0.000502,-0.054963,-0.150512,-0.151384,-0.005256,0.465753,,-0.177651
olaj,0.019166,0.308207,0.193864,0.199847,-0.15242,-0.000502,1.0,0.053696,-0.231085,-0.39303,-0.274382,-0.223872,,-0.147288
nuklearis,0.213696,-0.016504,0.18843,0.868697,0.305134,-0.054963,0.053696,1.0,-0.287528,-0.167842,-0.375195,0.181024,,0.060681
ipari_pv,-0.404178,-0.644939,-0.443794,-0.311953,0.127665,-0.150512,-0.231085,-0.287528,1.0,0.865465,0.115918,-0.178051,,0.454079
hmke_pv,-0.349172,-0.765973,-0.641316,-0.240516,0.170842,-0.151384,-0.39303,-0.167842,0.865465,1.0,0.121354,-0.032323,,0.584506


### 4.5 Output of correlation matrices to gold schema

In [10]:
# Write each per-cluster correlation matrix to its own CSV; the index is kept
# because it carries the row labels of the matrix.
correlation_exports = (
    (correlation_features_0, "00_data/correlation_features_cluster_0_gold.csv"),
    (correlation_features_1, "00_data/correlation_features_cluster_1_gold.csv"),
    (correlation_features_2, "00_data/correlation_features_cluster_2_gold.csv"),
    (correlation_features_3, "00_data/correlation_features_cluster_3_gold.csv"),
)
for correlation_matrix, csv_path in correlation_exports:
    correlation_matrix.to_csv(csv_path)

## 5. Modelling operational mode emission with production data

### 5.1 Model verification with a train-test split, to check for overfitting (on Cluster_0)

In [11]:
# Columns excluded from the predictor matrix: the target itself, the cluster
# label, time fields, aggregate load/production figures and a few sources.
not_to_use_as_feature = [
    "co2_teny", "cluster",
    "hour_of_day", "day_of_week", "measurement_time",
    "net_teny_rendszerterheles", "net_hazai_termeles",
    "geotermikus", "imp_ex", "co2_becsult",
    "folyo", "viztarozos",
]

# Predictors and target for the Cluster_0 rows.
y = df_sub_0["co2_teny"]
X = df_sub_0.drop(columns=not_to_use_as_feature)

# 80/20 hold-out split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression().fit(X_train, y_train)

# Coefficient table: intercept first, then one entry per predictor column.
betas = [model.intercept_, *model.coef_]
features = ["intercept", *X.columns]

beta_df_0 = (
    pd.DataFrame({"Feature": features, "Beta": betas})
    .sort_values(by="Beta", ascending=False)
    .set_index("Feature")
)

# Compare in-sample and hold-out fit quality.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Regression - Cluster_0:")
print("Beta Coefficients:")
print(beta_df_0)
print(f"\nTrain R-squared: {train_r2:.4f}")
print(f"Test R-squared: {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

Regression - Cluster_0:
Beta Coefficients:
                      Beta
Feature                   
intercept       111.623454
szemet            1.382271
barnakoszen       1.139945
olaj              0.369549
feketekoszen      0.122079
gaz               0.088354
szel              0.022090
hmke_pv           0.015800
ipari_pv         -0.000141
nuklearis        -0.005706
biomassza        -0.034979
egyeb_primer     -0.129179
scte_pv          -1.097124
egyeb_megujulo   -2.946871

Train R-squared: 0.8887
Test R-squared: 0.8813
Train RMSE: 21.3237
Test RMSE: 21.9295


In [12]:
# Parallel accumulators filled by the per-cluster modelling cells:
# model name, RMSE and R-squared share the same position in each list.
cluster_model_name_list, rmse_list, r2_list = [], [], []

### 5.2 Modelling - Cluster_0 operational mode

In [13]:
# Fit the Cluster_0 emission model on all Cluster_0 rows (no hold-out), so the
# R-squared / RMSE reported below are in-sample figures.
X = df_sub_0.drop(columns=not_to_use_as_feature)
y = df_sub_0["co2_teny"]

model = LinearRegression()
model.fit(X, y)

betas = [model.intercept_] + list(model.coef_)
features = ["intercept"] + list(X.columns)

# One-column coefficient table (intercept included), indexed by feature name
# and ordered from the largest to the smallest beta.
beta_df_0 = (
    pd.DataFrame({"Feature": features, "Beta": betas})
    .sort_values(by="Beta", ascending=False)
    .set_index("Feature")
)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# Record the metrics for the performance summary. Re-running this cell must not
# add a second row for the same model, so an existing entry is overwritten.
model_name = "Cluster_0_lin_reg"
if model_name in cluster_model_name_list:
    position = cluster_model_name_list.index(model_name)
    rmse_list[position] = rmse
    r2_list[position] = r2
else:
    cluster_model_name_list.append(model_name)
    rmse_list.append(rmse)
    r2_list.append(r2)

print("Regression - Cluster_0:")
print("Beta Coefficients:")
print(beta_df_0)
print(f"\nR-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

Regression - Cluster_0:
Beta Coefficients:
                      Beta
Feature                   
intercept       117.706666
szemet            1.376081
barnakoszen       1.145905
olaj              0.333075
feketekoszen      0.123231
gaz               0.086923
szel              0.016617
hmke_pv           0.005057
ipari_pv          0.003539
nuklearis        -0.007109
biomassza        -0.052298
egyeb_primer     -0.140595
scte_pv          -1.265126
egyeb_megujulo   -2.912252

R-squared: 0.8876
RMSE: 21.4247


### 5.3 Modelling - Cluster_1 operational mode

In [14]:
# Fit the Cluster_1 emission model on all Cluster_1 rows (no hold-out), so the
# R-squared / RMSE reported below are in-sample figures.
X = df_sub_1.drop(columns=not_to_use_as_feature)
y = df_sub_1["co2_teny"]

model = LinearRegression()
model.fit(X, y)

betas = [model.intercept_] + list(model.coef_)
features = ["intercept"] + list(X.columns)

# One-column coefficient table (intercept included), indexed by feature name
# and ordered from the largest to the smallest beta.
beta_df_1 = (
    pd.DataFrame({"Feature": features, "Beta": betas})
    .sort_values(by="Beta", ascending=False)
    .set_index("Feature")
)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# Record the metrics for the performance summary. Re-running this cell must not
# add a second row for the same model, so an existing entry is overwritten.
model_name = "Cluster_1_lin_reg"
if model_name in cluster_model_name_list:
    position = cluster_model_name_list.index(model_name)
    rmse_list[position] = rmse
    r2_list[position] = r2
else:
    cluster_model_name_list.append(model_name)
    rmse_list.append(rmse)
    r2_list.append(r2)

print("Regression - Cluster_1:")
print("Beta Coefficients:")
print(beta_df_1)
print(f"\nR-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

Regression - Cluster_1:
Beta Coefficients:
                      Beta
Feature                   
intercept       345.049192
barnakoszen       1.512059
szemet            1.068677
egyeb_primer      0.267801
gaz               0.164131
olaj              0.119507
szel              0.059867
ipari_pv          0.049795
egyeb_megujulo    0.001316
biomassza        -0.010936
nuklearis        -0.015654
hmke_pv          -0.234459
scte_pv          -0.234533
feketekoszen     -0.283865

R-squared: 0.7341
RMSE: 24.9803


### 5.4 Modelling - Cluster_2 operational mode

In [15]:
# Fit the Cluster_2 emission model on all Cluster_2 rows (no hold-out), so the
# R-squared / RMSE reported below are in-sample figures.
X = df_sub_2.drop(columns=not_to_use_as_feature)
y = df_sub_2["co2_teny"]

model = LinearRegression()
model.fit(X, y)

betas = [model.intercept_] + list(model.coef_)
features = ["intercept"] + list(X.columns)

# One-column coefficient table (intercept included), indexed by feature name
# and ordered from the largest to the smallest beta.
beta_df_2 = (
    pd.DataFrame({"Feature": features, "Beta": betas})
    .sort_values(by="Beta", ascending=False)
    .set_index("Feature")
)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# Record the metrics for the performance summary. Re-running this cell must not
# add a second row for the same model, so an existing entry is overwritten.
model_name = "Cluster_2_lin_reg"
if model_name in cluster_model_name_list:
    position = cluster_model_name_list.index(model_name)
    rmse_list[position] = rmse
    r2_list[position] = r2
else:
    cluster_model_name_list.append(model_name)
    rmse_list.append(rmse)
    r2_list.append(r2)

print("Regression - Cluster_2:")
print("Beta Coefficients:")
print(beta_df_2)
print(f"\nR-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

Regression - Cluster_2:
Beta Coefficients:
                      Beta
Feature                   
intercept       370.673500
egyeb_megujulo    5.159652
gaz               0.239405
ipari_pv          0.028835
hmke_pv           0.001298
barnakoszen      -0.024175
nuklearis        -0.043952
szel             -0.050207
scte_pv          -0.091641
egyeb_primer     -0.130445
biomassza        -0.188003
olaj             -0.506069
feketekoszen     -0.535524
szemet           -4.703544

R-squared: 0.5951
RMSE: 32.6149


### 5.5 Modelling - Cluster_3 operational mode

In [16]:
# Fit the Cluster_3 emission model on all Cluster_3 rows (no hold-out), so the
# R-squared / RMSE reported below are in-sample figures.
X = df_sub_3.drop(columns=not_to_use_as_feature)
y = df_sub_3["co2_teny"]

model = LinearRegression()
model.fit(X, y)

betas = [model.intercept_] + list(model.coef_)
features = ["intercept"] + list(X.columns)

# One-column coefficient table (intercept included), indexed by feature name
# and ordered from the largest to the smallest beta.
beta_df_3 = (
    pd.DataFrame({"Feature": features, "Beta": betas})
    .sort_values(by="Beta", ascending=False)
    .set_index("Feature")
)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# Record the metrics for the performance summary. Re-running this cell must not
# add a second row for the same model, so an existing entry is overwritten.
model_name = "Cluster_3_lin_reg"
if model_name in cluster_model_name_list:
    position = cluster_model_name_list.index(model_name)
    rmse_list[position] = rmse
    r2_list[position] = r2
else:
    cluster_model_name_list.append(model_name)
    rmse_list.append(rmse)
    r2_list.append(r2)

print("Regression - Cluster_3:")
print("Beta Coefficients:")
print(beta_df_3)
print(f"\nR-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

Regression - Cluster_3:
Beta Coefficients:
                      Beta
Feature                   
intercept       694.902318
feketekoszen      5.921033
olaj              5.045838
egyeb_megujulo    2.740164
szemet            1.577232
biomassza         1.239335
barnakoszen       0.791052
hmke_pv           0.245393
szel              0.174282
gaz               0.069939
ipari_pv         -0.055376
nuklearis        -0.418518
egyeb_primer     -2.488286
scte_pv          -8.216985

R-squared: 0.8108
RMSE: 14.7498


### 5.6 Output of feature importance and model performance information

In [17]:
# Put the per-cluster coefficient tables side by side. pd.concat(axis=1) aligns
# the rows on the shared feature-name index, so the "feature" column is taken
# from that aligned index itself. Assigning beta_df_0.index positionally would
# only be correct while the concat result happens to keep beta_df_0's row order.
df_beta_summary = (
    pd.concat(
        [
            beta_df_0.rename(columns={"Beta": "cluster_0_beta"}),
            beta_df_1.rename(columns={"Beta": "cluster_1_beta"}),
            beta_df_2.rename(columns={"Beta": "cluster_2_beta"}),
            beta_df_3.rename(columns={"Beta": "cluster_3_beta"}),
        ],
        axis=1,
    )
    .rename_axis("feature")
    .reset_index()
)
# Fix the column order of the exported table.
df_beta_summary = df_beta_summary[[
    "feature",
    "cluster_0_beta",
    "cluster_1_beta",
    "cluster_2_beta",
    "cluster_3_beta"
]]
df_beta_summary

Unnamed: 0,feature,cluster_0_beta,cluster_1_beta,cluster_2_beta,cluster_3_beta
0,intercept,117.706666,345.049192,370.6735,694.902318
1,szemet,1.376081,1.068677,-4.703544,1.577232
2,barnakoszen,1.145905,1.512059,-0.024175,0.791052
3,olaj,0.333075,0.119507,-0.506069,5.045838
4,feketekoszen,0.123231,-0.283865,-0.535524,5.921033
5,gaz,0.086923,0.164131,0.239405,0.069939
6,szel,0.016617,0.059867,-0.050207,0.174282
7,hmke_pv,0.005057,-0.234459,0.001298,0.245393
8,ipari_pv,0.003539,0.049795,0.028835,-0.055376
9,nuklearis,-0.007109,-0.015654,-0.043952,-0.418518


In [18]:
# Turn the metric lists collected by the modelling cells into one table,
# one row per fitted cluster model.
performance_columns = {
    "cluster_model": cluster_model_name_list,
    "rmse": rmse_list,
    "r2": r2_list,
}
df_performance_summary = pd.DataFrame(data=performance_columns)
df_performance_summary

Unnamed: 0,cluster_model,rmse,r2
0,Cluster_0_lin_reg,21.424662,0.887573
1,Cluster_1_lin_reg,24.980308,0.734114
2,Cluster_2_lin_reg,32.614883,0.595104
3,Cluster_3_lin_reg,14.749808,0.810786


In [19]:
# Persist the coefficient and performance tables as CSV files, without the row index.
summary_exports = (
    (df_beta_summary, "00_data/cluster_model_beta_coefficients_gold.csv"),
    (df_performance_summary, "00_data/cluster_model_performance_summary_gold.csv"),
)
for summary_frame, csv_path in summary_exports:
    summary_frame.to_csv(csv_path, index=False)

## 6. Export Notebook with Interactive Plots

In [20]:
# Convert the notebook file on disk into a standalone HTML page.
# NOTE(review): the file is read from disk, so the export reflects the last
# saved state; presumably notebook_path points at this notebook - confirm.
notebook_path = "experiment.ipynb"
output_html = "experiment_output_mvm.html"

with open(notebook_path, encoding="utf-8") as notebook_file:
    notebook = nbformat.read(notebook_file, as_version=4)

# Do not exclude the input/output prompts from the exported page.
html_exporter = HTMLExporter()
for prompt_option in ("exclude_input_prompt", "exclude_output_prompt"):
    setattr(html_exporter, prompt_option, False)

body, resources = html_exporter.from_notebook_node(notebook)

with open(output_html, "w+", encoding="utf-8") as html_file:
    html_file.write(body)

print(f"Experiment notebook exported to: {output_html}")

Experiment notebook exported to: experiment_output_mvm.html
