### Import necessary packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import plotly.express as px

### Load data to DataFrame called master

In [None]:
# Load the cleaned master table and drop the `t_delta` column in one step,
# so re-running the cell always starts from the file on disk.
master = (
    pd.read_parquet("../results/processed/master_clean.parquet")
    .drop(columns=["t_delta"])
)
master.head()

### Check that every combination of `load_value` and `speedSet` exists for each fault type

In [None]:
# Distinct fault labels that occur in the data.
fault_column = master["gear_fault_desc"]
fault_types = fault_column.unique()
display(fault_types)

In [None]:
# One row per experiment: its id, operating point (speed / load) and fault label.
experiment_cols = ["experiment_id", "speedSet", "load_value", "gear_fault_desc"]
experiments = master.loc[:, experiment_cols].drop_duplicates()
display(experiments)

In [None]:
# Count experiments per fault type (rows) and per speed/load combination (columns).
pd.crosstab(
    index=experiments["gear_fault_desc"],
    columns=[experiments["speedSet"], experiments["load_value"]],
)

This confirms that there is one experiment for each combination of `load_value` and `speedSet` per fault type.

### Check if order of magnitude is generally similar for data from both sensors

In [None]:
# Summary statistics over all samples, one column per sensor.
summary_stats = ["min", "max", "mean", "std"]
basic_sensor_stats = master.loc[:, ["sensor1", "sensor2"]].agg(summary_stats)
display(basic_sensor_stats)

### Check the correlation between `sensor1` and `sensor2` for each experiment separately

In [None]:
# Correlation between the two sensor signals, computed separately for every
# experiment. The result has one row per experiment_id and a "corr" column.
corr_sensors = (
    master.groupby("experiment_id")[["sensor1", "sensor2"]]
    .apply(lambda g: g["sensor1"].corr(g["sensor2"]))
    .rename("corr")
    .reset_index()
)

# List only the experiments whose sensors are clearly correlated (corr > 0.5).
# (A bare `corr_sensors.head()` in the middle of a cell is never rendered, so
# the discarded expression was removed.)
display(corr_sensors[corr_sensors["corr"] > 0.5])

In [None]:
# Distribution of the per-experiment sensor correlation.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(corr_sensors["corr"], bins=36, kde=True, ax=ax)
ax.set_title("histogram of correlation between sensor1 and sensor2")
ax.set_xlabel("correlation")
ax.set_ylabel("number of experiments")
plt.show()

Correlation between sensor 1 and sensor 2 is low (<0.4) for all experiments. 

### Plot sensor data for each category (type of failure)

In [None]:
# Label every sample with its operating point as "speedSet / load_value";
# the label is used below to colour the traces.
master["combo"] = master["speedSet"].astype(str) + " / " + master["load_value"].astype(str)

sensors = ["sensor1", "sensor2"]
id_cols = ["gear_fault_desc", "combo", "t_rel_s"]

# Long format: one row per (sample, sensor) so both sensors can share one figure.
df_long = pd.melt(
    master[id_cols + sensors],
    id_vars=id_cols,
    value_vars=sensors,
    var_name="sensor",
    value_name="value",
)

# One figure per fault type, in alphabetical order of the fault label.
for fault in sorted(df_long["gear_fault_desc"].unique()):
    fault_rows = df_long[df_long["gear_fault_desc"] == fault]
    fig = px.line(
        fault_rows,
        x="t_rel_s",
        y="value",
        color="combo",       # one colour per speed/load combination
        facet_col="sensor",  # one panel per sensor, side by side
        title=fault,
    )
    fig.update_layout(width=1100, height=300)  # optional: enlarge the figure
    fig.show()

###### Calculate typical coefficients used in time series analysis:

1. **RMS:** root mean square (DE: Effektivwert)

2. **P2P:** peak-to-peak

In [None]:
def rms(x):
    """Return the root mean square of `x`: sqrt(mean(x**2)).

    `x` must support element-wise `**` (e.g. a NumPy array or pandas Series).
    """
    squared = x ** 2
    mean_square = np.mean(squared)
    return np.sqrt(mean_square)

def p2p(x):
    """Return the peak-to-peak value of `x`, i.e. its maximum minus its minimum."""
    highest = np.max(x)
    lowest = np.min(x)
    return highest - lowest

# Grouping keys: one group per experiment (fault type, id and operating point).
keys = ["gear_fault_desc", "experiment_id", "speedSet", "load_value"]
sensor_cols = ["sensor1", "sensor2"]

# RMS and peak-to-peak of each sensor per experiment. The result has
# two-level columns (sensor, statistic) named after the aggregation functions.
grouped_sensors = master.groupby(keys)[sensor_cols]
features = grouped_sensors.agg([rms, p2p])

In [None]:
# Show the per-experiment feature table (rich display via the cell's last expression).
features

In [None]:
# Sensor-1 RMS per fault type: grey box for the distribution, one coloured
# point per experiment on top.
sensor1_rms = ("sensor1", "rms")

fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=features, x="gear_fault_desc", y=sensor1_rms, color="lightgray", ax=ax)
sns.stripplot(
    data=features,
    x="gear_fault_desc",
    y=sensor1_rms,
    hue="experiment_id",
    dodge=True,
    size=5,
    alpha=0.7,
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=90)
# Place the (long) experiment legend outside the plotting area.
ax.legend(ncols=3, loc="upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Sensor-1 RMS against peak-to-peak, one marker per experiment.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=features,
    x=("sensor1", "rms"),
    y=("sensor1", "p2p"),
    hue="gear_fault_desc",  # colour = fault class
    style="load_value",     # marker shape = load
    size="speedSet",        # marker size = speed
    sizes=(40, 200),        # smallest and largest marker size
    ax=ax,
)
ax.set_title("RMS vs P2P (Sensor1) mit Load und Speed")
ax.set_xlabel("RMS (Sensor1)")
ax.set_ylabel("Peak-to-Peak (Sensor1)")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")  # legend outside, on the right
fig.tight_layout()
plt.show()

In [None]:
# Move the grouping keys from the index into ordinary columns.
# The guard keeps this cell idempotent: once the index is the default unnamed
# one, another reset_index() would only insert a spurious "index" column.
if any(name is not None for name in features.index.names):
    features = features.reset_index()

def flatten_column(col):
    """Turn a two-level column label into a flat name.

    Non-tuple labels are returned unchanged. For a tuple whose second entry is
    the empty string the first entry is returned; otherwise the entries are
    joined with "_" and surrounding whitespace is stripped.
    """
    if not isinstance(col, tuple):
        return col
    if col[1] == '':
        return col[0]
    return "_".join(col).strip()

# Replace every column label by its flattened name.
features.columns = list(map(flatten_column, features.columns))

In [None]:
features.head()

In [None]:
feat_cols = ["sensor1_rms", "sensor1_p2p", "sensor2_rms", "sensor2_p2p"]

# Reference values: the features of the "No fault" experiment, indexed by
# operating point (speedSet, load_value).
baseline = (
    features.query("gear_fault_desc == 'No fault'")
    .set_index(["speedSet", "load_value"])[feat_cols]
)


def compute_delta(row):
    """Absolute difference of each feature to the 'No fault' baseline.

    Parameters
    ----------
    row : one row of `features`; must provide `speedSet`, `load_value` and
        every column listed in `feat_cols`.

    Returns
    -------
    pd.Series with one entry `<feature>_delta` per feature in `feat_cols`.

    Raises
    ------
    KeyError if `baseline` has no entry for the row's (speedSet, load_value).
    """
    base = baseline.loc[(row.speedSet, row.load_value)]
    return pd.Series({f"{col}_delta": row[col] - base[col] for col in feat_cols})


def compute_delta_rel_prc(row):
    """Relative difference of each feature to the 'No fault' baseline, in percent.

    This function was referenced but not defined in this notebook. It is
    implemented as (value - baseline) / baseline * 100 and produces the
    `<feature>_delta_rel_prc` columns that the heatmaps select.

    Parameters
    ----------
    row : one row of `features`; must provide `speedSet`, `load_value` and
        every column listed in `feat_cols`.

    Returns
    -------
    pd.Series with one entry `<feature>_delta_rel_prc` per feature in `feat_cols`.

    Raises
    ------
    KeyError if `baseline` has no entry for the row's (speedSet, load_value).
    """
    base = baseline.loc[(row.speedSet, row.load_value)]
    return pd.Series(
        {
            f"{col}_delta_rel_prc": (row[col] - base[col]) / base[col] * 100
            for col in feat_cols
        }
    )


# Append the absolute and the relative deltas and index the result by experiment.
# Built from `features` in one chain (no in-place mutation), so the cell can be
# re-run safely.
features_with_delta = (
    features
    .join(features.apply(compute_delta, axis=1))
    .join(features.apply(compute_delta_rel_prc, axis=1))
    .set_index("experiment_id")
)


In [None]:
# Keep only the faulty experiments; the "No fault" rows are the reference itself.
is_faulty = features_with_delta["gear_fault_desc"] != "No fault"
features_with_delta = features_with_delta.loc[is_faulty]

features_with_delta.head()

In [None]:
# Heatmap of all percent-delta columns, colour scale fixed to +/-200 and centred on 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    features_with_delta.filter(like="prc"),
    cmap="coolwarm",
    center=0,
    vmin=-200,
    vmax=200,
    annot=True,
    ax=ax,
)
plt.show()

In [None]:


df = features_with_delta  # indexed by experiment_id, which gives readable y labels

# Select the percent-delta columns of each feature family.
rms_cols = df.filter(like="rms_delta_rel_prc")
p2p_cols = df.filter(like="p2p_delta_rel_prc")

# Dynamic figure height (roughly 0.35 inch per row, at least 6 inch).
h = max(6, 0.35 * len(df))

# Symmetric colour limits from the 95th percentile of the absolute values, so
# a few extreme cells do not stretch the scale. Each family gets its own scale.
rms_vmax = np.nanpercentile(np.abs(rms_cols.values), 95)
p2p_vmax = np.nanpercentile(np.abs(p2p_cols.values), 95)

heatmap_specs = (
    (rms_cols, rms_vmax, ".2f", "Δ% vs. No fault – RMS"),
    (p2p_cols, p2p_vmax, ".1f", "Δ% vs. No fault – P2P"),
)

for delta_cols, limit, number_format, heading in heatmap_specs:
    fig, ax = plt.subplots(figsize=(10, h))
    sns.heatmap(
        delta_cols,
        cmap="coolwarm",
        center=0,
        vmin=-limit,
        vmax=limit,
        annot=True,
        fmt=number_format,
        ax=ax,
    )
    ax.set_title(heading)
    ax.set_xlabel("Feature")
    ax.set_ylabel("experiment_id")
    fig.tight_layout()
    plt.show()