load basic packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

### First data exploration: structure, data types, missing values 

(at first only based on one of the files)

In [None]:
# Start with a single measurement file to get a feel for the data.
eccentricity = pd.read_csv(Path("../data", "eccentricity.csv"))

In [None]:
# Summarise the columns: names, dtypes and non-null counts.
eccentricity.info()

In [None]:
# Number of missing values in each column.
eccentricity.isna().sum()

In [None]:
eccentricity.head()

In [None]:
# Distinct fault descriptions present in this file.
eccentricity['gear_fault_desc'].unique()

In [None]:
# Distinct load values present in this file.
eccentricity['load_value'].unique()

In [None]:
# Distinct speed settings present in this file.
eccentricity['speedSet'].unique()

In [None]:
# Parse the raw time column into proper timestamps.
parsed_times = pd.to_datetime(eccentricity['time_x'])
eccentricity['datetime'] = parsed_times

In [None]:
# Keep only the time-of-day part of each timestamp.
eccentricity['time'] = eccentricity['datetime'].dt.time

In [None]:
# Time step from the previous row to the current one (the first row has none).
previous_time = eccentricity['datetime'].shift()
eccentricity['time_delta'] = eccentricity['datetime'] - previous_time

In [None]:
eccentricity.head()

In [None]:
# Distinct time steps between consecutive rows (the first row has no predecessor, so NaT is among them).
eccentricity['time_delta'].unique()

### Load all data, merge all 6 files to one dataframe, add columns with file id and file name

In [None]:
# Collect the CSV files in a fixed (alphabetical) order. glob() yields entries
# in arbitrary, filesystem-dependent order, and the position in this list is
# what becomes the file_id, so sorting keeps the ids reproducible between runs.
files = sorted(Path("../data/").glob("*.csv"))

In [None]:
# Read every CSV and tag its rows with a running file number (starting at 1)
# and the name of the file they came from.
dfs = [
    pd.read_csv(csv_path).assign(file_id=file_no, file_name=csv_path.name)
    for file_no, csv_path in enumerate(files, start=1)
]

# Stack the per-file frames row-wise; each frame keeps its own row index.
master = pd.concat(dfs, axis=0)

In [None]:
master.head()

In [None]:
# Number of missing values in each column of the merged frame.
master.isna().sum()

In [None]:
# Distinct file numbers that were assigned while loading.
file_nr = master['file_id'].unique()

### The time column `time_x` is of type object. Add a new column `datetime` of type datetime.

In [None]:
# Parse the raw time column into proper timestamps.
parsed_times = pd.to_datetime(master['time_x'])
master['datetime'] = parsed_times

In [None]:
master.head()

In [None]:
# Distinct speed settings and load values across all files.
speedSets, load_values = (
    master[column].unique() for column in ('speedSet', 'load_value')
)

In [None]:
# Show which load values and speed settings occur in the merged data.
print(load_values, speedSets)

### Add a new column trial_id. This is needed because data from multiple trials was saved to one file.  

### Creating the `trial_id` column

To split each file into separate trials, the following approach was used:

1. **Sort the data** by `file_id` and `datetime` so that time differences are computed in the right order.

2. **Compute time differences** within each file:
   ```python
   master["t_delta"] = master.groupby("file_id")["datetime"].diff()
   ```

3. **Define the expected sampling interval** (`sampling_rate`, 0.0002 s) and flag every row whose time difference is missing or deviates from it (gap too large or too small). Each flagged row marks the start of a new trial.

4. **Cumulative sum for each `file_id`**:
   ```python
   master["trial_id"] = mask.groupby(master["file_id"]).cumsum()
   ```

In [None]:
# Order rows chronologically within each file so that the time differences
# between consecutive rows are computed in the right order.
master = master.sort_values(by = ['file_id', 'datetime'])

In [None]:
# Time step from the previous row of the same file; the first row of every
# file has no predecessor and therefore gets NaT.
previous_time = master.groupby('file_id')['datetime'].shift()
master['t_delta'] = master['datetime'] - previous_time

In [None]:
master.head()

In [None]:
# Expected spacing between two consecutive samples: 200 microseconds (0.0002 s).
sampling_rate = pd.Timedelta(microseconds=200)
print(sampling_rate)

In [None]:
# A row is flagged when it has no predecessor in its file (NaT) or when the
# step from the previous row is not exactly the expected interval.
delta = master['t_delta']
mask = delta.isna() | delta.ne(sampling_rate)

In [None]:
# Every flagged row increments the running count within its file, so all rows
# up to the next flagged row share one number. The first row of each file is
# always flagged (its time step is NaT), so numbering starts at 1 per file.
master['trial_id'] = mask.groupby(master['file_id']).cumsum()

In [None]:
master.head()

### Check if the splitting worked correctly

1. **Plotting all trials separately** (`trial_id` and `file_id`)

2. **Checking the unique `trial_id` values per file:** expecting 6 trials in each file (combination of 3 different speed settings and 2 different load values)

3. **Using `describe()` per `trial_id` and `file_id`:** Expecting the minimum `t_delta` to be 0.0002 s.

4. **Checking the number of rows with `t_delta != sampling_rate` per (`trial_id`, `file_id`):** Expecting 1

5. **Checking the number of unique values for `speedSet` and `load_value` per trial:** Expecting 1, as the conditions should be constant in each trial

In [None]:
# String building blocks for the labels; the speed setting is formatted with
# exactly one decimal place so the labels are uniform.
speed_str = master["speedSet"].astype(float).map("{:.1f}".format)
load_str = master["load_value"].astype(str)
fault_str = master["gear_fault_desc"].str.replace(" ", "_")

# experiment_id: <fault>_<speed>_<load>; combo: <speed>/<load>
master["experiment_id"] = fault_str + "_" + speed_str + "_" + load_str
master["combo"] = speed_str + "/" + load_str

master.head()

In [None]:
import plotly.express as px

# One scatter point per sample of sensor1 against the row index; the colour
# encodes the experiment_id label.
# NOTE(review): the title still refers to (file_id, trial_id) although the
# colouring uses experiment_id -- confirm which wording is intended.
scatter_kwargs = dict(
    x=master.index,
    y="sensor1",
    color="experiment_id",
    title="Index-Plot farblich nach (file_id, trial_id)",
    labels={"x": "Datenpunkt (Index)", "sensor1": "sensor1"},
    opacity=0.6,
)
fig = px.scatter(master, **scatter_kwargs)

# Horizontal legend placed below the plot area.
legend_layout = dict(itemsizing="constant", orientation="h", y=-0.2)
fig.update_layout(legend_title_text="experiment_id", legend=legend_layout)

fig.show()

In [None]:
# Trial numbers found in each file; six per file are expected
# (3 speed settings x 2 load values).
master.groupby('file_id')['trial_id'].unique()

In [None]:
# Summary statistics of the time step per trial; the minimum is expected to
# equal the sampling interval.
master.groupby(["file_id","trial_id"])["t_delta"].describe()

In [None]:
# Per trial, count the rows whose time step differs from the expected interval
# (a missing step, NaT, also compares as different). Exactly one such row per
# trial is expected: the row that starts it.
is_off = master["t_delta"].ne(sampling_rate)
off_counts = (
    is_off
    .groupby([master["file_id"], master["trial_id"]])
    .sum()
    .to_frame("off_count")
)

off_counts['off_count'].unique()

In [None]:
# Number of distinct speed settings per trial; 1 is expected because the
# conditions should be constant within a trial.
master.groupby(["file_id","trial_id"])["speedSet"].nunique()

In [None]:
# Number of distinct load values per trial; 1 is expected because the
# conditions should be constant within a trial.
master.groupby(["file_id","trial_id"])["load_value"].nunique()

### Adding two new columns for relative time

1. `t_rel`: relative time per trial (of type timedelta)

2. `t_rel_s`: relative time per trial in seconds (numerical)

In [None]:
# Rows must be in chronological order inside every trial before they are
# numbered. sort_values returns a new frame rather than sorting in place, so
# the result is assigned back; "datetime" is part of the key so the order
# within a trial is explicit.
master = master.sort_values(by=['file_id', 'trial_id', 'datetime'])

# Relative time = running sample number within the trial (0, 1, 2, ...)
# multiplied by the sampling interval.
master['t_rel'] = master.groupby(by=['file_id', 'trial_id']).cumcount() * sampling_rate

# Confirm that the new column is a timedelta.
master['t_rel'].dtype

In [None]:
# Express the relative time as a plain number of seconds for numeric work.
master["t_rel_s"] = master["t_rel"].dt.total_seconds()
master.head()

### Reset index and sort columns

In [None]:
# Replace the row index with a fresh 0..n-1 range and discard the raw time
# string column.
master = master.reset_index(drop=True).drop(columns='time_x')

In [None]:
# Final column selection and order; every column not listed here is dropped.
new_order = [
    'experiment_id', 'combo', 'gear_fault_desc', 'speedSet',
    'load_value', 't_rel_s', 'sensor1', 'sensor2',
]
master = master.loc[:, new_order]

In [None]:
master.head()

### The dataframe master is now complete and ready for further use.

No missing values, correct time columns, correctly split and labeled trials.

Exporting the dataframe as parquet file 

In [None]:
# Make sure the target folder exists first: to_parquet does not create missing
# directories and would fail on a fresh checkout.
out_path = Path("../results/processed/master_clean.parquet")
out_path.parent.mkdir(parents=True, exist_ok=True)
master.to_parquet(out_path, index=False)