In [7]:
from datetime import timedelta
from functools import partial
from pathlib import Path
from typing import List
import plotly.express as px
import h5py
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.cluster import BisectingKMeans, KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tables import NoSuchNodeError
from tqdm import tqdm
from tslearn.clustering import TimeSeriesKMeans
from utils import extract_consecutive_sequences, clean_on_length, clean_on_mode

from tabata import Opset, Selector
from checks import remove_problematic_records
from plotly.subplots import make_subplots
import plotly.graph_objects as go

%reload_ext autoreload
%autoreload 2



In [8]:
# Open the full dataset, then build a small companion Opset ("mini_ds.h5")
# from a 35-item slice of the rewound dataset for quick experiments.
# NOTE(review): presumably `ds.rewind()[:35]` yields the first 35 records —
# confirm against the tabata Opset API.
path_to_dataset = Path("archive/Aircraft_01.h5")
mini_ds_size = 35

ds = Opset(path_to_dataset)
mini_ds = Opset.from_generator(ds.rewind()[:mini_ds_size], "mini_ds.h5")

# Run the same record checks on both datasets, full one first.
for _opset in (ds, mini_ds):
    remove_problematic_records(_opset)

Checking if every dataframe is iterable...


100%|██████████| 1002/1002 [00:17<00:00, 57.89it/s]


Health checks: continuous, duration, components...


100%|██████████| 1002/1002 [00:23<00:00, 43.02it/s]


Checking if every dataframe is iterable...


100%|██████████| 35/35 [00:00<00:00, 50.92it/s]


Health checks: continuous, duration, components...


100%|██████████| 35/35 [00:00<00:00, 63.80it/s]


In [9]:
def compute_statistics(
    ds,
    column: str = "ALT [ft]",
    trim_bounds=(0.05, 0.95),
    skip_records=("record_651",),
) -> pd.DataFrame:
    """Summarise every record of ``ds`` with a few statistics of one column.

    Parameters
    ----------
    ds
        Iterable of DataFrames. Each frame's ``index.name`` is used as the
        record identifier and each frame must contain ``column``.
    column
        Name of the signal to summarise (defaults to the altitude column).
    trim_bounds
        ``(lower, upper)`` fractions of the sorted sample-to-sample
        differences that are kept; everything outside is discarded before
        the mean and standard deviation are computed.
    skip_records
        Collection of record names (``index.name`` values) to ignore.

    Returns
    -------
    pandas.DataFrame
        One row per processed record with the columns ``record``, ``length``,
        ``mean_alt_diff``, ``std_alt_diff``, ``beginning_alt`` and
        ``ending_alt``. The columns are present even when no record was
        processed.

    Notes
    -----
    ``beginning_alt`` and ``ending_alt`` are the minimum and maximum of
    ``column`` over the record, not its first and last samples.
    """
    output_columns = [
        "record",
        "length",
        "mean_alt_diff",
        "std_alt_diff",
        "beginning_alt",
        "ending_alt",
    ]
    lower, upper = trim_bounds

    statistics = []
    for df in tqdm(ds):
        if df.index.name in skip_records:
            continue
        signal = df[column]
        # Difference with the previous sample. The first entry has no
        # predecessor and stays NaN; `sort_values` places NaN last.
        first_derivative = signal - signal.shift(1).ffill()
        n = len(first_derivative)
        # Keep only the central part of the sorted differences so that a few
        # extreme jumps do not dominate the mean / std.
        first_derivative = first_derivative.sort_values().iloc[
            int(lower * n) : int(upper * n)
        ]
        statistics.append(
            {
                "record": df.index.name,
                "length": len(df),
                "mean_alt_diff": first_derivative.mean(),
                "std_alt_diff": first_derivative.std(),
                "beginning_alt": signal.min(),
                "ending_alt": signal.max(),
            }
        )
    # Passing the column list keeps the schema stable for an empty dataset.
    return pd.DataFrame(statistics, columns=output_columns)

# Open the two climb datasets (small one first, then the full one) and
# compute the per-record statistics table for each of them.
mini_climbs, climbs = (
    Opset(Path(filename))
    for filename in ("mini_climbs_100_20.h5", "climbs_100_20.h5")
)

mini_climbs_statistics, climbs_statistics = (
    compute_statistics(opset) for opset in (mini_climbs, climbs)
)


100%|██████████| 40/40 [00:00<00:00, 73.35it/s]
100%|██████████| 1165/1165 [00:14<00:00, 81.43it/s]


In [16]:
# Display the column names of the statistics table.
# NOTE(review): the saved output lists a 'category' column, which
# compute_statistics does not produce; it is only added by the KMeans cell
# further down, so this cell was presumably executed after it — re-run the
# notebook top to bottom to refresh the output.
climbs_statistics.columns

Index(['record', 'length', 'mean_alt_diff', 'std_alt_diff', 'beginning_alt',
       'ending_alt', 'category'],
      dtype='object')

In [31]:
nb_clusters = 2
# Fixed seed so the cluster labels (and everything derived from them in the
# following cells) are identical from one run to the next.
cluster = KMeans(n_clusters=nb_clusters, random_state=0)

# Cluster on the numeric statistics only: "record" is an identifier and
# "category" is this cell's own output from a previous run. `errors="ignore"`
# lets the cell run on a fresh kernel, where "category" does not exist yet.
# NOTE(review): the features are used unscaled, so large-valued columns weigh
# more in the distance — confirm this is intended.
clustering_features = climbs_statistics.drop(
    columns=["record", "category"], errors="ignore"
)
climbs_statistics["category"] = cluster.fit_predict(clustering_features)

# Scatter of record length against mean altitude difference, coloured by cluster.
fig = px.scatter(
    climbs_statistics,
    x="length",
    y="mean_alt_diff",
    color="category",
    hover_data=["record"],
)
fig.show()

In [27]:
# Grid of example flights: one column per cluster, one row per sampled climb.
# Each subplot shows the whole parent record, with the samples belonging to
# the selected climb drawn in a different colour.
nb_samples = 2
sample_seed = 0  # set to None to draw different climbs on every run
colname = "ALT [ft]"
palette = np.array(px.colors.qualitative.Plotly)

fig = make_subplots(
    rows=nb_samples,
    cols=nb_clusters,
)

for index_cluster in range(nb_clusters):
    record_concerned_climbs = climbs_statistics[
        climbs_statistics["category"] == index_cluster
    ]
    # Never ask for more rows than the cluster holds (sample() would raise);
    # the seed keeps the selection reproducible.
    selected_climbs = record_concerned_climbs.sample(
        n=min(nb_samples, len(record_concerned_climbs)),
        random_state=sample_seed,
    )
    for index, (_, row) in enumerate(selected_climbs.iterrows()):
        climbs_df = climbs[climbs.records.index(row["record"])]
        # The parent record name is made of the first two "_"-separated
        # tokens of the climb record name.
        parent_record = "_".join(row["record"].split("_")[:2])
        # `assign` returns a new frame, so the frame handed out by `ds`
        # is left untouched.
        df = ds[ds.records.index(parent_record)].assign(climb_category=0)
        # Flag the samples of the climb: 0 = outside, cluster index + 1 = inside.
        df.loc[climbs_df.index, "climb_category"] = index_cluster + 1

        colors = palette[df["climb_category"].to_numpy()]
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[colname],
                name=f"{index_cluster}_{index}",
                mode="markers",
                # Styling is set here directly instead of a later
                # update_traces() pass over all traces.
                marker=dict(color=colors, size=2),
                hovertemplate="<i>Category</i>: %{text}"
                + f"<br><b>{colname}</b>: "
                + "%{y}<br>"
                + "<br><b>Timestamp</b>: "
                + "%{x}<br>",
                text=df["climb_category"],
                showlegend=False,
            ),
            row=index + 1,
            col=index_cluster + 1,
        )

fig.update_layout(
    autosize=False,
    width=250 * nb_clusters,
    height=400 * nb_samples,
)
fig.show()
