In [None]:
import urllib.request, zipfile
import pandas as pd, numpy as np,xarray as xr, plotly
from pathlib import Path
import re, yaml, copy, json
import helper, config_adapter
from helper import RenderJSON
# Switch plotly to offline notebook rendering before any figure is created.
plotly.offline.init_notebook_mode()

# Display options shared by the figures of this notebook (passed as `config=` to `fig.show`).
plotly_config = dict(
    scrollZoom=True,
    displaylogo=False,
    # Settings of the "download plot as image" mode-bar button.
    toImageButtonOptions=dict(
        format="svg",  # one of png, svg, jpeg, webp
        filename="custom_image",
        height=None,
        width=None,
        scale=1,  # Multiply title/legend/axis/canvas sizes by this factor
    ),
    # Extra shape-drawing tools added to the mode bar.
    modeBarButtonsToAdd=[
        "drawline",
        "drawopenpath",
        "drawclosedpath",
        "drawcircle",
        "drawrect",
        "eraseshape",
    ],
)

In [None]:
import itables
# Initialise itables for this notebook, requesting interactive rendering for all tables.
itables.init_notebook_mode(all_interactive=True )
# Global itables options; they are set once here and apply to every table shown below.
# NOTE(review): presumably maxBytes caps the amount of data embedded per table — confirm in the itables docs.
itables.options.maxBytes = "1MB"
# Page-length choices offered in the table's length menu.
itables.options.lengthMenu = [25, 10, 50, 100, 200]
# Export buttons: copy, CSV and Excel.
itables.options.buttons = ["copyHtml5", "csvHtml5", "excelHtml5"]
# Placement of the page-length control and the search builder around the table.
itables.options.layout={"topEnd": "pageLength", "top1": "searchBuilder"}

In [None]:
# Load the run parameters. `read_text()` opens and closes the file itself, so no
# file handle is left dangling after parsing.
params = yaml.safe_load(Path("params.yaml").read_text())
config_path = Path(params["config_path"])

# Variables declared directly in params.yaml are optional; start from an empty
# list when there are none.
if "variables" in params:
    variables = config_adapter.normalize_yaml_paramlist(
        params["variables"], format=config_adapter.variable_param_format
    )
else:
    variables = []
RenderJSON(params)

In [None]:

# Load the pipeline configuration from the path given by `config_path` (read from params.yaml).
config = config_adapter.load(config_path)
# Last expression of the cell: render the loaded configuration.
RenderJSON(config)

In [None]:
# Add the variables declared in the config after those coming from params.yaml.
# NOTE(review): `variables` accumulates here, so re-running this cell without
# re-running the params cell appends the config variables a second time —
# confirm whether duplicates are harmless.
if "variables" in config:
    variables += config_adapter.normalize_yaml_paramlist(config["variables"], format=config_adapter.variable_param_format)
display(RenderJSON(variables))
# Create a fresh evaluation context and register every variable on it, in list order.
ctx = config_adapter.Context()
for var in variables:
    config_adapter.add_variable_context(ctx, var)
# Last expression of the cell: render the variables held by the context.
RenderJSON(ctx.variables)


In [None]:
import random
# Seed Python's `random` module when the config provides "random_seed"; the seed
# expression is evaluated through the context first. Without the key the
# generator is left unseeded.
if "random_seed" in config:
    random.seed(ctx.evaluate(config["random_seed"]))

In [None]:
def from_koe_link(ctx, params):
    """Build an annotation table from a downloaded zip archive.

    Parameters
    ----------
    ctx : object exposing ``evaluate``; used to resolve ``params``.
    params : mapping (after evaluation) with the keys
        ``download_path`` (where the zip is stored), ``force_download``
        (re-download even if the zip exists), ``link`` (URL of the zip) and
        ``audio_folder`` (root folder searched recursively for audio files).

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``start``, ``end``, ``label`` and ``audio_path``.
        ``start``/``end`` are the archive values divided by 1000.
        ``audio_path`` is the single file under ``audio_folder`` named
        ``<file>.<any extension>``, or ``None`` when there is no such file.

    Raises
    ------
    Exception
        If more than one file under ``audio_folder`` matches a ``file`` name.
    """
    import glob  # function-scope import: only needed to escape names for globbing

    params = ctx.evaluate(params)
    zip_path = Path(params["download_path"])
    zip_path.parent.mkdir(exist_ok=True, parents=True)
    # Download only when the archive is missing, unless a refresh is forced.
    if not zip_path.exists() or params["force_download"]:
        urllib.request.urlretrieve(params["link"], zip_path)
    audio_folder = Path(params["audio_folder"])

    # Open the archive explicitly so its file handle is released once both
    # JSON members have been read.
    with zipfile.ZipFile(zip_path) as archive:
        archive_root = zipfile.Path(archive)
        with (archive_root / "segment.extraattrvalue.json").open("r") as f:
            raw_labels = json.load(f)
        with (archive_root / "songinfo.json").open("r") as f:
            raw_timestamps = json.load(f)

    # Label rows are triples; only the id and the label are kept.
    labels = pd.DataFrame(raw_labels, columns=["id", "?", "label"])[["id", "label"]]

    # One frame per entry; the segment rows sit at position 1 of each entry.
    frames = [
        pd.DataFrame(info[1], columns=["id", "start", "end", "?1", "?2", "?3", "?id2"])
        .assign(file=name)[["id", "file", "start", "end"]]
        for name, info in raw_timestamps.items()
    ]
    timestamps = pd.concat(frames)
    # NOTE(review): presumably converts milliseconds to seconds — confirm against the export format.
    timestamps[["start", "end"]] = timestamps[["start", "end"]] / 1000

    annotations = pd.merge(timestamps, labels, on="id", how="outer").drop(columns="id")

    # Many rows share one file name, so each name is resolved only once.
    resolved = {}

    def find_audio(file):
        if file not in resolved:
            # Escape glob metacharacters so names such as "song[1]" match literally.
            pattern = f"**/{glob.escape(str(file))}.*"
            matches = list(audio_folder.glob(pattern))
            if len(matches) > 1:
                raise Exception(
                    f"Multiple matching files for {file!r}: {[str(m) for m in matches]}"
                )
            resolved[file] = matches[0] if matches else None
        return resolved[file]

    annotations["audio_path"] = annotations["file"].apply(find_audio)
    return annotations


In [None]:

# Make `from_koe_link` callable from the config only while the annotations entry is evaluated.
ctx.methods["from_koe_link"] = from_koe_link
# Evaluate the configured annotation source, then order rows by audio file and start time.
annotations = ctx.evaluate(config["processing"]["annotations"]).sort_values(["audio_path", "start"])
# Unregister the method again so it is not visible to later evaluations.
del ctx.methods["from_koe_link"]
display(annotations)




**TODO:** handling of corrections is not implemented yet.

In [None]:
def mk_blocks(ctx, params):
    """Cut every annotated syllable out of its audio file and concatenate the cuts.

    Rows of the global ``annotations`` table are grouped into blocks: a new
    block starts when the file changes or when the gap to the previous row is
    larger than twice the shoulder duration. Blocks are shuffled with
    ``random.shuffle``; then, for each row, the audio between
    ``start - shoulder`` and ``end + shoulder`` is extracted and appended to
    one continuous timeline.

    Side effect: the global ``annotations`` is rebound to a copy that carries
    an extra ``block`` column.

    Parameters
    ----------
    ctx : object exposing ``evaluate``; used to resolve ``params``.
    params : mapping (after evaluation) with key ``block_shoulder_duration``,
        in the same time unit as the ``start``/``end`` columns.

    Returns
    -------
    (xarray.Dataset, pandas.DataFrame)
        The dataset is indexed by ``t`` and holds ``data``, ``file``,
        ``t_file``, ``block``, ``label`` ("noise" outside the annotated span)
        and ``syb_num``; ``t.attrs["fs"]`` is the common sampling rate.
        The frame is the shuffled annotation table with ``new_t_start``,
        ``new_t_end`` (syllable bounds on the new timeline) and ``syb_num``.

    Raises
    ------
    Exception
        If a row has no audio file, or if the audio files do not all share the
        same sampling rate.

    Notes
    -----
    ``data`` is stored with the single dimension ``t``, so the audio read from
    each file must be one-dimensional.
    """
    import scipy.io.wavfile  # imported once here instead of on every loop iteration

    global annotations
    params = ctx.evaluate(params)
    block_shoulder_duration = params["block_shoulder_duration"]

    annotations = annotations.copy()
    starts_new_block = (annotations["file"] != annotations["file"].shift(1)) | (
        (annotations["start"] - 2*block_shoulder_duration) > annotations["end"].shift(1)
    )
    annotations["block"] = starts_new_block.cumsum()

    groups = [group for _, group in annotations.groupby('block')]
    random.shuffle(groups)
    shuffled: pd.DataFrame = pd.concat(groups).reset_index(drop=True)

    segments = []
    t_prev = 0
    new_t_start = []
    new_t_end = []
    common_fs = None
    for i, row in shuffled.to_dict(orient="index").items():
        audio_path = row["audio_path"]
        if audio_path is None or (isinstance(audio_path, float) and np.isnan(audio_path)):
            raise Exception(f"No audio file found for {row['file']!r}")
        fs, samples = scipy.io.wavfile.read(audio_path, mmap=True)
        if common_fs is None:
            common_fs = fs
        elif fs != common_fs:
            raise Exception("Not same fs")

        n_samples = samples.shape[0]
        raw_start = int((row["start"] - block_shoulder_duration) * fs)
        raw_end = int((row["end"] + block_shoulder_duration) * fs)

        # A negative start index would wrap around to the end of the file, so
        # the cut is clamped to the file start and the leading shoulder shrinks
        # to whatever audio actually precedes the syllable.
        if raw_start < 0:
            istart, t_file_start, lead = 0, 0.0, row["start"]
        else:
            istart = raw_start
            t_file_start = row["start"] - block_shoulder_duration
            lead = block_shoulder_duration

        # Likewise the trailing shoulder is shorter when the file ends early.
        if raw_end > n_samples:
            iend = n_samples
            trail = n_samples / fs - row["end"]
            trail = trail if trail > 0 else 0.0
        else:
            iend, trail = raw_end, block_shoulder_duration

        ar = xr.Dataset()
        ar["data"] = xr.DataArray(samples[istart:iend], dims="t")
        n_cut = ar["data"].size
        ar["t"] = np.arange(n_cut)/fs + t_prev
        ar["file"] = row["audio_path"]
        ar["t_file"] = xr.DataArray(np.arange(n_cut)/fs + t_file_start, dims="t")
        ar["block"] = row["block"]
        ar["label"] = xr.where((ar["t_file"] >= row["start"]) & (ar["t_file"] <= row["end"]), row["label"], "noise")
        ar["syb_num"] = i
        new_t_start.append(t_prev + lead)
        t_prev += n_cut/fs
        new_t_end.append(t_prev - trail)
        segments.append(ar)

    merged: xr.Dataset = xr.concat(segments, dim="t")
    merged["t"].attrs["fs"] = common_fs
    shuffled["new_t_start"] = new_t_start
    shuffled["new_t_end"] = new_t_end
    shuffled["syb_num"] = np.arange(len(shuffled.index))
    return merged, shuffled


In [None]:

# Register `mk_blocks` under the name "blocks" while the merge step is evaluated.
ctx.methods["blocks"] = mk_blocks
# The evaluated entry is unpacked into a dataset and a table.
# NOTE(review): presumably this is the return value of `mk_blocks` — confirm against the config.
merge_data, df = ctx.evaluate(config["processing"]["merge_data"])
# Sampling rate, read from the attrs of the "t" variable.
fs = merge_data["t"].attrs["fs"]
del ctx.methods["blocks"]
display(merge_data)
display(df)

In [None]:
def percent_split(ctx, params):
    """Label every syllable and every sample with the name of a split.

    ``params`` (after evaluation) maps a split name to a fraction; the
    fractions must add up to 1 within 1e-5. Syllables are assigned by
    consecutive ``syb_num`` ranges, in the iteration order of ``params``.
    The result is written to a ``goal`` column/variable of the global ``df``
    and ``merge_data`` (initialised to "none"); nothing is returned.

    Raises
    ------
    Exception
        If the fractions do not sum to 1.
    """
    params = ctx.evaluate(params)
    n_syllables = len(annotations.index)

    total_fraction = np.sum(list(params.values()))
    if np.abs(total_fraction - 1) > 10**(-5):
        raise Exception("problem sum!=1")

    # Everything starts unassigned.
    merge_data["goal"] = xr.DataArray(np.full(merge_data.sizes["t"], "none", dtype=object), dims="t")
    df["goal"] = "none"

    lower = 0
    for split_name, fraction in params.items():
        upper = lower + n_syllables*fraction
        goal_name = str(split_name)

        rows_in_range = (df["syb_num"] >= lower) & (df["syb_num"] < upper)
        df["goal"] = np.where(rows_in_range, goal_name, df["goal"])

        samples_in_range = (merge_data["syb_num"] >= lower) & (merge_data["syb_num"] < upper)
        merge_data["goal"] = xr.where(samples_in_range, goal_name, merge_data["goal"])

        # The next split begins where this one stopped.
        lower = upper

In [None]:
# Register `percent_split` while the split step is evaluated; it returns nothing
# and instead writes a "goal" entry into `merge_data` and `df`.
ctx.methods["percent_split"] = percent_split
ctx.evaluate(config["processing"]["split_data"])
del ctx.methods["percent_split"]
display(merge_data)
display(df)


In [None]:

# Number of rows per label (table rows) and goal (table columns); combinations
# that never occur are filled with 0.
display(df.groupby(["goal", "label"]).size().unstack("goal").fillna(0))

In [None]:
# Per-label statistics of the annotated duration (`end - start`).
# The duration is computed once and summarised with named aggregations rather
# than a per-group Python `apply`.
display(
    df.assign(duration=df["end"] - df["start"])
    .groupby("label")["duration"]
    .agg(
        duration_mean="mean",
        duration_std="std",
        duration_min="min",
        duration_max="max",
    )
    .reset_index()
)

In [None]:
# Spectrogram settings, named once so that the window length used for framing,
# tapering and the frequency axis cannot drift apart.
N_FFT = 512  # samples per analysis window
HOP = 128  # samples between the starts of successive windows
F_MIN_HZ, F_MAX_HZ = 200, 8000  # frequency band kept

# Every variable except "data" becomes a coordinate so that it follows the
# windows; only full windows are kept (min_periods equals the window length).
spectrogram_data = merge_data.set_coords([c for c in merge_data.data_vars if not c=="data"]).rolling(
    t=N_FFT, min_periods=N_FFT, center=True).construct("window_t", stride=HOP)

# Hann taper, real FFT along the window axis, then squared magnitude.
spectrogram_data["windowed"] = spectrogram_data["data"] * xr.DataArray(np.hanning(N_FFT), dims="window_t")
spectrogram_data["fft"]= xr.apply_ufunc(np.fft.rfft, spectrogram_data["windowed"], input_core_dims=[["window_t"]], output_core_dims=[["f"]])
spectrogram_data["psd"] = np.abs(spectrogram_data["fft"])**2
spectrogram_data["f"] = np.fft.rfftfreq(N_FFT, 1/fs)
# Natural logarithm of the squared magnitude, used for display.
spectrogram_data["display_psd"] = np.log(spectrogram_data["psd"])
spectrogram_data = spectrogram_data.sel(f=slice(F_MIN_HZ, F_MAX_HZ))
display(spectrogram_data)

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
# Stored as `psd_max` rather than `max` so the builtin `max()` stays usable in
# every cell that runs after this one.
psd_max = spectrogram_data["display_psd"].max().item()
# Spectrogram heatmap; the colour scale spans the upper half of the value range.
fig.add_trace(go.Heatmap(
    z=spectrogram_data["display_psd"].transpose("f", "t").values, 
    x=spectrogram_data["t"].values, 
    y=spectrogram_data["f"].values,
    hovertemplate ="""
          <b>t: %{x}s</b>
          f: %{y}Hz
          val: %{z} (ln(|fft|**2))
        """.replace('\n', '<br>'),
    zmin=psd_max/2, zmax=psd_max, name="spectrogram"))
# Time extent of the spectrogram (not used further in this cell).
min_t = spectrogram_data["t"].min().item()
max_t = spectrogram_data["t"].max().item()
# Fully transparent line whose only purpose is to carry per-window metadata
# into the unified hover box.
fig.add_trace(go.Scatter(
        x=spectrogram_data["t"].values,
        y=spectrogram_data["f"].isel(f=10).item()*np.ones(spectrogram_data.sizes["t"]),
        customdata= spectrogram_data.assign(src_file=spectrogram_data["file"].astype(str)).reset_coords(["label", "t_file", "goal", "block"])[["label", "src_file", "t_file", "goal", "block"]].reset_coords(drop=True).to_dataframe(),
        mode='lines',
        opacity=0,
        hovertemplate ="""
          <b>syb_label: %{customdata[0]}</b>
          src_file: %{customdata[1]}
          t_file: %{customdata[2]}s
          block_num: %{customdata[4]}
          goal: %{customdata[3]}
        """.replace('\n', '<br>'),
        showlegend=False,
        name="info"
))
# One labelled vertical band per syllable, placed on the merged timeline.
for _, row in df.to_dict(orient="index").items():
    fig.add_vrect(x0=row["new_t_start"], x1=row["new_t_end"], 
                label = dict(
                    text=row["label"],
                    textposition="top center",
                    font=dict(size=20, family="Times New Roman", color="white"),
                ),
                line=dict(color="MediumPurple"))
fig.update_layout(hovermode='x unified', xaxis_title="t (s)", yaxis_title="f (Hz)")
fig.show(config = plotly_config)

In [None]:
def export_das(ctx, params):
    """Write the merged data to disk as one folder of ``.npy`` files per goal.

    For every distinct value of ``df["goal"]`` a sub-folder of
    ``params["dest_folder"]`` receives ``x.npy`` (the samples as a column
    vector) and ``y.npy`` (one column per class, marking where the sample's
    label equals that class). A pickled ``attrs.npy`` dictionary with the
    sampling rates, class names and class types is written next to them.

    Side effect: the global ``merge_data`` gains the ``syb`` and
    ``label_proba`` variables and is rebound to its ``("t", "syb")`` transpose.

    Parameters
    ----------
    ctx : object exposing ``evaluate``; used to resolve ``params``.
    params : mapping (after evaluation) with key ``dest_folder``.
    """
    global merge_data
    params = ctx.evaluate(params)

    # `in` on a pandas Series tests the index rather than the values, so the
    # membership test is done on a plain list of the distinct labels.
    labels = df["label"].drop_duplicates().tolist()
    if "noise" in labels:
        # Keep "noise" as the first class.
        class_names = ["noise"] + [label for label in labels if label != "noise"]
    else:
        class_names = labels
    merge_data["syb"] = xr.DataArray(np.array(class_names, dtype=object), dims="syb")
    merge_data["label_proba"] = merge_data["label"] == merge_data["syb"]
    merge_data = merge_data.transpose("t", "syb")

    goals = df["goal"].drop_duplicates().tolist()
    datasets = {goal: merge_data.where(merge_data["goal"] == goal, drop=True) for goal in goals}
    arrays_by_goal = {
        goal: dict(
            x=subset["data"].to_numpy().reshape(-1, 1),
            y=subset["label_proba"].to_numpy(),
        )
        for goal, subset in datasets.items()
    }
    attrs = dict(
        samplerate_x_Hz=fs,
        samplerate_y_Hz=fs,
        class_names=merge_data["syb"].to_numpy(),
        class_types=["segment"]*merge_data["syb"].size,
    )

    dest = Path(params["dest_folder"])
    # Ensure the destination exists even when there is no goal folder to create.
    dest.mkdir(exist_ok=True, parents=True)
    for goal, arrays in arrays_by_goal.items():
        (dest/goal).mkdir(exist_ok=True, parents=True)
        for array_name, values in arrays.items():
            np.save(dest/goal/(array_name + ".npy"), values)
    # The attributes are a dict, so they can only be stored pickled.
    np.save(dest/"attrs.npy", attrs, allow_pickle=True)



In [None]:
# Register `export_das` under the name "das_export" while the exports are evaluated.
ctx.methods["das_export"] = export_das
# Accept a single export entry as well as a list: a lone entry is wrapped in a
# list, and the wrapped form is written back into `config`.
if not isinstance(config["processing"]["exports"], list):
    config["processing"]["exports"] = [config["processing"]["exports"]]
# Run each configured export in order.
for item in config["processing"]["exports"]:
    ctx.evaluate(item)
del ctx.methods["das_export"]
display(merge_data)