In [None]:
from itertools import combinations, permutations
from pathlib import Path

import numpy as np
import pandas as pd

import src.utils.analyse_utils as autils
from src.clean.make_dataset import CorpusMakerFromExcel

In [None]:
# Settings handed to the corpus builder; only its `tracks` attribute is kept.
# NOTE(review): `dump_json=False` presumably skips writing a JSON copy -- confirm.
corpus_settings = {
    'fname': 'corpus_bill_evans',
    'bandleader': 'Bill Evans',
    'bandleader_role': 'pianist',
    'dump_json': False,
}
corpus = CorpusMakerFromExcel(**corpus_settings).tracks

In [None]:
# One row per corpus entry. The 'musicians' and 'timestamps' columns are
# expanded into one column per key (via `pd.Series`) and the original
# nested column is dropped.
df = pd.DataFrame(corpus)
for nested_col in ('musicians', 'timestamps'):
    expanded = df[nested_col].apply(pd.Series)
    df = pd.concat([df.drop(columns=[nested_col]), expanded], axis=1)

In [None]:
# IDs (compared against the `mbz_id` column) of tracks that must never be
# picked automatically when choosing which tracks to annotate.
exclusion_ids = [
    'e3a24b59-114e-4163-aa05-e71f582f5098',
    'd1b60961-c55e-4cec-861c-d23d9054e49d',
    'cf4fe905-cc4e-4206-812e-47236cbe1598',
    '8828ee88-55c0-4383-8f84-852592c1c817',
    '5ad284e9-cacb-4477-be08-b65e11c4271a',
    '616886f2-9997-4902-8bdf-4a1eff4f3720',
    '2955522b-d70a-4862-9359-397ca4ed2b1d',
    '0fc0ae4f-f8a7-4745-b682-94f1bfbf604c',
    'ed6edc64-8f29-40c9-a036-6a24479a3eb4',
    '426cf53f-23a9-46c8-96c2-451128bc6cdc',
    '18569365-43f0-46ec-9500-7c3d14e9fd18',
    'a47c25f3-1cd7-4b48-aa90-26998a00271a',
    '360d7a67-b8ff-4002-8c5a-e5d87b74c214',
    'b6e70afa-1184-4792-acf3-cb12b6e275bd',
    'ed3a8a72-25d6-408e-8c97-8c1524508d8a',
    '94d467a6-4cd0-416e-aa05-66addb8d648f',
    '056f18dc-5b2f-41ff-840d-86f3e096b6b9',
    '0397954a-3c7b-4879-a03b-b86ac769e23b',
    '5ba04874-b4b9-4f95-a35f-60265a26d200',
    'b0963cd3-311d-4a7b-b475-95a94bee6830',
    '3ef5652b-6880-42eb-817f-36e3be565b68',
]

In [None]:
# IDs of tracks that already have manual annotations.
# NOTE(review): the two commented-out IDs also appear in `exclusion_ids`;
# presumably they were disabled here on purpose -- confirm before removing.
has_manual_annotations = [
    '96983faa-05bb-4900-b8f3-a6af4ec08290',
    '29cee7e1-f0a4-4ee0-be3b-ad1129933c7a',
    # '360d7a67-b8ff-4002-8c5a-e5d87b74c214',
    '57707551-2a88-4a64-ae65-552f1b9ce4bc',
    # '616886f2-9997-4902-8bdf-4a1eff4f3720',
    '902d3125-5eab-4c81-b504-975ca7a8a841',
    'd66bd275-f3ac-45b8-bdd8-60045917d694',
    '597be228-4b30-4d1b-909d-dbe4fb57d937',
]
# Boolean flag per row: True when the row's `mbz_id` is in the list above.
annotated_mask = df['mbz_id'].isin(has_manual_annotations)
df['has_annotations'] = annotated_mask

In [None]:
def parser(x):
    """Parse a timestamp string into a pandas datetime.

    Strings longer than five characters are read as ``%H:%M:%S``;
    anything else is read as ``%M:%S``.
    """
    if len(x) > 5:
        return pd.to_datetime(x, format="%H:%M:%S")
    return pd.to_datetime(x, format="%M:%S")


# Excerpt length is the difference between the parsed end and start stamps.
excerpt_end = df['end'].apply(parser)
excerpt_start = df['start'].apply(parser)
df['excerpt_duration'] = excerpt_end - excerpt_start

In [None]:
# Rows are grouped by the (bassist, drummer) pairing throughout.
pair_keys = ['bassist', 'drummer']
# Per pairing: summed excerpt duration, and one tenth of that sum.
all_tracks = df.groupby(pair_keys)['excerpt_duration'].agg(['sum', lambda x: x.sum() / 10])
# Per pairing: summed excerpt duration over the annotated rows only.
annotated_rows = df[df['has_annotations'] == True]
only_annotated = annotated_rows.groupby(pair_keys)['excerpt_duration'].sum()

In [None]:
# Side-by-side view per pairing: total, a tenth of the total, and what is
# already annotated (pairings with no annotated rows count as zero).
summary = pd.concat([all_tracks, only_annotated], axis=1)
summary.columns = ['total', 'total/10', 'total_annotated']
summary['total_annotated'] = summary['total_annotated'].fillna(pd.Timedelta(0))
# Hold on to the timedelta version of the tenth: it is needed for the
# subtraction below before the column itself is converted to seconds.
tenth_of_total = summary['total/10']
summary['total_required'] = (tenth_of_total - summary['total_annotated']).dt.total_seconds()
summary['total/10'] = tenth_of_total.dt.total_seconds()

In [None]:
def get_closest_match(nums: np.ndarray, targ: int, depth: int = 5) -> np.ndarray:
    """Return the positions in `nums` whose values sum closest to `targ`.

    Every subset of 1 up to ``depth - 1`` elements is considered (the upper
    bound is exclusive). Subsets are built from positions rather than values,
    so repeated values in `nums` are treated as distinct elements and only
    the positions that were actually chosen are returned. Ties are resolved
    in favour of the smaller subset, then the lexicographically smallest
    positions.

    Parameters
    ----------
    nums : np.ndarray
        One-dimensional sequence of numbers to choose from.
    targ : int
        Value the chosen elements should add up to as closely as possible.
    depth : int, default 5
        Exclusive upper bound on the number of elements in a subset.

    Returns
    -------
    np.ndarray
        Sorted integer positions into `nums`. Empty when `nums` is empty or
        when ``depth <= 1`` (no subset can be formed).
    """
    n_items = len(nums)
    best_positions = ()
    best_error = None
    for size in range(1, depth):
        # Element order does not affect a sum, so combinations cover every
        # distinct subset without enumerating each of its orderings.
        for positions in combinations(range(n_items), size):
            error = abs(sum(nums[i] for i in positions) - targ)
            # Strict comparison keeps the first of several equally good subsets.
            if best_error is None or error < best_error:
                best_positions, best_error = positions, error
    return np.array(best_positions, dtype=np.intp)


# Flatten the (bassist, drummer) index of `summary` into ordinary columns
# once; it does not change from one group to the next.
su = summary.reset_index(drop=False)

# For each pairing, pick the not-yet-annotated tracks whose durations add up
# closest to the number of seconds still required for that pairing.
to_annotate = []
for idx, grp in df[(df['has_annotations'] == False)].groupby(['bassist', 'drummer']):
    grp = grp[~grp['mbz_id'].isin(exclusion_ids)]
    if grp.empty:
        # Every candidate for this pairing is excluded: nothing to choose from.
        continue
    ids = grp['mbz_id'].values
    vals = grp['excerpt_duration'].dt.total_seconds().values
    target = su[(su['bassist'] == idx[0]) & (su['drummer'] == idx[1])]['total_required'].iloc[0]
    closest_match = get_closest_match(vals, target)
    to_annotate.extend(ids[closest_match])

In [None]:
# Automatically selected rows, keeping at most one row per
# (bassist, drummer, excerpt_duration) combination.
sub1 = df[df['mbz_id'].isin(to_annotate)].drop_duplicates(subset=['bassist', 'drummer', 'excerpt_duration'])
# Already-annotated rows plus one extra row chosen by position.
# NOTE(review): 81 is a hard-coded positional index; it silently points at a
# different track if the corpus order changes -- consider selecting by `mbz_id`.
# A list indexer (`iloc[[81]]`) yields a one-row DataFrame that keeps each
# column's dtype; going through a Series and transposing would turn every
# column, including the timedelta durations, into object dtype.
sub2 = pd.concat([df[df['has_annotations'] == True], df.iloc[[81]]], axis=0)
# Seconds covered by the combined selection, per pairing, next to the
# seconds required (a tenth of each pairing's total).
seconds_to_get = (
    pd.concat([sub1, sub2], axis=0)
    .groupby(['bassist', 'drummer'])['excerpt_duration']
    .sum()
    .dt.total_seconds()
    .rename('seconds_to_get')
)
pd.concat([seconds_to_get, summary['total/10'].rename('seconds_required')], axis=1)


In [None]:
# Full list of tracks to annotate: automatic picks plus the annotated rows.
big = pd.concat([sub1, sub2], axis=0)
# Assemble the path from its components so it resolves on any operating
# system rather than only where the backslash is the path separator.
tracks_path = Path(autils.get_project_root()) / 'references' / 'manual_annotation' / 'tracks_to_annotate.txt'
# One `mbz_id` per line.
with open(tracks_path, 'w') as f:
    f.writelines(f"{mbz_id}\n" for mbz_id in big.mbz_id.astype(str).to_list())

In [None]:
# Columns to export, in the order they should appear.
export_columns = ['track_name', 'recording_year', 'mbz_id', 'bassist', 'drummer', 'excerpt_duration']
# Copy the selection, ordered by track name, to the system clipboard.
big.sort_values(by='track_name')[export_columns].to_clipboard()

In [None]:
# Show the combined selection (automatic picks plus annotated rows).
big