In [1]:
import json
import pandas as pd
import numpy as np

from distance_functions import get_overlap
from evaluation_metrics import get_tokens_bar, get_token_flags

# Directory holding the original token data; one "<model>_token_data.json" per model is read from here.
PATH_DATA = "../0_data/6_word_data"
# Directory holding the generated token sequences; one "<model>.json" per model is read from here.
PATH_TOKENS = "../0_data/8_predictions/tokens"
# Output directory; the plagiarism analysis spreadsheet is written here.
PATH_EVAL = "../0_data/9_evaluation"

In [2]:
# Vocabulary settings per model variant, keyed by model name. Each entry is
# passed to get_token_flags() to look up that model's token boundaries.
# Table columns: model, pitch_range, duration_steps, triole_tokens, key.
vocab_configs = {
    model: {
        "pitch_range": pitch_range,
        "duration_steps": duration_steps,
        "triole_tokens": triole_tokens,
        "key": key,
    }
    for model, pitch_range, duration_steps, triole_tokens, key in (
        ("a1", 128, 64, False, "all"),
        ("a2", 128, 64, True, "all"),
        ("a3", 128, 32, False, "all"),
        ("b", 128, 64, False, "C"),
        ("c", 36, 64, False, "all"),
        ("d", 36, 32, True, "C"),
    )
}

In [3]:
# Summary values per comparison, keyed by "<model>" (generated vs. original)
# and "<model> within original" (baseline inside the original data).
# Reset here so re-running the cell never mixes in stale results.
plagiarism_analysis = {}

# Size of the first partition for the "within original" baseline: the first
# WITHIN_ORIGINAL_SPLIT original songs are compared against all remaining ones.
WITHIN_ORIGINAL_SPLIT = 400


def build_token_views(songs, token_flags):
    """Build the three per-song token views that are compared for overlap.

    Args:
        songs: list of token sequences, one per song.
        token_flags: mapping returned by get_token_flags(); the pitch and
            duration/position boundary tokens are read from it.

    Returns:
        dict with keys "bar" (the songs unchanged), "pitch" and "rythm"
        (each song passed through get_tokens_bar with the pitch / rhythm
        token boundaries).
    """
    pitch_start = token_flags["start_pitch_token"]
    pitch_end = token_flags["end_pitch_token"]
    rythm_start = token_flags["start_duration_token"]
    # Use "position_triole_2" as the upper rhythm boundary when it is set
    # (> 0), otherwise fall back to "end_position_token".
    # NOTE(review): presumably "position_triole_2" is only positive for
    # vocabularies with triole tokens -- confirm in get_token_flags.
    if token_flags["position_triole_2"] > 0:
        rythm_end = token_flags["position_triole_2"]
    else:
        rythm_end = token_flags["end_position_token"]

    # NOTE(review): the meaning of the literal 0 passed to get_tokens_bar is
    # not visible from this notebook -- confirm in evaluation_metrics.
    return {
        "bar": songs,
        "pitch": [get_tokens_bar(song, 0, pitch_start, pitch_end) for song in songs],
        "rythm": [get_tokens_bar(song, 0, rythm_start, rythm_end) for song in songs],
    }


def summarize_overlap(candidates, references):
    """Compare candidate views against reference views and reduce the results.

    Args:
        candidates: dict of views ("bar", "pitch", "rythm") from build_token_views.
        references: dict of views with the same keys to compare against.

    Returns:
        dict holding, for each view, the sum of the first get_overlap result
        and the maximum of the second.
    """
    # Same call order as before (bar, pitch, rhythm) so the progress bars
    # appear in the same sequence.
    bar_count, bar_max = get_overlap(candidates["bar"], references["bar"])
    pitch_count, pitch_max = get_overlap(candidates["pitch"], references["pitch"], bars=False)
    rythm_count, rythm_max = get_overlap(candidates["rythm"], references["rythm"], bars=False)
    return {
        "bar_overlaps_total": np.sum(bar_count),
        "bar_max_overlap_value": np.max(bar_max),
        "pitch_overlaps_total": np.sum(pitch_count),
        "pitch_max_overlap_value": np.max(pitch_max),
        "rythm_overlaps_total": np.sum(rythm_count),
        "rythm_max_overlap_value": np.max(rythm_max),
    }


# Iterate the configured models directly so this loop cannot drift out of
# sync with vocab_configs (dicts preserve insertion order).
for model in vocab_configs:

    print(model)

    with open(f"{PATH_DATA}/{model}_token_data.json", "r") as fp:
        token_data_original = list(json.load(fp).values())
    with open(f"{PATH_TOKENS}/{model}.json", "r") as fp:
        token_data_generated = json.load(fp)["data"]

    token_flags = get_token_flags(vocab_configs[model])
    original_views = build_token_views(token_data_original, token_flags)
    generated_views = build_token_views(token_data_generated, token_flags)

    # Baseline: overlap between two disjoint partitions of the original data.
    original_head = {name: seqs[:WITHIN_ORIGINAL_SPLIT] for name, seqs in original_views.items()}
    original_tail = {name: seqs[WITHIN_ORIGINAL_SPLIT:] for name, seqs in original_views.items()}
    plagiarism_analysis[f"{model} within original"] = summarize_overlap(original_head, original_tail)

    # Generated songs compared against the complete original data.
    plagiarism_analysis[f"{model}"] = summarize_overlap(generated_views, original_views)

a1


100%|██████████| 400/400 [01:23<00:00,  4.78it/s]
100%|██████████| 400/400 [00:59<00:00,  6.69it/s]
100%|██████████| 400/400 [00:58<00:00,  6.81it/s]
100%|██████████| 100/100 [00:34<00:00,  2.91it/s]
100%|██████████| 100/100 [00:24<00:00,  4.10it/s]
100%|██████████| 100/100 [00:23<00:00,  4.22it/s]


a2


100%|██████████| 400/400 [01:26<00:00,  4.65it/s]
100%|██████████| 400/400 [01:00<00:00,  6.62it/s]
100%|██████████| 400/400 [00:58<00:00,  6.89it/s]
100%|██████████| 100/100 [00:32<00:00,  3.12it/s]
100%|██████████| 100/100 [00:21<00:00,  4.64it/s]
100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


a3


100%|██████████| 400/400 [01:26<00:00,  4.63it/s]
100%|██████████| 400/400 [01:02<00:00,  6.37it/s]
100%|██████████| 400/400 [01:01<00:00,  6.55it/s]
100%|██████████| 100/100 [00:36<00:00,  2.71it/s]
100%|██████████| 100/100 [00:26<00:00,  3.72it/s]
100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


b


100%|██████████| 400/400 [01:14<00:00,  5.38it/s]
100%|██████████| 400/400 [00:55<00:00,  7.26it/s]
100%|██████████| 400/400 [00:52<00:00,  7.64it/s]
100%|██████████| 100/100 [00:31<00:00,  3.13it/s]
100%|██████████| 100/100 [00:23<00:00,  4.24it/s]
100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


c


100%|██████████| 400/400 [01:14<00:00,  5.37it/s]
100%|██████████| 400/400 [00:53<00:00,  7.52it/s]
100%|██████████| 400/400 [00:52<00:00,  7.55it/s]
100%|██████████| 100/100 [00:32<00:00,  3.11it/s]
100%|██████████| 100/100 [00:23<00:00,  4.32it/s]
100%|██████████| 100/100 [00:22<00:00,  4.47it/s]


d


100%|██████████| 400/400 [01:08<00:00,  5.84it/s]
100%|██████████| 400/400 [00:52<00:00,  7.61it/s]
100%|██████████| 400/400 [00:46<00:00,  8.65it/s]
100%|██████████| 100/100 [00:29<00:00,  3.43it/s]
100%|██████████| 100/100 [00:22<00:00,  4.40it/s]
100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


In [4]:
# One column per comparison (outer dict keys), one row per summary value
# (inner dict keys); the trailing expression renders the table.
plagiarism_df = pd.DataFrame.from_dict(plagiarism_analysis, orient="columns")
plagiarism_df

Unnamed: 0,a1 within original,a1,a2 within original,a2,a3 within original,a3,b within original,b,c within original,c,d within original,d
bar_overlaps_total,442,325,259,190,775,669,769,656,442,493,1347,1184
bar_max_overlap_value,1,1,1,1,2,1,1,1,1,1,2,1
pitch_overlaps_total,44926,36279,44926,32755,54958,45672,85606,62159,48402,41771,149389,117004
pitch_max_overlap_value,3,2,3,2,3,2,3,2,3,2,3,2
rythm_overlaps_total,13812,11775,9360,5813,20970,17853,12029,10950,12772,13628,10977,9875
rythm_max_overlap_value,3,2,3,2,4,2,3,2,3,2,2,2


In [6]:
# Persist the summary table as a spreadsheet in the evaluation directory.
plagiarism_xlsx_path = f"{PATH_EVAL}/plagiarism_analysis.xlsx"
plagiarism_df.to_excel(plagiarism_xlsx_path)