In [1]:
import json
import pandas as pd
import numpy as np

from distance_functions import get_overlap
from evaluation_metrics import get_tokens_bar, get_token_flags

# All inputs and outputs live under one shared data directory, addressed
# relative to the notebook's working directory.
_DATA_ROOT = "../0_data"

PATH_DATA = _DATA_ROOT + "/6_word_data"            # original token data (JSON per model)
PATH_TOKENS = _DATA_ROOT + "/8_predictions/tokens"  # generated token data (JSON per model)
PATH_EVAL = _DATA_ROOT + "/9_evaluation"            # destination of the evaluation spreadsheet

In [2]:
# Vocabulary configuration per model variant, keyed by model name.
# Each entry is handed to `get_token_flags` when that model is analysed.
vocab_configs = dict(
    a1=dict(pitch_range=128, duration_steps=64, triole_tokens=False, key="all"),
    a2=dict(pitch_range=128, duration_steps=64, triole_tokens=True, key="all"),
    a3=dict(pitch_range=128, duration_steps=32, triole_tokens=False, key="all"),
    b=dict(pitch_range=128, duration_steps=64, triole_tokens=False, key="C"),
    c=dict(pitch_range=36, duration_steps=64, triole_tokens=False, key="all"),
    d=dict(pitch_range=36, duration_steps=32, triole_tokens=True, key="C"),
)

In [3]:
# Index at which the original songs are cut in two for the
# "within original" baseline: songs [:split] are compared with songs [split:].
WITHIN_ORIGINAL_SPLIT = 400


def _load_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path, "r") as fp:
        return json.load(fp)


def _token_views(songs, pitch_bounds, rythm_bounds):
    """Build the three token views of `songs` that are checked for overlap.

    Args:
        songs: list of token sequences, one per song.
        pitch_bounds: `(start, end)` token flags passed to `get_tokens_bar`
            for the pitch view.
        rythm_bounds: `(start, end)` token flags passed to `get_tokens_bar`
            for the rhythm view.

    Returns:
        dict with the keys "bar" (the songs unchanged), "pitch" and "rythm"
        (one `get_tokens_bar` result per song).
    """
    # NOTE(review): the meaning of the literal 0 argument is defined by
    # get_tokens_bar and is not visible from this notebook — confirm there.
    return {
        "bar": songs,
        "pitch": [get_tokens_bar(song, 0, *pitch_bounds) for song in songs],
        "rythm": [get_tokens_bar(song, 0, *rythm_bounds) for song in songs],
    }


def _overlap_summary(candidates, references):
    """Run `get_overlap` on every view and condense the results.

    Args:
        candidates: token views (see `_token_views`) of the songs under test.
        references: token views of the songs they are compared against.

    Returns:
        dict holding, for each of "bar", "pitch" and "rythm", the sum of the
        overlap counts (`<view>_overlaps_total`) and the maximum overlap
        value (`<view>_max_overlap_value`).
    """
    summary = {}
    # The bar view uses get_overlap's defaults; the other two pass bars=False.
    for view, options in (("bar", {}), ("pitch", {"bars": False}), ("rythm", {"bars": False})):
        counts, max_values = get_overlap(candidates[view], references[view], **options)
        summary[f"{view}_overlaps_total"] = np.sum(counts)
        summary[f"{view}_max_overlap_value"] = np.max(max_values)
    return summary


# Maps a comparison label ("<model> within original", "<model>", "<model>_50")
# to its overlap summary; every label becomes one column of the result table.
plagiarism_analysis = {}

for model in ["a1", "a2", "a3", "b", "c", "d"]:

    print(model)

    # The original data is a JSON object with one entry per song; the
    # generated files keep their songs under the "data" key.
    songs_original = list(_load_json(f"{PATH_DATA}/{model}_token_data.json").values())
    songs_generated = _load_json(f"{PATH_TOKENS}/{model}.json")["data"]
    songs_generated_50 = _load_json(f"{PATH_TOKENS}/{model}_50.json")["data"]

    token_flags = get_token_flags(vocab_configs[model])
    pitch_bounds = (token_flags["start_pitch_token"], token_flags["end_pitch_token"])
    # The rhythm range ends at "position_triole_2" when that flag is positive,
    # otherwise at "end_position_token".
    # NOTE(review): presumably the flag is non-positive for vocabularies
    # without triole tokens — confirm in get_token_flags.
    triole_end = token_flags["position_triole_2"]
    rythm_end = triole_end if triole_end > 0 else token_flags["end_position_token"]
    rythm_bounds = (token_flags["start_duration_token"], rythm_end)

    views_original = _token_views(songs_original, pitch_bounds, rythm_bounds)
    views_generated = _token_views(songs_generated, pitch_bounds, rythm_bounds)
    views_generated_50 = _token_views(songs_generated_50, pitch_bounds, rythm_bounds)

    # Baseline: overlap between two disjoint parts of the original data.
    original_head = {view: songs[:WITHIN_ORIGINAL_SPLIT] for view, songs in views_original.items()}
    original_tail = {view: songs[WITHIN_ORIGINAL_SPLIT:] for view, songs in views_original.items()}
    plagiarism_analysis[f"{model} within original"] = _overlap_summary(original_head, original_tail)

    # Generated songs compared against the full original data.
    plagiarism_analysis[f"{model}"] = _overlap_summary(views_generated, views_original)
    plagiarism_analysis[f"{model}_50"] = _overlap_summary(views_generated_50, views_original)

a1


100%|██████████| 400/400 [01:04<00:00,  6.16it/s]
100%|██████████| 400/400 [00:51<00:00,  7.84it/s]
100%|██████████| 400/400 [00:46<00:00,  8.58it/s]
100%|██████████| 100/100 [00:30<00:00,  3.24it/s]
100%|██████████| 100/100 [00:24<00:00,  4.06it/s]
100%|██████████| 100/100 [00:21<00:00,  4.68it/s]
100%|██████████| 100/100 [00:31<00:00,  3.18it/s]
100%|██████████| 100/100 [00:25<00:00,  3.97it/s]
100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


a2


100%|██████████| 400/400 [01:07<00:00,  5.91it/s]
100%|██████████| 400/400 [00:50<00:00,  7.92it/s]
100%|██████████| 400/400 [00:45<00:00,  8.87it/s]
100%|██████████| 100/100 [00:28<00:00,  3.53it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]
100%|██████████| 100/100 [00:18<00:00,  5.46it/s]
100%|██████████| 100/100 [00:28<00:00,  3.48it/s]
100%|██████████| 100/100 [00:21<00:00,  4.71it/s]
100%|██████████| 100/100 [00:18<00:00,  5.43it/s]


a3


100%|██████████| 400/400 [01:07<00:00,  5.91it/s]
100%|██████████| 400/400 [00:53<00:00,  7.53it/s]
100%|██████████| 400/400 [00:47<00:00,  8.39it/s]
100%|██████████| 100/100 [00:30<00:00,  3.26it/s]
100%|██████████| 100/100 [00:25<00:00,  4.00it/s]
100%|██████████| 100/100 [00:21<00:00,  4.71it/s]
100%|██████████| 100/100 [00:32<00:00,  3.09it/s]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
100%|██████████| 100/100 [00:23<00:00,  4.29it/s]


b


100%|██████████| 400/400 [01:04<00:00,  6.18it/s]
100%|██████████| 400/400 [00:49<00:00,  8.15it/s]
100%|██████████| 400/400 [00:44<00:00,  8.90it/s]
100%|██████████| 100/100 [00:28<00:00,  3.46it/s]
100%|██████████| 100/100 [00:22<00:00,  4.49it/s]
100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
100%|██████████| 100/100 [00:30<00:00,  3.29it/s]
100%|██████████| 100/100 [00:24<00:00,  4.11it/s]
100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


c


100%|██████████| 400/400 [01:04<00:00,  6.20it/s]
100%|██████████| 400/400 [00:49<00:00,  8.09it/s]
100%|██████████| 400/400 [00:45<00:00,  8.85it/s]
100%|██████████| 100/100 [00:30<00:00,  3.31it/s]
100%|██████████| 100/100 [00:23<00:00,  4.21it/s]
100%|██████████| 100/100 [00:20<00:00,  4.77it/s]
100%|██████████| 100/100 [00:30<00:00,  3.23it/s]
100%|██████████| 100/100 [00:24<00:00,  4.04it/s]
100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


d


100%|██████████| 400/400 [01:07<00:00,  5.90it/s]
100%|██████████| 400/400 [00:51<00:00,  7.71it/s]
100%|██████████| 400/400 [00:45<00:00,  8.79it/s]
100%|██████████| 100/100 [00:28<00:00,  3.54it/s]
100%|██████████| 100/100 [00:21<00:00,  4.68it/s]
100%|██████████| 100/100 [00:18<00:00,  5.48it/s]
100%|██████████| 100/100 [00:28<00:00,  3.57it/s]
100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
100%|██████████| 100/100 [00:18<00:00,  5.54it/s]


In [4]:
# Turn the nested summary dict into a table: each outer key (comparison
# label) becomes a column, each inner key (metric name) becomes a row.
plagiarism_df = pd.DataFrame.from_dict(plagiarism_analysis, orient="columns")
plagiarism_df

Unnamed: 0,a1 within original,a1,a1_50,a2 within original,a2,a2_50,a3 within original,a3,a3_50,b within original,b,b_50,c within original,c,c_50,d within original,d,d_50
bar_overlaps_total,1102,1034,798,676,552,363,2030,1577,2759,1102,888,1025,1102,661,759,1347,714,849
bar_max_overlap_value,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,2,1,2
pitch_overlaps_total,126055,107943,107136,126055,96640,88097,149389,131396,157295,126055,99937,116055,126055,102786,107617,149389,111663,105303
pitch_max_overlap_value,3,2,2,3,3,3,3,2,3,3,3,3,3,2,2,3,3,2
rythm_overlaps_total,10789,11097,10863,7310,6341,4983,15872,14475,21338,10789,9434,12125,10789,9809,10268,10977,8521,7881
rythm_max_overlap_value,2,2,2,1,2,2,2,2,2,2,2,2,2,1,2,2,2,2


In [5]:
# Persist the summary table as a spreadsheet in the evaluation folder.
excel_target = f"{PATH_EVAL}/plagiarism_analysis.xlsx"
plagiarism_df.to_excel(excel_target)