In [1]:
import json
import pandas as pd
import numpy as np

from distance_functions import get_overlap
from evaluation_metrics import get_tokens_bar, get_token_flags

# directory the original token data ("<model>_token_data.json") is read from
PATH_DATA = "../0_data/6_word_data"
# directory the generated token files ("<model>.json", "<model>_50.json") are read from
PATH_TOKENS = "../0_data/8_predictions/tokens"
# directory the plagiarism analysis spreadsheet is written to
PATH_EVAL = "../0_data/9_evaluation"

In [2]:
# Vocabulary configs, one row per model.
# Each row lists its values in the order given by _VOCAB_FIELDS.
_VOCAB_FIELDS = ("pitch_range", "duration_steps", "triole_tokens", "key")
_VOCAB_ROWS = {
    "a1": (128, 64, False, "all"),
    "a2": (128, 64, True, "all"),
    "a3": (128, 32, False, "all"),
    "b": (128, 64, False, "C"),
    "c": (36, 64, False, "all"),
    "d": (36, 32, True, "C"),
}

# Expand every row into the {field: value} dict expected by get_token_flags.
vocab_configs = {
    model_name: dict(zip(_VOCAB_FIELDS, row))
    for model_name, row in _VOCAB_ROWS.items()
}

In [3]:
# Number of leading training songs that are compared against the remaining
# training songs for the "within original" baseline.
WITHIN_SPLIT_INDEX = 400


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, "r") as fp:
        return json.load(fp)


def _token_views(songs, pitch_bounds, rythm_bounds):
    """Return the three representations of ``songs`` that get compared.

    ``"bar"`` is the unmodified list of songs; ``"pitch"`` and ``"rythm"`` hold
    the result of ``get_tokens_bar`` for each song, called with the given
    ``(start, end)`` token bounds.
    """
    return {
        "bar": songs,
        "pitch": [get_tokens_bar(song, 0, *pitch_bounds) for song in songs],
        "rythm": [get_tokens_bar(song, 0, *rythm_bounds) for song in songs],
    }


def _overlap_summary(candidates, references):
    """Run ``get_overlap`` on every view and collect the summary scores.

    Both arguments are dicts as produced by ``_token_views``. The views are
    processed in the order bar, pitch, rythm. For each view the sum of the
    first ``get_overlap`` result is stored as ``<view>_overlaps_total`` and the
    maximum of the second result as ``<view>_max_overlap_value``.
    """
    summary = {}
    for view in ("bar", "pitch", "rythm"):
        # The full-token comparison relies on get_overlap's default for `bars`;
        # the pitch and rhythm views are compared with bars=False.
        options = {} if view == "bar" else {"bars": False}
        counts, max_values = get_overlap(candidates[view], references[view], **options)
        summary[f"{view}_overlaps_total"] = np.sum(counts)
        summary[f"{view}_max_overlap_value"] = np.max(max_values)
    return summary


plagiarism_analysis = {}

# vocab_configs holds exactly one entry per model, so it is the single source
# of truth for which models are analysed (and in which order).
for model in vocab_configs:

    print(model)

    # load data
    tokens_original = _load_json(f"{PATH_DATA}/{model}_token_data.json")
    token_data_original = list(tokens_original.values())
    token_data_generated = _load_json(f"{PATH_TOKENS}/{model}.json")["data"]
    token_data_generated_50 = _load_json(f"{PATH_TOKENS}/{model}_50.json")["data"]

    # get token flags
    token_flags = get_token_flags(vocab_configs[model])
    pitch_bounds = (token_flags["start_pitch_token"], token_flags["end_pitch_token"])
    rythm_start = token_flags["start_duration_token"]
    # use the "position_triole_2" flag as the upper bound when it is positive,
    # otherwise fall back to the "end_position_token" flag
    rythm_end = token_flags["position_triole_2"] if token_flags["position_triole_2"] > 0 else token_flags["end_position_token"]
    rythm_bounds = (rythm_start, rythm_end)

    # transform datasets into the bar / pitch / rythm views
    original = _token_views(token_data_original, pitch_bounds, rythm_bounds)
    generated = _token_views(token_data_generated, pitch_bounds, rythm_bounds)
    generated_50 = _token_views(token_data_generated_50, pitch_bounds, rythm_bounds)

    # calculate plagiarism scores within training data
    original_head = {view: songs[:WITHIN_SPLIT_INDEX] for view, songs in original.items()}
    original_tail = {view: songs[WITHIN_SPLIT_INDEX:] for view, songs in original.items()}
    plagiarism_analysis[f"{model} within original"] = _overlap_summary(original_head, original_tail)

    # calculate plagiarism scores between generated data with fewer epochs and training data
    plagiarism_analysis[model] = _overlap_summary(generated, original)

    # calculate plagiarism scores between generated data with 50 epochs and training data
    plagiarism_analysis[f"{model}_50"] = _overlap_summary(generated_50, original)

a1


100%|██████████| 400/400 [01:05<00:00,  6.09it/s]
100%|██████████| 400/400 [00:47<00:00,  8.45it/s]
100%|██████████| 400/400 [00:46<00:00,  8.64it/s]
100%|██████████| 100/100 [00:29<00:00,  3.41it/s]
100%|██████████| 100/100 [00:20<00:00,  4.93it/s]
100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
100%|██████████| 100/100 [00:29<00:00,  3.43it/s]
100%|██████████| 100/100 [00:20<00:00,  4.99it/s]
100%|██████████| 100/100 [00:19<00:00,  5.07it/s]


a2


100%|██████████| 400/400 [01:07<00:00,  5.89it/s]
100%|██████████| 400/400 [00:47<00:00,  8.47it/s]
100%|██████████| 400/400 [00:45<00:00,  8.84it/s]
100%|██████████| 100/100 [00:26<00:00,  3.76it/s]
100%|██████████| 100/100 [00:17<00:00,  5.85it/s]
100%|██████████| 100/100 [00:16<00:00,  6.15it/s]
100%|██████████| 100/100 [00:27<00:00,  3.65it/s]
100%|██████████| 100/100 [00:18<00:00,  5.47it/s]
100%|██████████| 100/100 [00:17<00:00,  5.69it/s]


a3


100%|██████████| 400/400 [01:07<00:00,  5.93it/s]
100%|██████████| 400/400 [00:48<00:00,  8.19it/s]
100%|██████████| 400/400 [00:47<00:00,  8.38it/s]
100%|██████████| 100/100 [00:31<00:00,  3.17it/s]
100%|██████████| 100/100 [00:22<00:00,  4.35it/s]
100%|██████████| 100/100 [00:22<00:00,  4.51it/s]
100%|██████████| 100/100 [00:31<00:00,  3.21it/s]
100%|██████████| 100/100 [00:22<00:00,  4.43it/s]
100%|██████████| 100/100 [00:21<00:00,  4.58it/s]


b


100%|██████████| 400/400 [01:06<00:00,  6.05it/s]
100%|██████████| 400/400 [00:48<00:00,  8.20it/s]
100%|██████████| 400/400 [00:46<00:00,  8.64it/s]
100%|██████████| 100/100 [00:29<00:00,  3.34it/s]
100%|██████████| 100/100 [00:22<00:00,  4.48it/s]
100%|██████████| 100/100 [00:20<00:00,  4.79it/s]
100%|██████████| 100/100 [00:30<00:00,  3.32it/s]
100%|██████████| 100/100 [00:22<00:00,  4.52it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


c


100%|██████████| 400/400 [01:06<00:00,  6.04it/s]
100%|██████████| 400/400 [00:47<00:00,  8.44it/s]
100%|██████████| 400/400 [00:46<00:00,  8.64it/s]
100%|██████████| 100/100 [00:31<00:00,  3.17it/s]
100%|██████████| 100/100 [00:23<00:00,  4.30it/s]
100%|██████████| 100/100 [00:21<00:00,  4.56it/s]
100%|██████████| 100/100 [00:31<00:00,  3.19it/s]
100%|██████████| 100/100 [00:22<00:00,  4.40it/s]
100%|██████████| 100/100 [00:21<00:00,  4.57it/s]


d


100%|██████████| 400/400 [01:09<00:00,  5.76it/s]
100%|██████████| 400/400 [00:53<00:00,  7.55it/s]
100%|██████████| 400/400 [00:46<00:00,  8.69it/s]
100%|██████████| 100/100 [00:28<00:00,  3.54it/s]
100%|██████████| 100/100 [00:21<00:00,  4.67it/s]
100%|██████████| 100/100 [00:18<00:00,  5.55it/s]
100%|██████████| 100/100 [00:29<00:00,  3.37it/s]
100%|██████████| 100/100 [00:23<00:00,  4.17it/s]
100%|██████████| 100/100 [00:19<00:00,  5.12it/s]


In [4]:
# build a table from the plagiarism analysis:
# one column per analysed data set, one row per score
plagiarism_df = pd.DataFrame.from_dict(plagiarism_analysis, orient="columns")
plagiarism_df

Unnamed: 0,a1 within original,a1,a1_50,a2 within original,a2,a2_50,a3 within original,a3,a3_50,b within original,b,b_50,c within original,c,c_50,d within original,d,d_50
bar_overlaps_total,362,237,153,199,137,103,601,614,594,683,528,569,424,442,264,1347,806,1232
bar_max_overlap_value,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1
pitch_overlaps_total,37113,30208,22366,37113,25696,24491,45435,41967,39198,78945,66448,61572,43136,47293,36807,149389,109677,121403
pitch_max_overlap_value,3,2,2,3,2,2,3,2,2,3,2,2,3,2,2,3,3,3
rythm_overlaps_total,10729,8354,8756,7209,3842,4188,15613,14832,15061,10729,9278,10336,10729,8903,9241,10977,7513,9513
rythm_max_overlap_value,2,2,2,1,2,2,2,2,2,2,2,2,2,1,1,2,2,2


In [5]:
# write the plagiarism scores to an Excel file in the evaluation folder
excel_target = f"{PATH_EVAL}/plagiarism_analysis.xlsx"
plagiarism_df.to_excel(excel_target)