In [1]:
import json
import pandas as pd
import numpy as np

from distance_functions import get_overlap

# Directory the analysis loop reads "<model>_token_data.json" from
# (loaded there as the original token data).
PATH_DATA = "../0_data/6_word_data"
# Directory the analysis loop reads "<model>.json" from
# (loaded there as the generated token data).
PATH_TOKENS = "../0_data/8_predictions/tokens"

In [2]:
# Size of the first partition used for the "within original" baseline: the
# first N_BASELINE_SPLIT original entries are compared against the remainder.
N_BASELINE_SPLIT = 400


def _summarise_overlap(overlap_count, max_overlap):
    """Bundle the two results of ``get_overlap`` together with their aggregates.

    Parameters
    ----------
    overlap_count, max_overlap
        The two values returned by ``get_overlap``, in that order. They are
        stored unchanged and reduced with ``np.sum`` / ``np.max``.
        NOTE(review): the key names suggest these are per-bar sequences --
        confirm against ``distance_functions.get_overlap``.

    Returns
    -------
    dict
        The raw inputs plus ``bar_overlaps_total`` (sum of ``overlap_count``)
        and ``bar_max_overlap_value`` (maximum of ``max_overlap``).
    """
    return {
        "bar_overlap_count": overlap_count,
        "bar_max_overlaps": max_overlap,
        "bar_overlaps_total": np.sum(overlap_count),
        "bar_max_overlap_value": np.max(max_overlap),
    }


# Maps "<model> within original" and "<model>" to their overlap summaries.
plagiarism_analysis = {}

for model in ["a1", "a2", "a3", "b", "c", "d"]:

    print(model)

    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(f"{PATH_DATA}/{model}_token_data.json", "r", encoding="utf-8") as fp:
        tokens_original = json.load(fp)
    # Only the values are needed; order follows the JSON object's key order.
    token_data_original = list(tokens_original.values())
    with open(f"{PATH_TOKENS}/{model}.json", "r", encoding="utf-8") as fp:
        tokens_generated = json.load(fp)
    token_data_generated = tokens_generated["data"]

    # Baseline: overlap of one part of the original data with the other part.
    within_overlap_count, within_max_overlap = get_overlap(
        token_data_original[:N_BASELINE_SPLIT],
        token_data_original[N_BASELINE_SPLIT:],
    )
    plagiarism_analysis[f"{model} within original"] = _summarise_overlap(
        within_overlap_count, within_max_overlap
    )

    # Overlap of the generated data with the complete original data.
    overlap_count, max_overlap = get_overlap(token_data_generated, token_data_original)
    plagiarism_analysis[model] = _summarise_overlap(overlap_count, max_overlap)

a1


100%|██████████| 400/400 [01:50<00:00,  3.62it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]


a2


100%|██████████| 400/400 [01:51<00:00,  3.59it/s]
100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


a3


100%|██████████| 400/400 [01:48<00:00,  3.69it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]


b


100%|██████████| 400/400 [01:34<00:00,  4.23it/s]
100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


c


100%|██████████| 400/400 [01:37<00:00,  4.10it/s]
100%|██████████| 100/100 [00:42<00:00,  2.37it/s]


d


100%|██████████| 400/400 [01:29<00:00,  4.47it/s]
100%|██████████| 100/100 [00:37<00:00,  2.67it/s]


In [3]:
# One column per entry of plagiarism_analysis, one row per summary key.
pd.DataFrame.from_dict(plagiarism_analysis)

Unnamed: 0,a1 within original,a1,a2 within original,a2,a3 within original,a3,b within original,b,c within original,c,d within original,d
bar_overlap_count,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
bar_max_overlaps,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
bar_overlaps_total,442,325,259,190,775,669,769,656,442,493,1347,1184
bar_max_overlap_value,1,1,1,1,2,1,1,1,1,1,2,1
