# Imports

In [1]:
import json, os, random
# Resolve the input and output folders relative to the current working directory.
# NOTE(review): `dir` shadows the built-in dir(); the name is kept because other
# cells may refer to it.
dir = os.getcwd()
datasets, outputs = (
    os.path.join(dir, "..", folder) for folder in ("datasets", "outputs")
)

from SetSimilaritySearch import all_pairs
import pandas as pd

# NTL, RTL extractor

In [2]:
def _read_documents(path, json_lines):
    """Return the list of JSON documents stored in the file at ``path``.

    Args:
        path: Location of the input file (read as UTF-8).
        json_lines: When true, the file holds one JSON document per line;
            otherwise the whole file is parsed as a single JSON value.

    Returns:
        A list of decoded documents.
    """
    with open(path, encoding='utf-8') as f:
        if json_lines:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            return [json.loads(line) for line in f if line.strip()]
        loaded = json.load(f)
    # A file holding one top-level object is treated as a single document.
    return [loaded] if isinstance(loaded, dict) else loaded


def _collect_leaf_paths(document):
    """Return the "/"-joined key paths of one document, in first-seen order.

    A path is recorded for every dict key whose value is a string, an empty
    dict or an empty list, and for every string found inside a list (the path
    is then the key path leading to that list). Each path appears once.

    NOTE(review): other scalar values (numbers, booleans, null) are skipped,
    so keys that only lead to such values produce no path - confirm that this
    is intended.
    """
    paths = {}  # used as an insertion-ordered set
    stack = []  # keys on the way from the document root to the current node

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                stack.append(key)
                is_container = isinstance(value, (dict, list))
                if isinstance(value, str) or (is_container and not value):
                    paths["/".join(stack)] = None
                if is_container:
                    walk(value)
                stack.pop()
        elif isinstance(node, list):
            for item in node:
                if isinstance(item, str):
                    # List items add no path segment of their own.
                    paths["/".join(stack)] = None
                elif isinstance(item, (dict, list)):
                    walk(item)

    walk(document)
    return list(paths)


def _with_suffixes(paths):
    """Return ``paths`` plus every suffix that starts right after a "/".

    For example ``"a/b/c"`` contributes ``"a/b/c"``, ``"b/c"`` and ``"c"``.
    The result is de-duplicated and sorted so that the written file is
    identical from run to run.
    """
    expanded = set()
    for path in paths:
        parts = path.split("/")
        expanded.update("/".join(parts[i:]) for i in range(len(parts)))
    return sorted(expanded)


def load_doc(string, mode, name=None):
    """Extract the key paths of every document in a dataset and save them as JSON.

    The input file is read from the ``datasets`` folder. The file named
    ``"imdb"`` is parsed as one JSON document per line; any other file is
    parsed as a single JSON value (normally a list of documents). The number
    of documents is printed, then one list of paths per document is written
    to the ``outputs`` folder, which is created if it does not exist.

    Args:
        string: File name of the dataset inside the ``datasets`` folder.
        mode: ``"ntl"`` additionally adds every "/"-separated suffix of each
            path (de-duplicated and sorted). Any other value, e.g. ``"rtl"``,
            keeps only the full paths in first-seen order.
        name: Output file name inside the ``outputs`` folder. Defaults to
            ``"NTL_paths_list.json"`` for every mode.

    Returns:
        None. The result is only written to disk.
    """
    documents = _read_documents(
        os.path.join(datasets, string), json_lines=(string == "imdb")
    )
    print(len(documents), "documents loaded.")

    rtl_paths_list = [_collect_leaf_paths(document) for document in documents]
    if mode == "ntl":
        rtl_paths_list = [_with_suffixes(paths) for paths in rtl_paths_list]

    os.makedirs(outputs, exist_ok=True)
    # The context manager closes the file even if serialisation fails.
    with open(os.path.join(outputs, name or "NTL_paths_list.json"), "w") as f:
        f.write(json.dumps(rtl_paths_list, indent=4))

In [10]:
# Write the "ntl" path lists of both datasets to two separate output files.
load_doc("imdb", "ntl", "f1")
load_doc("Full_Schema.json", "ntl", "f2")

# Read both files back and keep the first 50 documents of each
# (entries from "f1" first, then entries from "f2").
with open(os.path.join(outputs, "f1")) as f1, open(os.path.join(outputs, "f2")) as f2:
    sets1 = json.load(f1)
    sets2 = json.load(f2)
sets = sets1[:50] + sets2[:50]

# Jaccard similarity over the combined collection.
pairs = all_pairs(sets, similarity_func_name="jaccard", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
df.to_csv(os.path.join(outputs, "similarity_result.csv"))

13797 documents loaded.
23374 documents loaded.


In [11]:
# Number of path lists currently held in `sets`.
len(sets)

100

# Jaccard on RTL - IMDB

In [96]:
# Regenerate the IMDB path lists in "rtl" mode; without an explicit name
# load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("imdb", "rtl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="jaccard", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "JACCARD SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
df.to_csv(os.path.join(outputs, "JACCARD-IMDB-RTL.csv"))

13797 documents loaded.


# Jaccard on NTL - IMDB

In [97]:
# Regenerate the IMDB path lists in "ntl" mode; without an explicit name
# load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("imdb", "ntl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="jaccard", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "JACCARD SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "JACCARD-IMDB-NTL.csv"))

13797 documents loaded.


# Jaccard on RTL - Synthetic

In [98]:
# Regenerate the Full_Schema.json path lists in "rtl" mode; without an explicit
# name load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("Full_Schema.json", "rtl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="jaccard", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "JACCARD SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "JACCARD-Synthetic-RTL.csv"))

23374 documents loaded.


# Jaccard on NTL - Synthetic

In [99]:
# Regenerate the Full_Schema.json path lists in "ntl" mode; without an explicit
# name load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("Full_Schema.json", "ntl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="jaccard", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "JACCARD SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "JACCARD-Synthetic-NTL.csv"))

23374 documents loaded.


# Cosine on RTL - IMDB

In [100]:
# Regenerate the IMDB path lists in "rtl" mode; without an explicit name
# load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("imdb", "rtl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="cosine", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "COSINE SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "COSINE-IMDB-RTL.csv"))

13797 documents loaded.


# Cosine on NTL - IMDB

In [101]:
# Regenerate the IMDB path lists in "ntl" mode; without an explicit name
# load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("imdb", "ntl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="cosine", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "COSINE SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "COSINE-IMDB-NTL.csv"))

13797 documents loaded.


# Cosine on RTL - Synthetic

In [102]:
# Regenerate the Full_Schema.json path lists in "rtl" mode; without an explicit
# name load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("Full_Schema.json", "rtl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="cosine", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "COSINE SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "COSINE-Synthetic-RTL.csv"))

23374 documents loaded.


# Cosine on NTL - Synthetic

In [103]:
# Regenerate the Full_Schema.json path lists in "ntl" mode; without an explicit
# name load_doc writes to "NTL_paths_list.json", which is read back here.
load_doc("Full_Schema.json", "ntl")
paths_file = os.path.join(outputs, "NTL_paths_list.json")
with open(paths_file) as f:
    sets = json.load(f)[:500]  # only the first 500 documents are compared

pairs = all_pairs(sets, similarity_func_name="cosine", similarity_threshold=0)
data = list(pairs)

df = pd.DataFrame(data, columns=["Document 1", "Document 2", "COSINE SIMILARITY"])
df = df.sort_values(by=["Document 2", "Document 1"])
# Rows containing missing values are dropped before the CSV is written.
df.dropna().to_csv(os.path.join(outputs, "COSINE-Synthetic-NTL.csv"))

23374 documents loaded.
