In [1]:
from scrape.wp_get_page import WikipediaPage
from scrape.wp_structured_text import WikipediaStructuredText
from scrape.wp_structured_diffs import Diff
import difflib
from setup.db_trie import db
from lakat.submit import content_submit
from config.scrape_cfg import WIKIPEDIA_API_URL, EXAMPLE_ARTICLE_TITLE
from config.bucket_cfg import DEFAULT_ATOMIC_BUCKET_SCHEMA, DEFAULT_MOLECULAR_BUCKET_SCHEMA, DEFAULT_NAME_RESOLUTION_BUCKET_SCHEMA, BUCKET_ID_TYPE_NO_REF, BUCKET_ID_TYPE_WITH_ID_REF
from utils.signing.sign import get_public_key_from_file
from utils.serialize import unserialize, serialize
from scrape.wp_structured_diffs import _similar_content

# db.close()
# Load the revision history of the example article through the Wikipedia API wrapper.
article_title = EXAMPLE_ARTICLE_TITLE
wp = WikipediaPage(WIKIPEDIA_API_URL)
# NOTE(review): 0 and 105 look like the bounds of the batch/revision range to load,
# and download_if_not_exist=True presumably fetches missing batches — confirm
# against WikipediaPage.load_content_from_batches.
edit_history = wp.load_content_from_batches(
    article_title, 0, 105, download_if_not_exist=True)
# Reverse the list in place; later cells walk it upwards from index 0.
# NOTE(review): presumably the loader returns newest-first, so index 0 is the
# oldest revision after this call — confirm.
edit_history.reverse()


In [2]:
# inds = list()
# for ind in range(len(edit_history) - 1): 
# ind = 65 ##65
# str_text_old = WikipediaStructuredText(edit_history[ind]["*"])
# str_text_new = WikipediaStructuredText(edit_history[ind+1]["*"])
# diff = Diff(old=str_text_old, new=str_text_new)
# df = diff.compare(similarity_threshold=0.75, zero_level_similarity_threshold=0.95)
# # if len(df["modified"]) != 0 and len(df["rearranged"])!=0:
# #     inds.append(ind)
# df

In [3]:
def getDefaultEmptyInteractions():
    """Build a fresh interactions payload in which every field is an empty list.

    Returns:
        dict: maps each of "socialRefs", "reviews", "tokens", "bucketRefs",
        "storageProofs" and "children" (in that order) to its own new list.
    """
    field_names = (
        "socialRefs",
        "reviews",
        "tokens",
        "bucketRefs",
        "storageProofs",
        "children",
    )
    # A comprehension (not dict.fromkeys) so that no two keys share a list.
    return {field: [] for field in field_names}


def branch_creation(public_key, verbose=True):
    """Create a branch by submitting a single, empty name-resolution bucket.

    Args:
        public_key: value stored in the bucket's "public_key" field.
        verbose: when true, print the complete result of content_submit.

    Returns:
        The value stored under "branch_id" in the submit result, or None when
        the result carries no such key.
    """
    name_registry_bucket = {
        "schema_id": DEFAULT_NAME_RESOLUTION_BUCKET_SCHEMA,
        "public_key": public_key,
        "parent_bucket": None,
        "data": serialize({}),
        "refs": serialize([])
    }
    contents = [serialize(name_registry_bucket)]

    # socialRefs = [{"id": someId, "value":someValue}]
    interactions = getDefaultEmptyInteractions()

    res = content_submit(
        contents=contents,
        interactions=interactions,
        branchId=None,  # no existing branch is passed; create_branch is True below
        proof=b'',
        msg="NAME REGISTRY AND INITIAL SUBMIT",
        create_branch=True)

    if verbose:
        print(res)

    # Direct lookup instead of scanning every item of the result for one key.
    return res.get("branch_id")

In [26]:
def first_submit(edit, public_key, branchId):
    """Submit the initial revision of the article to an existing branch.

    Each part of the revision becomes its own atomic bucket; one molecular
    bucket that lists those parts in order is appended as the last entry of
    the submission.

    Args:
        edit: revision record; the keys "*" (revision text), "comment" and
            "user" are read.
        public_key: value written into every bucket's "public_key" field.
        branchId: branch identifier forwarded to content_submit.

    Returns:
        tuple: (dict mapping part index to the bucket id found at the same
        position of res["bucket_ids"], the raw content_submit result).
    """
    parts = WikipediaStructuredText(edit["*"]).parts
    summary, editor = edit["comment"], edit["user"]
    msg = f"{summary}. By '{editor}'."

    def atomic_bucket(part):
        # Serialized atomic bucket for one part; parent_bucket is None for
        # every bucket of this first submission.
        return serialize({
            "schema_id": DEFAULT_ATOMIC_BUCKET_SCHEMA,
            "public_key": public_key,
            "parent_bucket": None,
            "data": serialize(part.content),
            "refs": serialize([])
        })

    contents = [atomic_bucket(part) for part in parts]

    # In this submission the position of a bucket equals the index of its part.
    order = list(range(len(contents)))
    submission_id_to_new_part_id = {position: position for position in order}

    # The molecular bucket goes last, after all atomic buckets.
    molecular_bucket = {
        "schema_id": DEFAULT_MOLECULAR_BUCKET_SCHEMA,
        "public_key": public_key,
        "parent_bucket": None,
        "data": serialize({
            "order": [{"id": position, "type": BUCKET_ID_TYPE_NO_REF} for position in order],
            "name": EXAMPLE_ARTICLE_TITLE}),
        "refs": serialize([])}
    contents.append(serialize(molecular_bucket))

    res = content_submit(
        contents=contents,
        interactions=getDefaultEmptyInteractions(),
        branchId=branchId,
        proof=b'',
        msg=msg,
        create_branch=False)

    # Translate submission positions into the bucket ids reported by the submit.
    print('submission_id_to_new_part_id', submission_id_to_new_part_id)
    new_part_id_to_bucket_id = dict()
    for submission_id, new_part_id in submission_id_to_new_part_id.items():
        bucket_id = res["bucket_ids"][submission_id]
        print('submission_id', submission_id, 'bucket_id', bucket_id)
        new_part_id_to_bucket_id[new_part_id] = bucket_id

    return new_part_id_to_bucket_id, res



def submit_new_edit(public_key, branchId, prev_edit, new_edit, article_id, part_id_to_bucket_id, similarity_threshold=0.75, zero_level_similarity_threshold=0.95):
    """Submit the difference between two consecutive revisions as one content submit.

    The two revisions are diffed part by part. Parts reported under "new" or
    "modified" are submitted as atomic buckets (a modified part names the
    bucket of its old counterpart as parent_bucket); parts reported under
    "rearranged" are referenced through their existing bucket id. A molecular
    bucket holding the resulting order, with article_id as parent_bucket, is
    appended as the last entry of the submission. The "deleted" entry of the
    diff is not read.

    Args:
        public_key: value written into every bucket's "public_key" field.
        branchId: branch identifier forwarded to content_submit.
        prev_edit: previous revision record; only the key "*" is read.
        new_edit: new revision record; the keys "*", "comment" and "user" are read.
        article_id: bucket id used as parent_bucket of the new molecular bucket.
        part_id_to_bucket_id: mapping from part index in the previous revision
            to bucket id; indexed with the "old_index" values of the diff.
        similarity_threshold: forwarded to Diff.compare.
        zero_level_similarity_threshold: forwarded to Diff.compare.

    Returns:
        tuple: (dict mapping part index in the new revision to bucket id,
        the raw content_submit result).

    Raises:
        Exception: if a part of the new revision is not listed as new,
            rearranged or modified by the diff; nothing is submitted then.
    """
    str_text_old = WikipediaStructuredText(prev_edit["*"])
    str_text_new = WikipediaStructuredText(new_edit["*"])
    comment = new_edit["comment"]
    author = new_edit["user"]
    msg = f"{comment}. By '{author}'."

    diff = Diff(old=str_text_old, new=str_text_new)
    df = diff.compare(
        similarity_threshold=similarity_threshold,
        zero_level_similarity_threshold=zero_level_similarity_threshold)
    print('df', df)

    new_parts = str_text_new.parts
    # One slot per part of the new revision. A slot that is still None after
    # the three passes below was not accounted for by the diff.
    order = [None] * len(new_parts)
    contents = list()
    submission_id_to_new_part_id = dict()

    def queue_atomic_bucket(new_index, parent_bucket):
        # Append an atomic bucket for new_parts[new_index] and remember at
        # which position of the submission it sits.
        position = len(contents)
        contents.append(serialize({
            "schema_id": DEFAULT_ATOMIC_BUCKET_SCHEMA,
            "public_key": public_key,
            "parent_bucket": parent_bucket,
            "data": serialize(new_parts[new_index].content),
            "refs": serialize([])
        }))
        # Buckets created by this submit are referenced by submission position.
        order[new_index] = {"id": position, "type": BUCKET_ID_TYPE_NO_REF}
        submission_id_to_new_part_id[position] = new_index

    ## add new buckets
    for j in df["new"]:
        part = new_parts[j]
        print('new', j, part.title, part.header, part.level)
        queue_atomic_bucket(j, None)

    ## add the rearranged buckets: content unchanged, so reuse the existing bucket id
    for rearranged in df["rearranged"]:
        old_part_id = part_id_to_bucket_id[rearranged["old_index"]]
        new_part = new_parts[rearranged["new_index"]]
        print('rearranged', rearranged["new_index"], new_part.title, new_part.header, new_part.level)
        order[rearranged["new_index"]] = {"id": old_part_id, "type": BUCKET_ID_TYPE_WITH_ID_REF}

    ## add modified buckets: new content whose parent is the old part's bucket
    for modified in df["modified"]:
        old_part_id = part_id_to_bucket_id[modified["old_index"]]
        new_part = new_parts[modified["new_index"]]
        print('mod', modified["new_index"], new_part.title, new_part.header, new_part.level)
        queue_atomic_bucket(modified["new_index"], old_part_id)

    # Check that every new part received an entry. Comparing len(order) with
    # len(new_parts) can never fail because order is pre-sized, so look for
    # slots that were never filled instead.
    missing = [index for index, entry in enumerate(order) if entry is None]
    if missing:
        raise Exception(f"Not all new parts are added: missing part indices {missing}")

    print('order', order)
    ## create new molecular bucket
    molecular_data = {
        "order": order,
        "name": None}

    data_dict = {
        "schema_id": DEFAULT_MOLECULAR_BUCKET_SCHEMA,
        "public_key": public_key,
        "parent_bucket": article_id,
        "data": serialize(molecular_data),
        "refs": serialize([])
    }
    contents.append(serialize(data_dict))

    interactions = getDefaultEmptyInteractions()

    res = content_submit(
        contents=contents,
        interactions=interactions,
        branchId=branchId,
        proof=b'',
        msg=msg,
        create_branch=False)

    ## Now build the mapping for the new revision
    # buckets created by this submit (new and modified parts)
    new_part_id_to_bucket_id = dict()
    for submission_id, new_part_id in submission_id_to_new_part_id.items():
        new_part_id_to_bucket_id[new_part_id] = res["bucket_ids"][submission_id]
    # rearranged parts keep the bucket id they already had
    for rearranged in df["rearranged"]:
        new_part_id_to_bucket_id[rearranged["new_index"]] = part_id_to_bucket_id[rearranged["old_index"]]

    return new_part_id_to_bucket_id, res


In [27]:
# Prefix handed to get_public_key_from_file to pick the key file.
key_file_prefix="lakat"
# Retrieve the public key; it is written into every bucket submitted below.
public_key = get_public_key_from_file(key_file_prefix=key_file_prefix)
# create a name registry NR (optional)

In [28]:
# Create a branch, then submit the first entry of edit_history as the initial article.
branchId = branch_creation(public_key=public_key, verbose=False)
new_part_id_to_bucket_id, res = first_submit(edit=edit_history[0], public_key=public_key, branchId=branchId)
# first_submit appends the molecular (article) bucket last, so the final entry
# of bucket_ids is taken as the article id.
# NOTE(review): this assumes bucket_ids follows the order of the submitted contents — confirm.
article_id = res["bucket_ids"][-1] ## hack TODO! Change this.
# Submit edit_history[1] as a diff against edit_history[0]; the returned
# mapping replaces the one from the first submit.
new_part_id_to_bucket_id, res = submit_new_edit(
    public_key, 
    branchId, prev_edit=edit_history[0], new_edit=edit_history[1], article_id=article_id, part_id_to_bucket_id=new_part_id_to_bucket_id)

DATA ORDER: [{'id': 0, 'type': 0}, {'id': 1, 'type': 0}, {'id': 2, 'type': 0}, {'id': 3, 'type': 0}, {'id': 4, 'type': 0}, {'id': 5, 'type': 0}, {'id': 6, 'type': 0}, {'id': 7, 'type': 0}, {'id': 8, 'type': 0}, {'id': 9, 'type': 0}]
submission_id_to_new_part_id {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
submission_id 0 bucket_id QmbLb8a86uHbULRdXfCgcMPjoYZWbhT2CkRKEN6o8Tb64T
submission_id 1 bucket_id QmZ8CLdrTCiVap9CEkJW79y6xo4fkNwDqLr6ex2jrn2wTa
submission_id 2 bucket_id QmW38tDzu2tdRmU54q7spuWfCtkzJ8ygkZ1bz7SpvXeRjx
submission_id 3 bucket_id QmRYj22BBpj25czmMYY6AtcRqQdhLgkWAxdApiBnvFBw7v
submission_id 4 bucket_id QmQLKCYqgNHdMiGKfErkcVo8m745PA97Xu61eDeeDL5hYC
submission_id 5 bucket_id QmWz8P2LwZxVbHSHj85bYJoevWt7h9e2HdbjZz5mS1fp86
submission_id 6 bucket_id QmS1rJbPxEKoNDCwU8eG3g3u7sNXqNJvRgjtvWxvBoCLCo
submission_id 7 bucket_id QmTTQ7MYxt2Crw6P3rLdf7zS47aZ4C1WdU7LfQYBvfW1io
submission_id 8 bucket_id QmWPYRLHk2RviJhaK11JqTHfVWUtmLbzWfPab2wDNRruUq
submission_id 9 buck

In [29]:
# Display the part-index -> bucket-id mapping returned by the most recent submit.
new_part_id_to_bucket_id

{8: 'QmbHL2S24fPZ9R9hKPFqSSfyxn9VeKTTQuc5Cwocrp5KLQ',
 0: 'QmbLb8a86uHbULRdXfCgcMPjoYZWbhT2CkRKEN6o8Tb64T',
 1: 'QmZ8CLdrTCiVap9CEkJW79y6xo4fkNwDqLr6ex2jrn2wTa',
 2: 'QmW38tDzu2tdRmU54q7spuWfCtkzJ8ygkZ1bz7SpvXeRjx',
 3: 'QmRYj22BBpj25czmMYY6AtcRqQdhLgkWAxdApiBnvFBw7v',
 4: 'QmQLKCYqgNHdMiGKfErkcVo8m745PA97Xu61eDeeDL5hYC',
 5: 'QmWz8P2LwZxVbHSHj85bYJoevWt7h9e2HdbjZz5mS1fp86',
 6: 'QmS1rJbPxEKoNDCwU8eG3g3u7sNXqNJvRgjtvWxvBoCLCo',
 7: 'QmTTQ7MYxt2Crw6P3rLdf7zS47aZ4C1WdU7LfQYBvfW1io',
 9: 'QmNvN4zp8YonbiiEzi58DmKtGV6uLRarYJC3YtVJjdzQBD'}

In [30]:
# Submit edit_history[2] as a diff against edit_history[1], feeding in the
# mapping produced by the previous submit and overwriting it with the new one.
# NOTE(review): article_id is not reassigned in this cell, so the new molecular
# bucket gets the same parent as in the previous submit — confirm whether the
# parent should instead be the molecular bucket of the previous edit.
new_part_id_to_bucket_id, res = submit_new_edit(
    public_key, 
    branchId, prev_edit=edit_history[1], new_edit=edit_history[2], article_id=article_id, part_id_to_bucket_id=new_part_id_to_bucket_id)

df {'rearranged': [{'old_index': 0, 'new_index': 0}, {'old_index': 1, 'new_index': 1}, {'old_index': 2, 'new_index': 2}, {'old_index': 3, 'new_index': 3}, {'old_index': 4, 'new_index': 4}, {'old_index': 5, 'new_index': 5}, {'old_index': 6, 'new_index': 6}, {'old_index': 8, 'new_index': 8}, {'old_index': 9, 'new_index': 9}], 'modified': [{'old_index': 7, 'new_index': 7, 'score': 0.9984756097560976}], 'new': [], 'deleted': []}
rearranged 0 Summary  1
rearranged 1 The SIR model The SIR model 2
rearranged 2 The SIR model is dynamic in two senses The SIR model is dynamic in two senses 3
rearranged 3 Transition rates Transition rates 3
rearranged 4 Elaborations on the basic SIR model Elaborations on the basic SIR model 2
rearranged 5 The SEIR model The SEIR model 3
rearranged 6 The MSIR model The MSIR model 3
rearranged 8 The SIS model The SIS model 2
rearranged 9 See also See also 2
mod 7 Carrier state Carrier state 3
order [{'id': 'QmbLb8a86uHbULRdXfCgcMPjoYZWbhT2CkRKEN6o8Tb64T', 'type': 1

In [31]:
# Submit edit_history[3] as a diff against edit_history[2]; the mapping from the
# previous submit is the input and is overwritten by the result.
new_part_id_to_bucket_id, res = submit_new_edit(
    public_key, 
    branchId, prev_edit=edit_history[2], new_edit=edit_history[3], article_id=article_id, part_id_to_bucket_id=new_part_id_to_bucket_id)
# Display the raw content_submit result of this edit.
res

df {'rearranged': [{'old_index': 0, 'new_index': 0}, {'old_index': 1, 'new_index': 1}, {'old_index': 3, 'new_index': 3}, {'old_index': 4, 'new_index': 4}, {'old_index': 5, 'new_index': 5}, {'old_index': 6, 'new_index': 6}, {'old_index': 7, 'new_index': 7}, {'old_index': 8, 'new_index': 8}, {'old_index': 9, 'new_index': 9}], 'modified': [{'old_index': 2, 'new_index': 2, 'score': 0.999247554552295}], 'new': [], 'deleted': []}
rearranged 0 Summary  1
rearranged 1 The SIR model The SIR model 2
rearranged 3 Transition rates Transition rates 3
rearranged 4 Elaborations on the basic SIR model Elaborations on the basic SIR model 2
rearranged 5 The SEIR model The SEIR model 3
rearranged 6 The MSIR model The MSIR model 3
rearranged 7 Carrier state Carrier state 3
rearranged 8 The SIS model The SIS model 2
rearranged 9 See also See also 2
mod 2 The SIR model is dynamic in two senses The SIR model is dynamic in two senses 3
order [{'id': 'QmbLb8a86uHbULRdXfCgcMPjoYZWbhT2CkRKEN6o8Tb64T', 'type': 1}

{'bucket_ids': ['QmVffK8CNU1fyMaDoPXBJr2taoCFNW7VArxgmSuZDd3XJG',
  'QmVAQbBRqLwpHFSVvY5HmWaBjqMCFSAGFA9wHa84q5wS7V'],
 'molecule_ids': ['QmVAQbBRqLwpHFSVvY5HmWaBjqMCFSAGFA9wHa84q5wS7V'],
 'branch_id': 'QmcNNwoRae5jAHq4gDKqFe64qVdpRDApMLXCibWRKgMoGW',
 'branch_state': 'QmQwd7SxSUHTMqfgiDSAfrtSAUVQhHEeDuFvNayUtJCYKs',
 'submit_id': 'QmYkfHZ8e1PjGuFeUTZrUcQYpkUMiQCy6FAeGX8QSqJRnd',
 'submit_trac_id': 'QmRytXCiTD3Zco9CZdcVKf2QUxNFd3KwREDN6eJEMxgczL',
 'registered_names': [],
 'nr_regs': -1,
 'name_registration_deployed': False,
 'msg': "/* The SIR model is dynamic in two senses */ Image tweak. By '82.148.33.8'."}