In [1]:
from scrape.wp_get_page import WikipediaPage
from scrape.wp_structured_text import WikipediaStructuredText
import difflib
from setup.db_trie import db
from lakat.submit import content_submit
from config.scrape_cfg import WIKIPEDIA_API_URL, EXAMPLE_ARTICLE_TITLE
from config.bucket_cfg import DEFAULT_ATOMIC_BUCKET_SCHEMA, DEFAULT_MOLECULAR_BUCKET_SCHEMA, DEFAULT_NAME_RESOLUTION_BUCKET_SCHEMA, BUCKET_ID_TYPE_NO_REF
from utils.signing.sign import get_public_key_from_file
from utils.serialize import unserialize, serialize

# db.close()
article_title = EXAMPLE_ARTICLE_TITLE
wp = WikipediaPage(WIKIPEDIA_API_URL)
edit_history = wp.load_content_from_batches(
    article_title, 0, 105, download_if_not_exist=True)
edit_history.reverse()


In [2]:
key_file_prefix="lakat"
    # retrieve
public_key = get_public_key_from_file(key_file_prefix=key_file_prefix)
# create a name registry NR (optional)
contents = list()
data_dict = {
        "schema_id": DEFAULT_NAME_RESOLUTION_BUCKET_SCHEMA,
        "public_key": public_key,
        "parent_bucket": None,
        "data": serialize({}),
        "refs": serialize([])
    }
contents.append(serialize(data_dict))


# socialRefs = [{"id": someId, "value":someValue}]
socialRefs = list()
interactions = {
    "socialRefs": socialRefs,
    "reviews": list(),
    "tokens": list(),
    "bucketRefs": list(),
    "storageProofs": list(),
    "children": list()}

res = content_submit(
        contents=contents,
        interactions=interactions, 
        branchId=None, 
        proof=b'', 
        msg="NAME REGISTRY AND INITIAL SUBMIT", 
        create_branch=True)
for id, type_of_content in res.items():
    if type_of_content == "BRANCH":
        branchId = id
        break
# print("branchId: ", branchId)
# print("res: ", res)

edit = edit_history[0]
structured_text_old = WikipediaStructuredText(edit["*"])
msg = edit["comment"]
old_parts = structured_text_old.parts
contents = list()
order = list()
for i, part in enumerate(old_parts):
    data_dict = {
        "schema_id": DEFAULT_ATOMIC_BUCKET_SCHEMA,
        "public_key": public_key,
        "parent_bucket": None,
        "data": serialize(part.content),
        "refs": serialize([])
    }
    contents.append(serialize(data_dict))
    order.append(i)

# # create a molecular bucket
molecular_data = {
    "order":[{"id": oid, "type": BUCKET_ID_TYPE_NO_REF} for oid in order],
    "name": EXAMPLE_ARTICLE_TITLE}

data_dict = {
    "schema_id": DEFAULT_MOLECULAR_BUCKET_SCHEMA,
    "public_key": public_key,
    "parent_bucket": None,
    "data": serialize(molecular_data),
    "refs": serialize([])
}
contents.append(serialize(data_dict))

res = content_submit(
        contents=contents, 
        interactions=interactions,
        branchId=branchId, 
        proof=b'', 
        msg=msg, 
        create_branch=False)

In [4]:
res

{'QmSHtwJV1w6rKH3CAgpGYPVf5F16vyFfGGEt2xudjWSxXC': 'ATOMIC',
 'QmYhjeyL4cXmtxGGMsJqfucq87c5FcbvqNdxYMUFNwdrZq': 'ATOMIC',
 'QmNviPBkGzzCWEhhuDyXEbV1NwER8GbXYpvAXgTvoWDpkV': 'ATOMIC',
 'QmVi1VysfHc5KNAPmTYJ72rFW77EP4FjdJxcVDaxca7kGg': 'ATOMIC',
 'Qmf2og8L8GyvdAmXVmozhgzPi62uqMB5sHNfZpRV2v5MqR': 'ATOMIC',
 'QmSi5Y9C1aRokVKRZ4WDKA78Zc3ArArEnmaGH8Uma9joXe': 'ATOMIC',
 'QmaEzXn3knZZbQ6iL6TJuWtidFfk3zLEexhVkRC5vDiSXs': 'ATOMIC',
 'QmZmK7RfJuFPKY5U2tXiy31sQyQNcLxvjPtK344oaZ7QJj': 'ATOMIC',
 'QmWLpd2qK5qeC1A5oEKRM3uqp4x8cf47EdKxWi3sFnNEmM': 'ATOMIC',
 'QmUhPKgizvzG6zxPYdq8izoNkTbHDyWafy1V2xQdjHHdHB': 'ATOMIC',
 'QmSCQ3wPf7nhrguyy9Gbn5YGrBfKpcVSuHcm1PizuPaKLu': 'SUBMIT_TRACE',
 'QmUQ3mLexWRjADXzSqmuyHjFUG95QgzPJifiBbYDZ2yRmD': 'SUBMIT',
 'QmahMPQdDWvU9Y7YwUsHuystuEfterDrPHh7sbR3fUjfhS': 'BRANCH'}

In [4]:
index = wp.save_edit_history(title=TITLE, limit=None, requests_limit=8000, batch_size=10, api_sleep_in_sec=2, debug=True)

Batch 10: 500 revisions
Batch 17: 338 revisions


In [3]:
loaded_content = wp.load_content_from_batches(TITLE, 0, 105)

In [4]:
d = [{k:str(v)[0:min(20,len(str(v)))] for k,v in c.items()} for c in loaded_content]
# d[3:7]

In [43]:
from scrape.wp_structured_diffs import Diff, _similar_content
from scrape.wp_structured_text import WikipediaStructuredText

ind = 152
str_text_old = WikipediaStructuredText(loaded_content[ind]["*"])
str_text_new = WikipediaStructuredText(loaded_content[ind+1]["*"])
diff = Diff(old=str_text_old, new=str_text_new)
df = diff.get_diff(similarity_threshold=0.75, zero_level_similarity_threshold=0.95)
df

{'deleted': [{'part': {'title': 'Pandemic', 'super_title': 'Variations on the basic SIR mo', 'level': '3', 'header': 'Pandemic', 'headerless_content': 'An SIR community based model t', 'part_id': '26'},
   'max_similarity': 0.05374823196605375,
   'new_index': 25}],
 'added': [],
 'rearranged': [{'old_index': 27, 'new_index': 26, 'score': 1},
  {'old_index': 28, 'new_index': 27, 'score': 1},
  {'old_index': 29, 'new_index': 28, 'score': 1},
  {'old_index': 30, 'new_index': 29, 'score': 1},
  {'old_index': 31, 'new_index': 30, 'score': 1},
  {'old_index': 32, 'new_index': 31, 'score': 1},
  {'old_index': 33, 'new_index': 32, 'score': 1},
  {'old_index': 34, 'new_index': 33, 'score': 1},
  {'old_index': 35, 'new_index': 34, 'score': 1},
  {'old_index': 36, 'new_index': 35, 'score': 1},
  {'old_index': 37, 'new_index': 36, 'score': 1},
  {'old_index': 38, 'new_index': 37, 'score': 1},
  {'old_index': 39, 'new_index': 38, 'score': 1},
  {'old_index': 40, 'new_index': 39, 'score': 1},
  {'o

In [35]:
# each loaded content has a revid and a parentid. could you make a networkx object from loaded_content? Can you check whether its a chain or something non-trivial?
# import networkx as nx
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd

# G = nx.DiGraph()
# for c in loaded_content:
#     G.add_node(c["revid"], parentid=c["parentid"], user=c["user"], timestamp=c["timestamp"], comment=c["comment"], revid=c["revid"])
#     if c["parentid"] != 0:
#         G.add_edge(c["parentid"], c["revid"])

In [34]:
# # make the nodes a lot smaller
# node_size = [1 for n in G.nodes()]

# # get the nodes with more than one children?
# # get the adjacency matrix and transpose it and 
# A = nx.adjacency_matrix(G)
# # max(A.sum(axis=0))



1

In [8]:

loaded_content[0]

{'revid': 16745668,
 'parentid': 0,
 'user': 'Imoen',
 'timestamp': '2004-09-04T18:24:28Z',
 'contentformat': 'text/x-wiki',
 'contentmodel': 'wikitext',
 'comment': 'Shiny new page',
 '*': "A [[population]] comprises a large number of individuals, all of whom are different in various fields. In order to [[mathematical modelling in epidemiology|model the progress of an epidemic]] in such a population this diversity must be reduced to a few key characteristics which are relevant to the infection under consideration. For example, for most common childhood diseases which confer long-lasting immunity it makes sense to divide the population into those who are [[susceptible]] to the disease, those who are [[infectious disease|infected]] and those who have recovered and are [[immune system|immune]]. These subdivisions of the population are called '''compartments'''.\n\n==The SIR model==\n\nStandard convention labels these three compartments S (for susceptible), I (for infectious) and R (for r

In [37]:
# str_text_old.parts[4].headerless_content
# _similar_content(str_text_old.parts[4].content, str_text_new.parts[2].content)
# loaded_content[0]
for i in range(0, len(loaded_content)-1):
    str_text_old = WikipediaStructuredText(loaded_content[i]["*"])
    str_text_new = WikipediaStructuredText(loaded_content[i+1]["*"])
    diff = Diff(old=str_text_old, new=str_text_new)
    df = diff.get_diff(similarity_threshold=0.75, zero_level_similarity_threshold=0.95)
    if len(df) > 0:
        print(i, i+1, df, "\n\n")

0 1 {'deleted': [], 'added': [], 'rearranged': [], 'modified': [{'old_index': 17, 'new_index': 17, 'score': 1}]} 


1 2 {'deleted': [], 'added': [], 'rearranged': [], 'modified': [{'old_index': 9, 'new_index': 9, 'score': 1}]} 


2 3 {'deleted': [], 'added': [], 'rearranged': [], 'modified': [{'old_index': 8, 'new_index': 8, 'score': 1}]} 


3 4 {'deleted': [], 'added': [], 'rearranged': [], 'modified': [{'old_index': 9, 'new_index': 9, 'score': 1}]} 




KeyboardInterrupt: 

In [12]:
# get the diff using difflib library of loaded_content[ind]["*"] and loaded_content[ind + 1]["*"]
# differencee = difflib.ndiff(loaded_content[ind]["*"].splitlines(keepends=True), loaded_content[ind + 1]["*"].splitlines(keepends=True))
# list(differencee)
# [m[1].content for m in df.get('modified')]
str_text_new.parts[3].content
print(str_text_old.parts[3].level)
# _similar_content(str_text_old.parts[5].content, str_text_new.parts[2].content)
[p.content for p in str_text_old.parts[1:5]]

0


['last1=Ross |first1=Ronald |title=An application of the theory of probabilities to the study of a priori pathometry.—Part I |journal=  Proceedings of the Royal Society of London. Series A, Containing Papers of a Mathematical and Physical Character|date=1 February 1916 |volume=92 |issue=638 |pages=204–230 |doi=10.1098/rspa.1916.0007|bibcode=1916RSPSA..92..204R |doi-access=free ',
 'last1=Ross |first1=Ronald |last2=Hudson |first2=Hilda |title=An application of the theory of probabilities to the study of a priori pathometry.—Part II |journal= Proceedings of the Royal Society of London. Series A, Containing Papers of a Mathematical and Physical Character|date=3 May 1917 |volume=93 |issue=650 |pages = 212–225 |doi=10.1098/rspa.1917.0014 |bibcode=1917RSPSA..93..212R |url=https://royalsocietypublishing.org/doi/10.1098/rspa.1917.0014|doi-access=free ',
 'last1=Ross |first1=Ronald |last2=Hudson |first2=Hilda |title=An application of the theory of probabilities to the study of a priori pathomet

In [28]:
cnt = df["modified"][0][0].content
cnt2 = df["modified"][0][1].content
diffr = difflib.unified_diff(cnt, cnt2, lineterm='')    # # return diff
[str(d) for d in diffr]

['--- ',
 '+++ ',
 '@@ -901,23 +901,6 @@',
 ' -',
 ' 0',
 ' 2',
 '-|',
 '-d',
 '-o',
 '-i',
 '--',
 '-a',
 '-c',
 '-c',
 '-e',
 '-s',
 '-s',
 '-=',
 '-f',
 '-r',
 '-e',
 '-e',
 '- ',
 ' }',
 ' }',
 ' <']

In [11]:
revids = [(x["revid"], x["parentid"]) for x in loaded_content]

In [12]:
# loaded_content[10]

In [9]:
import re

def convert_wikitext_to_html(wikitext):
    """
    Convert basic wikitext markup to HTML.
    
    Handles bold, italic, and new line conversions.
    """
    # Convert bold and italic (''''text'''' to <b><i>text</i></b>)
    html = re.sub(r"'''''(.*?)'''''", r"<b><i>\1</i></b>", wikitext)
    # Convert bold ('''text''' to <b>text</b>)
    html = re.sub(r"'''(.*?)'''", r"<b>\1</b>", html)
    # Convert italic (''text'' to <i>text</i>)
    html = re.sub(r"''(.*?)''", r"<i>\1</i>", html)
    # Convert new lines to <br>
    html = html.replace("\n", "<br>")

    return html

def save_to_html_file(text, filename):
    """
    Save the provided text as an HTML file.

    :param text: The text to be converted and saved.
    :param filename: The name of the file to save the HTML content in.
    """
    html_content = convert_wikitext_to_html(text)
    html_full_content = f"<html><head><title>Wikipedia Page</title></head><body>{html_content}</body></html>"

    # Write the HTML content to a file
    with open(filename, "w", encoding="utf-8") as file:
        file.write(html_full_content)

# Example usage
wikitext = "'''Bold''' and ''italic'' text.\nNew line here."
filename = "ExampleWikipediaPage.html"
save_to_html_file(wikitext, filename)


In [11]:
filename = "test.html"
save_to_html_file(res[3]['*'], filename)

In [17]:
# [{"revid": r["revid"], "parentid": r["parentid"], "comment": r["comment"]} for r in res]

In [23]:
def extract_sections(wikitext):
    sections = {}
    # Extracting the short description
    short_desc_match = re.search(r"{{short description\|(.*?)}}", wikitext)
    short_description = short_desc_match.group(1) if short_desc_match else ""
    sections["Short Description"] = {"level": 0, "content": short_description}

    # Regular expression for section titles
    pattern = r"(={2,})([^=]+)\1"

    # Find the start position of the first section header
    first_section_match = re.search(pattern, wikitext)
    summary_end = first_section_match.start() if first_section_match else len(wikitext)
    summary_start = short_desc_match.end() if short_desc_match else 0
    sections["Summary"] = {"level": 1, "content": wikitext[summary_start:summary_end].strip()}

    # Splitting the text based on section titles
    parts = re.split(pattern, wikitext)

    # Iterate over the rest of the parts to extract sections
    for i in range(1, len(parts), 3):
        level = len(parts[i])  # The number of '=' indicates the level of the section
        title = parts[i + 1].strip()
        content = parts[i + 1] + parts[i + 2].strip()  # Include the section title in the content
        sections[title] = {"level": level, "content": content}

    return sections


wikitext_example = "{{short description|Example Article}}SummaryBlabla\n== Section 1 ==\nContent of section 1\n=== Subsection ===\nContent of subsection."

sections = extract_sections(wikitext_example)
for title, details in sections.items():
    # print(title)
    # print(details)
    print(f"Title: {title}, Level: {details.get('level', 'Summary')}, Content: {details['content']}\n")

Title: Short Description, Level: 0, Content: Example Article

Title: Summary, Level: 1, Content: SummaryBlabla

Title: Section 1, Level: 2, Content:  Section 1 Content of section 1

Title: Subsection, Level: 3, Content:  Subsection Content of subsection.



In [35]:

sections = extract_sections(loaded_content[0]['*'])
for title, details in sections.items():
    content = details.get('content','')
    print(f"Title: {title}, Level: {details.get('level', 'Summary')}, Content: {content[0:min(20, len(content))]}\n")
# loaded_content[0]

Title: Short Description, Level: 0, Content: Type of mathematical

Title: Summary, Level: 1, Content: '''Compartmental mod

Title: The SIR model, Level: 3, Content:  The SIR model [[Fil

Title: Transition rates, Level: 3, Content: Transition ratesFor 

Title: The SIR model without birth and death, Level: 3, Content: The SIR model withou

Title: The force of infection, Level: 4, Content: The force of infecti

Title: Exact analytical solutions to the SIR model, Level: 4, Content: Exact analytical sol

Title: Numerical solutions to the SIR model with approximations, Level: 4, Content: Numerical solutions 

Title: The SIR model with vital dynamics and constant population, Level: 3, Content: The SIR model with v

Title: Steady-state solutions, Level: 3, Content:  Steady-state soluti

Title: Other compartmental models, Level: 3, Content:  Other compartmental

Title: Variations on the basic SIR model, Level: 2, Content: Variations on the ba

Title: The SIS model, Level: 3, Content: The SIS mo

In [67]:
import re
import difflib
from typing import Mapping, List


class Part:
    def __init__(self, title, super_title, level, header, headerless_content) -> None:
        self.title = title
        self.super_title =  super_title
        self.level = level
        self.header = header
        self.headerless_content = headerless_content

    @property
    def content(self):
        header = f"{self.header}\n" if self.header else ""
        return header + self.headerless_content


class WikipediaStructuredText():

    def __init__(self, wikitext):
        self.wikitext = wikitext

    # get a property called parts
    @property
    def parts(self):
        return self.extract_parts()

    def extract_parts(self) -> List[Part]:
        sections = list()
        section_stack = list()
        # Extracting the short description
        short_desc_match = re.search(r"{{short description\|(.*?)}}", self.wikitext)
        short_description = short_desc_match.group(1) if short_desc_match else ""
        sections.append(Part(title="Short Description", super_title="", level=0, header="", headerless_content=short_description))

        # Regular expression for section titles
        pattern = r"(={2,})([^=]+)\1"

        # Find the start position of the first section header
        first_section_match = re.search(pattern, self.wikitext)
        summary_end = first_section_match.start() if first_section_match else len(self.wikitext)
        summary_start = short_desc_match.end() if short_desc_match else 0
        sections.append(Part(title="Summary", super_title="", level=1, headerless_content= self.wikitext[summary_start:summary_end].strip(), header=""))

        # Splitting the text based on section titles
        parts = re.split(pattern, self.wikitext)
        
        for i in range(1, len(parts), 3):
            level = len(parts[i])  # The number of '=' indicates the level of the section
            title = parts[i + 1].strip()
            headerless_content = parts[i + 2].strip()
            super_title = ""

            # Update the stack and determine the super_title
            while section_stack and section_stack[-1].level >= level:
                section_stack.pop()
            if section_stack:
                super_title = section_stack[-1].title

            # Add the current section to the stack
            current_part = Part(title=title, level=level, header=title, headerless_content=headerless_content, super_title=super_title)
            section_stack.append(current_part)

            # Add the section to the list of sections
            sections.append(current_part)

        return sections

In [69]:
import difflib
text1 = WikipediaStructuredText(wikitext=loaded_content[31]['*'])
text2 = WikipediaStructuredText(wikitext=loaded_content[32]['*'])

In [73]:
text1_parts = text1.parts
# text1_parts[3].__dict__

In [43]:
# def fine_grained_diff(text1, text2):
sections1 = text1.parts
sections2 = text2.parts

diffs = {
    "structure_diff": [],
    "header_diff": [],
    "content_diff": {}
}

# Compare structure (section presence)
set1 = set(sections1.keys())
set2 = set(sections2.keys())
diffs["structure_diff"].extend(list(set1 - set2))  # Present in text1 but not in text2
diffs["structure_diff"].extend(list(set2 - set1))  # Present in text2 but not in text1

# Compare headers and short description
for section in set1.union(set2):
    if sections1.get(section) != sections2.get(section):
        diffs["header_diff"].append(section)

# Compare content of each section
for section in set1.intersection(set2):
    content_diff = list(difflib.unified_diff(
        sections1[section]["content"].splitlines(), 
        sections2[section]["content"].splitlines(),
        lineterm=''
    ))
    if content_diff:
        diffs["content_diff"][section] = content_diff

# return diffs


In [126]:
def is_similar(part1, part2, similarity_threshold=0.6):
    """
    Determines if two parts are similar based on their title, level, super_title, and content.

    :param part1: First part to compare.
    :param part2: Second part to compare.
    :param similarity_threshold: Threshold for content similarity.
    :return: Boolean indicating if the parts are similar.
    """
    # Check if levels are different
    if part1.level != part2.level:
        return False

    # Check if titles and super_titles are the same
    if part1.title == part2.title and part1.super_title == part2.super_title:
        return True

    # Check if titles are different but contents are similar
    if part1.title != part2.title and part1.super_title == part2.super_title:
        return _similar_content(part1.content, part2.content) >= similarity_threshold

    # Check if super_titles are different but titles are the same and contents are similar
    if part1.super_title != part2.super_title and part1.title == part2.title:
        return _similar_content(part1.content, part2.content) >= similarity_threshold

    return False


import difflib

def _similar_content(content1, content2):
    """
    Checks if the content of two parts is similar using difflib's SequenceMatcher.

    :param content1: Content of the first part.
    :param content2: Content of the second part.
    :return: Similarity ratio (float) between the contents.
    """
    # Create a SequenceMatcher object
    matcher = difflib.SequenceMatcher(None, content1, content2)

    # The ratio is a measure of the sequences' similarity
    return matcher.ratio()


# Helper function to find a similar part
def find_similar_part_in_second(part_1, part_2_s, similarity_threshold=0.6):
    for i, other_part in enumerate(part_2_s):
        if is_similar(part_1, other_part, similarity_threshold=similarity_threshold):
            return other_part, i
    return None, -1

# Helper function to find a similar part
def find_similar_part_in_first(part_1_s, part_2, similarity_threshold=0.6):
    for i, other_part in enumerate(part_1_s):
        if is_similar(other_part, part_2, similarity_threshold=similarity_threshold):
            return other_part, i
    return None, -1


def analyze_differences(text1_parts, text2_parts, similarity_threshold=0.6):
    deleted, added, rearranged, modified = [], [], [], []


    # Identify deleted and added parts
    for p in text1_parts:
        q, i = find_similar_part_in_second(p, text2_parts)
        if not q:
            deleted.append(p)
    for q in text2_parts:
        p, i = find_similar_part_in_first(text1_parts, q)
        if not p:
            added.append(q)

    # Identify rearranged and modified parts
    for i, p in enumerate(text1_parts):
        # print the title and the first 20 characters of the headerless content (save first the headerless content in a local variable)
        hl_content_p = p.headerless_content
        hl_content_q = q.headerless_content
        
        q, j = find_similar_part_in_second(p, text2_parts)
        if q:
            if i != j:
                rearranged.append((i, j))
            if p.content != q.content:
                modified.append((p, q))

    return {"deleted": deleted, "added": added, "rearranged": rearranged, "modified": modified}

# Example usage
result = analyze_differences(text1.parts, text2.parts)
result



{'deleted': [],
 'added': [],
 'rearranged': [],
 'modified': [(<__main__.Part at 0x7fccd7585280>,
   <__main__.Part at 0x7fccd75b9fa0>),
  (<__main__.Part at 0x7fccd7585a00>, <__main__.Part at 0x7fccd76296a0>),
  (<__main__.Part at 0x7fccd7585a30>, <__main__.Part at 0x7fccd76292e0>)]}

In [122]:
part1 = text1.parts[4]
part2 = text2.parts[3]
content1 = part1.content
content2 = part2.content
(_similar_content(part1.content, part2.content),
(part1.title, part1.super_title, part1.level, part1.headerless_content[0:min(20, len(part1.headerless_content))]),
(part2.title, part2.super_title, part2.level, part2.headerless_content[0:min(20, len(part2.headerless_content))]))

diff = list(difflib.unified_diff(content1.splitlines(), content2.splitlines(), lineterm=''))
print(diff)
diff_lines = sum(1 for line in diff if line.startswith('+ ') or line.startswith('- '))
total_lines = max(len(content1.splitlines()), len(content2.splitlines()))

# Check if the diff affects more than the similarity threshold
# return 1 - diff_lines / total_lines

['--- ', '+++ ', '@@ -1,73 +1,6 @@', '-The SIR model without birth and death', '-[[File:SIR trajectory.png|thumb|400px|right|A single realization of the SIR epidemic as produced with an implementation of the [[Gillespie algorithm]] and the numerical solution of the ordinary differential equation system (dashed).]]', '-The dynamics of an epidemic, for example, the [[Influenza|flu]], are often much faster than the dynamics of birth and death, therefore, birth and death are often omitted in simple compartmental models.  The SIR system without so-called vital dynamics (birth and death, sometimes called demography) described above can be expressed by the following system of ordinary [[differential equations]]:<ref name="Beckley"/><ref name="Hethcote2000">{{cite journal |author=Hethcote H |title=The Mathematics of Infectious Diseases |journal=SIAM Review |volume=42 |issue= 4|pages=599–653 |year=2000 |doi=10.1137/s0036144500371907|bibcode=2000SIAMR..42..599H |s2cid=10836889 }}</ref>', '+Trans

In [29]:
import difflib

def compare_texts(text1, text2):
    """
    Generate a diff between two texts.

    :param text1: The first text string.
    :param text2: The second text string.
    :return: The diff between the two texts.
    """
    text1_lines = text1.splitlines()
    text2_lines = text2.splitlines()

    diff = difflib.unified_diff(text1_lines, text2_lines, lineterm='')

    # return diff
    return [str(d) for d in diff]

# Example usage
text_version_1 = "This is the first version of the text."
text_version_2 = "This is the second version of the text."
diff_result = compare_texts(text_version_1, text_version_2)
print(diff_result)


['--- ', '+++ ', '@@ -1 +1 @@', '-This is the first version of the text.', '+This is the second version of the text.']


In [32]:
compare_texts(res[15]['*'], res[14]['*'])


['--- ',
 '+++ ',
 '@@ -9,7 +9,7 @@',
 ' ',
 ' ==The SIR model==',
 ' ',
 '-The \'\'\'SIR model\'\'\'<ref name="Harko">{{Cite journal| vauthors = Harko T, Lobo FS, Mak MK |s2cid=14509477|title=Exact analytical solutions of the Susceptible-Infected-Recovered (SIR) epidemic model and of the SIR model with equal death and birth rates |journal=Applied Mathematics and Computation|language=en|volume=236|pages=184–194|year=2014 |doi=10.1016/j.amc.2014.03.030|bibcode=2014arXiv1403.2160H |arxiv=1403.2160 }}</ref><ref name="Beckley">{{cite journal | vauthors = Beckley R, Weatherspoon C, Alexander M, Chandler M, Johnson A, Bhatt GS |date=2013 |title=Modeling epidemics with differential equations |url=http://www.tnstate.edu/mathematics/mathreu/filesreu/GroupProjectSIR.pdf |journal=Tennessee State University Internal Report |access-date=July 19, 2020}}</ref><ref name="KrogerSchlickeiser">{{Cite journal| vauthors = Kröger M, Schlickeiser R |s2cid=225555567 |title=Analytical solution of the SIR-model