# Wikipedia page history evaluation

Wikipedia pages can be vandalized. Wikipedia itself prioritizes having low amount of false positives from automatic checkers, which can cause the latest version to be vandalized. Try to find a stable – but recent –  version of the page to be used as a context.


## Approach testing – Editor history

Use editor edit-history to see if the user can be considered a "trustworthy".

Recent edits are collected from page, and every editor is checked how their recent edits have been reverted.

TODO: implement checks that allows self-reverts.
TODO: Check that revert is not reverted back.

In [1]:
!pip install -q requests ratelimit
import requests
import ratelimit
try:
    import requests_cache
    requests_cache.install_cache("/tmp/wp-api-cache")
except ImportError:
    print("No requests cache available")
    pass

# Disable hugginface stats
import os
os.environ['HF_HUB_DISABLE_TELEMETRY'] = "1"

import logging
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

# revision tags that are used to indicate revision is reverted by later edit.
# https://en.wikipedia.org/wiki/Special:Tags
MW_REVERTED_TAGS: set[str] = {"mw-reverted"}

@ratelimit.sleep_and_retry
@ratelimit.limits(calls=5, period=1)
def wpapi(params, headers={}):
    url = "https://en.wikipedia.org/w/api.php"
    params.setdefault("format", "json")
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()


### Detecting revert

Wikipedia doesn't contain reliable labeled information that can be used to indicate if the edit is a reverted. Sometimes the edit might contain tags as indicator, sometimes not. We can use the edit comment to check if the edit is a reverting previous work. If the comment contains keywords such as "revert", "undid", or "rv" plus the user's name or revision number, we can consider the edit as a revert.

TODO: Check if the comment mentions that it's reverting to an earlier commit.

In [2]:
# Function to detect if the edit is a revert

import re
from typing import TypedDict
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

revert_phrases = [
    "Reverted edits by {user!r}",
    "Reverted 1 pending edit by [[Special:Contributions/{user}|{user}]] to revision <number> by <username>: <description>",
    "Reverting possible vandalism by {user!r} to version by <username>",  # Cluebot NG
    "Reverted edit by [[Special:Contribs/{user}|{user}]] ([[User talk:{user}|talk]]) to last version by <username>",
    # "Reverted edit ",
    # "Undid revision",
    # "Restored revision <number> by <username>",
    # "Revert",
    # "Rollback",
    # "Undo",
    # "Reverted"
]

class PageRevision(TypedDict, total=False):
    pageid: int
    revid: int
    parentid: int
    user: str
    timestamp: str
    comment: str
    tags: list[str]


def is_revert(revision: PageRevision, original_edit, threshold=0.75):
    """
    Indicate if the edit is a revert.

    Uses sentence-transformers to semantically compare the comment with
    predefined revert indicator phrases.

    Returns True if the comment is likely a revert.
    """

    comment = revision.get("comment", "")
    # Remove comment (/* ... */) from the comment that is used to indicate section
    comment = re.sub(r'/\* .*? \*/', '', comment)

    if not comment:
        logger.info("No comment found in revision: %r", revision)
        return False

    # If the revision ID or username is not mentioned, it's not likely a revert. 
    username = original_edit.get("user", "")
    revision_id_str = str(original_edit.get("revid"))

    rev_pattern = r'\b' + re.escape(revision_id_str) + r'\b'
    username_pattern = r'\b' + re.escape(username) + r'\b'
    
    has_revision_id = bool(re.search(rev_pattern, comment))
    user_mentioned = bool(re.search(username_pattern, comment))

    if not any((has_revision_id, user_mentioned)):
        logger.debug("Revision ID %s or username %r not mentioned in comment: %r", revision_id_str,  username, comment)
        return False

    formatted_revert_phrases = [s.format(**original_edit) for s in revert_phrases]

    # Encode the revert phrases.
    revert_embeddings = model.encode(formatted_revert_phrases, convert_to_tensor=True, show_progress_bar=False)

    # Encode the comment.
    comment_embedding = model.encode(comment, convert_to_tensor=True, show_progress_bar=False)
    # Compute (cosine) similarity with the pre-encoded revert phrases.
    cosine_scores = model.similarity(comment_embedding, revert_embeddings)

    #max_score = cosine_scores.max().item()
    max_score = 0.0
    for i, score in enumerate(cosine_scores[0]):
        max_score = max(score, max_score)
        print(f" {max_score:.2f} - Pattern {formatted_revert_phrases[i]!r} match to {comment!r}")

    # Set a threshold for similarity (this needs to be fine-tuned).
    if max_score >= threshold:
        return True
    return False


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [3]:
def fetch_last_revisions(page_title, limit=20) -> list[PageRevision]:
    """
    Fetches the last `limit` revisions for the given Wikipedia page title.
    """
    params = {
        'action': 'query',
        'prop': 'revisions',
        'titles': page_title,
        'rvlimit': limit,
        'rvprop': 'ids|timestamp|user|comment|tags',
    }

    data = wpapi(params)
    pages = data.get('query', {}).get('pages', {})
    revisions = []
    # The query returns a dictionary keyed by pageid
    for page_id, page in pages.items():
        if "missing" in page:
            print(f"The page '{page_title}' does not exist on Wikipedia.")
            return None
        revisions = page.get('revisions', [])
        revisions.extend([PageRevision(**rev, pageid=page_id) for rev in revisions])
    return revisions

page_edits = fetch_last_revisions("Sofi_Tukker", limit=15)
display(page_edits)



[{'revid': 1276236484,
  'parentid': 1276112103,
  'user': 'Duckmather',
  'timestamp': '2025-02-17T18:12:39Z',
  'comment': '/* History */ fix broken citation',
  'tags': ['visualeditor-wikitext']},
 {'revid': 1276112103,
  'parentid': 1276109139,
  'user': '162 etc.',
  'timestamp': '2025-02-16T23:14:40Z',
  'comment': '[[WP:NAMB]]',
  'tags': ['wikieditor']},
 {'revid': 1276109139,
  'parentid': 1272098082,
  'user': '107.116.79.140',
  'anon': '',
  'timestamp': '2025-02-16T22:51:33Z',
  'comment': '',
  'tags': ['mobile edit', 'mobile web edit']},
 {'revid': 1272098082,
  'parentid': 1258534763,
  'user': '174.92.221.85',
  'anon': '',
  'timestamp': '2025-01-27T04:07:16Z',
  'comment': 'Added citations',
  'tags': ['visualeditor', 'editcheck-newreference']},
 {'revid': 1258534763,
  'parentid': 1256971887,
  'user': 'GreenC bot',
  'timestamp': '2024-11-20T06:58:32Z',
  'comment': 'Rescued 1 archive link; Move 5 urls. [[User:GreenC/WaybackMedic_2.5|Wayback Medic 2.5]] per [[WP:UR

In [4]:
def fetch_user_contributions(username: str, limit=100) -> list[PageRevision]:
    """
    Fetches up to `limit` contributions made by the given user.
    """
    params = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": username,
        "uclimit": limit,
        "ucprop": "ids|title|timestamp|comment|tags",
        "format": "json"
    }
    data = wpapi(params=params)

    # Only consider the page edits
    return [PageRevision(edit) for edit in data.get("query", {}).get("usercontribs", []) if edit.get("ns", 0) == 0]

username = page_edits[-1]['user']
user_contributions = fetch_user_contributions(username, limit=100)
display(user_contributions)

[{'userid': 41625025,
  'user': '162 etc.',
  'pageid': 11264584,
  'revid': 1276460266,
  'parentid': 1209651909,
  'ns': 0,
  'title': 'Alexander Volchkov (ice hockey, born 1952)',
  'timestamp': '2025-02-18T23:51:54Z',
  'comment': '[[WP:NAMB]]',
  'tags': ['wikieditor']},
 {'userid': 41625025,
  'user': '162 etc.',
  'pageid': 1959402,
  'revid': 1276458478,
  'parentid': 19279676,
  'ns': 0,
  'title': 'Dominion Super Market',
  'timestamp': '2025-02-18T23:37:21Z',
  'comment': '[[WP:AES|←]]Changed redirect target from [[Dominion Stores]] to [[Dominion (disambiguation)]]',
  'tags': ['mw-changed-redirect-target',
   'disambiguator-link-added',
   'wikieditor']},
 {'userid': 41625025,
  'user': '162 etc.',
  'pageid': 24956462,
  'revid': 1276458329,
  'parentid': 323835595,
  'ns': 0,
  'title': 'Dominion Store',
  'timestamp': '2025-02-18T23:35:52Z',
  'comment': '[[WP:AES|←]]Changed redirect target from [[Dominion (supermarket)]] to [[Dominion (disambiguation)]]',
  'tags': ['mw

In [5]:

from datetime import datetime, timedelta
from typing import Iterator, List

def get_next_revisions(revision: PageRevision, n) -> list[PageRevision]:
    """
    Fetches the next `n` revisions after the given timestamp on the specified page.
    """

    revision_timestamp = revision['timestamp']

    dt = datetime.strptime(revision_timestamp, "%Y-%m-%dT%H:%M:%SZ")
    dt_next = dt + timedelta(seconds=1)
    start_timestamp = dt_next.strftime("%Y-%m-%dT%H:%M:%SZ")

    params = {
        "action": "query",
        "prop": "revisions",
        "pageids": revision['pageid'],
        "rvstart": start_timestamp,
        "rvdir": "newer",
        "rvlimit": n,
        "rvprop": "ids|timestamp|comment|tags"
    }
    data = wpapi(params)
    logger.debug("Next revisions: %r", data)
    pages = data.get("query", {}).get("pages", {})
    revisions_list = []
    for pid, revision in pages.items():
        revisions = revision.get("revisions", [])
        revisions_list.extend([PageRevision(**rev, pageid=pid) for rev in revisions])
    return revisions_list


def check_contribution_reverted(revision: PageRevision, num_later_edits=10) -> List[bool | PageRevision]:
    """
    Has the contribution been reverted?

    Checks if a particular contribution was reverted by examining the next
    `num_later_edits` revisions on the page. If any of those revisions' comments
    indicate a revert targeting the given username or revision, returns True.

    Returns list of >0 if the edit is reverted.
    """

    # Check if the edit has the "mw-reverted" tag.
    cx_tags = MW_REVERTED_TAGS & set(revision.get("tags", []))
    logger.debug("Revision tags: %r", revision.get("tags", []))
    if len(cx_tags):
        logger.info("[REVERTED] Revision %d is tagged as reverted with tag(s) %r", revision['revid'], cx_tags)
        return [True]

    # Check the follow-up revisions if they mention this edit.
    next_revs = get_next_revisions(revision, num_later_edits)
    logger.debug("Number of following revisions: %d", len(next_revs))

    r = []
    for followup_revision in next_revs:
        comment = followup_revision.get("comment", "")

        logger.debug(f"Checking revision {followup_revision['revid']} with comment: {comment!r}")

        if is_revert(followup_revision, revision):
            # TODO: We should recurse to see, if reverting revision has been reverted
            logger.info("[REVERTED] Comment %r is reverted by %r (revid:%d)", revision['comment'], comment, followup_revision['revid'])
            r.append(followup_revision)
    return r

earliest_contrib = page_edits[-1]
print("Checking edit:", earliest_contrib)

reverts = list(check_contribution_reverted(earliest_contrib))

display(reverts)



INFO:__main__:No comment found in revision: {'revid': 1254925063, 'parentid': 1254811923, 'timestamp': '2024-11-02T10:02:53Z', 'comment': '', 'tags': ['mobile edit', 'mobile web edit', 'visualeditor', 'mw-reverted', 'disambiguator-link-added'], 'pageid': '49079438'}
INFO:__main__:No comment found in revision: {'revid': 1255033368, 'parentid': 1254954349, 'timestamp': '2024-11-02T20:16:04Z', 'comment': '', 'tags': ['mobile edit', 'mobile web edit', 'visualeditor', 'mw-manual-revert', 'mw-reverted', 'disambiguator-link-added'], 'pageid': '49079438'}
INFO:__main__:No comment found in revision: {'revid': 1256962846, 'parentid': 1255057937, 'timestamp': '2024-11-12T13:12:44Z', 'comment': '', 'tags': ['visualeditor', 'mw-manual-revert', 'mw-reverted', 'disambiguator-link-added'], 'pageid': '49079438'}


Checking edit: {'revid': 1254811923, 'parentid': 1254792543, 'user': '162 etc.', 'timestamp': '2024-11-01T20:19:03Z', 'comment': 'Undid revision [[Special:Diff/1254792543|1254792543]] by [[Special:Contributions/31.217.4.176|31.217.4.176]] ([[User talk:31.217.4.176|talk]])', 'tags': ['mw-undo', 'wikieditor'], 'pageid': '49079438'}


[]

In [6]:
# Final run - Collect edits, collect editors from the edits, and check how many of the lastest edits by the editor is reverted.

from typing import NamedTuple


class RevertStats(NamedTuple):
    rate: float
    total: int
    reverted: int

def compute_user_revert_rate(username: str, contrib_limit: int = 100, later_edits_to_check: int = 10) -> RevertStats:
    """
    Computes the revert rate for a user by checking up to `contrib_limit`
    of their contributions. For each contribution, it checks the next
    `later_edits_to_check` revisions to see if it was reverted.
    
    Returns a tuple: (revert_rate, total_checked, total_reverted).
    """
    contributions = fetch_user_contributions(username, limit=contrib_limit)
    if not contributions:
        return RevertStats(None, 0, 0)
    total = 0
    reverted = 0
    for contrib in contributions:
        total += 1
        if check_contribution_reverted(contrib, num_later_edits=later_edits_to_check):
            reverted += 1
        # time.sleep(0.2)
    rate = (reverted / total) if total > 0 else 0
    return RevertStats(rate, total, reverted)

username = "2A05:4F44:1701:2500:D010:2C76:10A4:52C7"
rate, total, reverted = compute_user_revert_rate(username)
print(f"Revert rate for {username}: {rate:.2%} ({reverted}/{total})")


INFO:__main__:[[REVERTED] Revision 1256966548 is tagged as reverted with tag(s) {'mw-reverted'}
INFO:__main__:[[REVERTED] Revision 1256962846 is tagged as reverted with tag(s) {'mw-reverted'}
INFO:__main__:[[REVERTED] Revision 1256962795 is tagged as reverted with tag(s) {'mw-reverted'}


Revert rate for 2A05:4F44:1701:2500:D010:2C76:10A4:52C7: 100.00% (3/3)


In [7]:
# Rank the editors by revert rate

editors = set([edit['user'] for edit in page_edits])

revert_rates = {editor: compute_user_revert_rate(editor) for editor in editors}

sorted_revert_rates = sorted(revert_rates.items(), key=lambda x: x[1].rate, reverse=True)

for editor, stats in sorted_revert_rates:
    print(f"{editor}: {stats.rate:.2%} ({stats.reverted}/{stats.total})")


INFO:__main__:No comment found in revision: {'revid': 1143594533, 'parentid': 1140841447, 'timestamp': '2023-03-08T18:38:23Z', 'comment': '', 'tags': ['wikieditor'], 'pageid': '71128864'}
INFO:__main__:No comment found in revision: {'revid': 1157381074, 'parentid': 1153788063, 'timestamp': '2023-05-28T07:11:24Z', 'comment': '', 'tags': ['wikieditor'], 'pageid': '71128864'}
INFO:__main__:No comment found in revision: {'revid': 1159978691, 'parentid': 1157381074, 'timestamp': '2023-06-13T19:13:15Z', 'comment': '', 'tags': ['visualeditor', 'mw-reverted'], 'pageid': '71128864'}
INFO:__main__:No comment found in revision: {'revid': 1090199809, 'parentid': 1087991721, 'timestamp': '2022-05-28T01:57:47Z', 'comment': '/* Early life */', 'tags': ['mw-reverted', 'wikieditor'], 'pageid': '17880869'}
INFO:__main__:No comment found in revision: {'revid': 1103755285, 'parentid': 1101897165, 'timestamp': '2022-08-10T22:41:39Z', 'comment': '', 'tags': ['mobile edit', 'mobile web edit', 'mw-reverted'],