# VanderDeleteBot

A Python script to delete Wikidata claims. Requires knowing the Q ID of the item and the UUID for the claim. Both of these identifiers are routinely stored after a VanderBot upload.

Designed as an add-on to [VanderBot](http://vanderbi.lt/vanderbot). Most of the configuration and functions are copied from the `vanderbot.py` script there, so see that project for more explanation.


In [3]:
# Version string of this script; it is embedded in the User-Agent header built below.
version = '0.1'
# Date string recording when the script was created (not referenced elsewhere in the visible code).
created = '2022-05-05'

# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import uuid
import re
from datetime import datetime
import urllib.parse
import pandas as pd

# Script-wide configuration values.

# CSV file listing the claims to delete.
claims_to_delete_filename = 'author_delete.csv'
# Name of the column holding the claim UUIDs. Note: the Q ID column is hard coded to "qid".
claim_uuid_column_name = 'author_uuid'
# Path to the log file. An empty string means no log file; log output then goes to the console.
log_path = 'delete_log.txt'

# Accumulates error messages as the script runs; starts out empty.
error_log = ''

# Output sent to log_object goes to the log file when a path was given, otherwise to the console screen.
log_object = open(log_path, 'wt', encoding='utf-8') if log_path != '' else sys.stdout

# Location of the credentials file: "home", "working", "gdrive", or a relative or absolute path with trailing "/".
credentials_path_string = 'home'
# Name of the API credentials file.
credentials_filename = 'wikibase_credentials.txt'

if credentials_path_string == 'working':
    # The credential file is in the current working directory.
    credentials_path = credentials_filename
elif credentials_path_string == 'home':
    # The credential file is in the home directory; Path.home() works for both Win and Mac.
    home = str(Path.home())
    credentials_path = '/'.join([home, credentials_filename])
else:
    # The credential file is in the directory whose path was given in credentials_path_string.
    credentials_path = ''.join([credentials_path_string, credentials_filename])

# The limit for bots without a bot flag seems to be 50 writes per minute, i.e. 1.2 s between writes.
# To be safe and avoid getting blocked, leave api_sleep at its default of 1.25 s.
# A larger delay is needed if the user is a "newbie", defined as having an account less than four
# days old and with fewer than 50 edits. The newbie limit is 8 edits per minute, so newbies should
# set the API sleep value to 8 to avoid getting blocked.
api_sleep = 1.25

# DO NOT decrease this limit unless you have obtained a bot flag! If you have a bot flag, then you have
# created your own User-Agent and are not using VanderBot any more. In that case, you must change the
# user_agent_header below to reflect your own information. DO NOT get me in trouble by saying you are
# using my User-Agent if you are going to violate Wikimedia guidelines !!!
# Any value below 1.25 is raised back to 1.25.
api_sleep = max(api_sleep, 1.25)

# See https://meta.wikimedia.org/wiki/User-Agent_policy
user_agent_header = 'VanderDeleteBot/{} (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'.format(version)

# If you don't know what you are doing, leave this value alone. In any case, it is rude to use a value greater than 5.
maxlag = 5

# Media type used for the Accept request header.
accept_media_type = 'application/json'

def generate_header_dictionary(accept_media_type, user_agent_header):
    """Build the dictionary of HTTP request headers.

    accept_media_type: value for the Accept header.
    user_agent_header: value for the User-Agent header.

    Returns a dictionary with the keys 'Accept', 'Content-Type' (always
    'application/json'), and 'User-Agent'.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    headers['Content-Type'] = 'application/json'
    headers['User-Agent'] = user_agent_header
    return headers

# Generate the request header dictionary from the configured Accept media type and User-Agent string.
# NOTE(review): request_header is not referenced again in the visible code; the requests session
# created later sets only a default User-Agent header. Confirm whether this is still needed.
request_header = generate_header_dictionary(accept_media_type,user_agent_header)

# -----------------------------------------------------------------
# function definitions

def retrieveCredentials(path):
    """Read the API endpoint URL, username, and password from a credentials file.

    The first three lines of the file must each have the form ``key=value``, in the
    order endpoint URL, username, password. Only the position of each line matters;
    the key names are not checked. Everything after the first ``=`` on a line is
    taken as the value, so a value (for example a password) may itself contain ``=``.

    path: path to the credentials file.

    Returns a list: [endpointUrl, username, password].

    Raises IndexError if the file has fewer than three lines or if one of the first
    three lines contains no ``=``.
    """
    with open(path, 'rt') as fileObject:
        lineList = fileObject.read().split('\n')
    # Split on the first "=" only: a plain split('=') would silently truncate any
    # value that contains "=" (not unusual in generated passwords).
    credentials = [lineList[lineNumber].split('=', 1)[1] for lineNumber in range(3)]
    return credentials

def getLoginToken(apiUrl):
    """Request a login token from the API at apiUrl.

    Sends a GET request through the module-level ``session`` and returns the value
    found at ['query']['tokens']['logintoken'] in the JSON response.
    """
    response = session.get(
        url=apiUrl,
        params=dict(action='query', meta='tokens', type='login', format='json'),
    )
    tokens = response.json()['query']['tokens']
    return tokens['logintoken']

def logIn(apiUrl, token, username, password):
    """Post a login request to the API at apiUrl.

    token: login token to send as ``lgtoken``.
    username: account name to send as ``lgname``.
    password: password to send as ``lgpassword``.

    The request is made through the module-level ``session``. Returns the decoded
    JSON response without inspecting it.
    """
    form_fields = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json',
    )
    response = session.post(apiUrl, data=form_fields)
    return response.json()

def getCsrfToken(apiUrl):
    """Request a CSRF token from the API at apiUrl.

    Sends a GET request through the module-level ``session`` and returns the value
    found at ["query"]["tokens"]["csrftoken"] in the JSON response.
    """
    response = session.get(
        url=apiUrl,
        params=dict(action="query", meta="tokens", format="json"),
    )
    tokens = response.json()["query"]["tokens"]
    return tokens["csrftoken"]

# This function attempts to post and handles maxlag errors
def attempt_post(apiUrl, parameters):
    """Post to the API, retrying with increasing delays while the server reports maxlag.

    apiUrl: URL of the API endpoint to post to.
    parameters: dictionary of form parameters sent as the body of the POST.

    Returns the decoded JSON response. This is either a response without an error,
    or a response whose error is something other than maxlag; the caller must check
    for an 'error' key.

    Raises SystemExit (exit status 1) if the server still reports maxlag after the
    last retry.

    The request is made through the module-level ``session``.
    See https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
    """
    # maximum number of times to retry a lagged server
    maxRetries = 10
    # Wikidata recommends a retry delay of at least 5 seconds.
    # This differs from api_sleep, which is the delay when there is no lag. The baseDelay is a
    # starting point; the actual delay is doubled with each retry, up to delayLimit seconds.
    baseDelay = 5
    delayLimit = 300

    for retry in range(maxRetries + 1):
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        data = r.json()

        # Inspect the response explicitly instead of relying on a bare "except:" for control flow.
        # A bare except also caught KeyboardInterrupt raised during the wait below, so an
        # interrupted wait returned the maxlag error to the caller as if it were the result.
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # Either the request succeeded or an error other than maxlag was returned.
            return data

        print('Lag of ', error.get('lag', 'unknown'), ' seconds.')
        # The Retry-After value sent by the server is not used; double the delay with each retry.
        recommendedDelay = min(baseDelay * 2**retry, delayLimit)
        if retry != maxRetries:
            # No point in waiting after the final attempt.
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)

    # Execution reaches here only when every attempt returned a maxlag error.
    print('Failed after ' + str(maxRetries) + ' retries.')
    # Abort the script. sys.exit is used rather than the site-injected exit(), and a
    # non-zero status is reported because the run did not complete.
    sys.exit(1)

# ----------------------------------------------------------------
# authentication

# default API resource URL when a Wikibase/Wikidata instance is installed.
resourceUrl = '/w/api.php'

# retrieveCredentials returns [endpoint URL, username, password]
credentials = retrieveCredentials(credentials_path)
endpointUrl = credentials[0] + resourceUrl
user = credentials[1]
pwd = credentials[2]
#userAgentHeader = credentials[3]

# Instantiate session outside of any function so that it's globally accessible.
session = requests.Session()
# Set default User-Agent header so you don't have to send it with every request
session.headers.update({'User-Agent': user_agent_header})

loginToken = getLoginToken(endpointUrl)
data = logIn(endpointUrl, loginToken, user, pwd)

# Stop here if the login was rejected. Without this check the script would carry on,
# fetch a CSRF token for an unauthenticated session, and attempt every deletion with it.
# See https://www.mediawiki.org/wiki/API:Login for the response format.
login_response = data.get('login', {}) if isinstance(data, dict) else {}
if login_response.get('result') != 'Success':
    print('Login failed. Result:', login_response.get('result'), 'Reason:', login_response.get('reason'))
    sys.exit(1)

csrfToken = getCsrfToken(endpointUrl)

# -------------------------------------------
# Beginning of script to process the table

# The input data is in a CSV file (name specified in the configuration section) with only two required columns:
# `qid` and the UUID column whose name is specified in the configuration section. Other columns may be present,
# but will be ignored. One can create this table by copying and pasting from a VanderBot upload table using the
# `property_name_uuid` column associated with `property_name`.

# For information about the wbremoveclaims action, see
# https://www.wikidata.org/w/api.php?action=help&modules=wbremoveclaims
# https://www.wikidata.org/wiki/Special:ApiSandbox#action=wbremoveclaims&claim=Q4115189$D8404CDA-25E4-4334-AF13-A3290BCD9C0N&token=foobar&baserevid=7201010

# Here's what the request JSON looks like.
'''
{
	"action": "wbremoveclaims",
	"format": "json",
	"claim": "Q15397819$7C27786A-5FA6-4813-83B8-8ED8A81FB7D3",
	"token": "5378abbde76544dfb260e49000bf828b6274226d+\\"
}
'''

# Here's what the response JSON looks like.
'''
{
    "pageinfo": {
        "lastrevid": 1632748100
    },
    "success": 1,
    "claims": [
        "Q15397819$7C27786A-5FA6-4813-83B8-8ED8A81FB7D3"
    ]
}
'''

full_error_log = '' # start the full error log (not referenced again in the visible code)

# Read every column as a string and keep empty cells as empty strings rather than NaN.
claims_to_delete_frame = pd.read_csv(claims_to_delete_filename, na_filter=False, dtype = str)

try:
    for index, claim_row in claims_to_delete_frame.iterrows():
        qid = claim_row['qid']
        # Named claim_uuid rather than uuid so the imported uuid module is not overwritten.
        claim_uuid = claim_row[claim_uuid_column_name]

        print('deleting:', index, qid, claim_uuid)

        # Build the parameters to be posted to the API.
        # The claim identifier is the item's Q ID and the claim UUID joined by "$".
        parameterDictionary = {
            'action': 'wbremoveclaims',
            'format':'json',
            'token': csrfToken,
            'claim': qid + '$' + claim_uuid
            }

        if maxlag > 0:
            parameterDictionary['maxlag'] = maxlag
        responseData = attempt_post(endpointUrl, parameterDictionary)
        print('Delete confirmation: ', json.dumps(responseData), file=log_object)
        print('', file=log_object)

        if 'error' in responseData:
            # Fall back to the whole error object if the response carries no 'info' text.
            error_info = responseData['error'].get('info', json.dumps(responseData['error']))
            error_message = 'Error message from API in row ' + str(index) + ': ' + error_info + '\n'
            error_log += error_message
            print(error_message)
            print('failed write due to error from API', file=log_object)
            print('', file=log_object)
            continue # Go on with the next row; the pause below is skipped after a failed request.

        # Do not change this value, see top of script for an explanation
        sleep(api_sleep)

    if error_log != '': # If there were errors display them
        print(error_log)
        if log_path != '': # if there is logging to a file, write the error log to the file
            print('\n\n' + error_log, file=log_object)
    else:
        print('\nNo errors occurred.')
        if log_path != '': # if there is logging to a file, note the absence of errors in the file
            print('\n\nNo errors occurred.', file=log_object)
finally:
    # Close the log even if the run is interrupted or raises, so that what was logged is flushed to disk.
    if log_path != '': # only close the log_object if it's a file (otherwise it's sys.stdout)
        log_object.close()

print('done')
   

deleting: 0 Q116053301 747EF91B-6073-4AC5-85F1-90F607DC0A1D
deleting: 1 Q116053302 BA2C99BD-7226-47EF-852D-2874824508EF
deleting: 2 Q116053303 84FD6ECC-9534-497F-BA49-6FCD8886AD79
deleting: 3 Q116053304 AE21D797-F570-48DF-A8B2-CAFD8E7A9F4C
deleting: 4 Q116053305 9662B2D1-1EE4-45A0-B21F-0F347CF6DD07
deleting: 5 Q116053306 0B325EEB-F227-485C-820D-AF538FC05C0C
deleting: 6 Q116053307 C9E4D36B-99F0-4588-9C5C-1D1C08B04FA1
deleting: 7 Q116053308 8B7B644A-A3A7-4F52-ACBE-1AC60D098A8A
deleting: 8 Q116053309 E020C56B-46E1-48FA-A987-C74EF65F01DC
deleting: 9 Q116053310 3C009CEE-B792-4684-9C4A-9500D72D6DEA
deleting: 10 Q116053311 7ED8CA8E-8057-4462-880E-E6B53B85D210
deleting: 11 Q116053312 BAEF0752-18EA-4172-8D05-93CE637B5A14
deleting: 12 Q116053313 2B5AB582-EE6E-4E74-9F49-4290519373CB
deleting: 13 Q116053313 015482C4-7724-47C8-BBBA-2C223CCC263A
deleting: 14 Q116053313 5AF6A4F5-4D9E-4173-836A-1545B50E7537
deleting: 15 Q116053314 38B1BCDF-E49A-415D-8CED-91B1433161E0
deleting: 16 Q116053314 6DDA5EEC-D