# Extract a network of pages from Wikipedia

In [30]:
import os

In [31]:
verbose = True
very_verbose = True
is_new_page = False

In [32]:
seed_file = os.path.join('data', 'NGI_all_keywords.txt')
seed_filename = os.path.basename(seed_file)
seed_filename_noext = os.path.splitext(seed_filename)[0]

In [33]:
# directory containing the seed articles (./data)
data_folder = 'data'
# directory containing the outlinks of each article (./data/links)
link_folder = os.path.join(data_folder, 'links')
# directory containing the resulting networks (./data/results)
results_folder = os.path.join(data_folder, 'results')

os.makedirs(link_folder, exist_ok=True)
os.makedirs(results_folder, exist_ok=True)

In [34]:
import codecs

log = codecs.open(results_folder + '/extract_network.log', 'a', 'UTF-8') # log file
net_file = codecs.open(results_folder + '/all_network_' + seed_filename_noext + '.csv', 'w', 'UTF-8') #file containing the network with all articles

p_id = 0

first_step ={}

In [35]:
import codecs

def load_seed(file_name):
    """
    Load the seed file, ignoring empty lines and lines starting with #
    """
    dic = {}
    f = codecs.open(file_name, 'r', 'utf-8')
    for line in f:
        el = line.strip('\n')
        if el and el[0] != '#':
            el = el.replace(' ', '_')
            dic[el] = 1
    return dic

In [36]:
load_seed(seed_file)

{'Accountability': 1,
 'Algorithmic_bias': 1,
 'Algorithmic_regulation': 1,
 'Anticipatory_governance': 1,
 'Backward_Compatibility': 1,
 'Child_protection': 1,
 'Circular_economy': 1,
 'Community_network': 1,
 'Competition_law': 1,
 'Computer_and_network_surveillance': 1,
 'Computer-supported_collaboration': 1,
 'Context_awareness': 1,
 'Copyright': 1,
 'Credential_stuffing': 1,
 'Cyber_sovereignty': 1,
 'Cybercrime': 1,
 'Data_literacy': 1,
 'Data_localization': 1,
 'Data_ownership': 1,
 'Data_retention': 1,
 'Data_Sovereignty': 1,
 'Decentralization': 1,
 'Decision-making': 1,
 'Digital_citizen': 1,
 'Digital_commons_(economics)': 1,
 'Digital_divide': 1,
 'Digital_economy': 1,
 'Digital_identity': 1,
 'Digital_labor': 1,
 'Digital_learning': 1,
 'Digital_Single_Market': 1,
 'Digital_transformation': 1,
 'Discrimination': 1,
 'Disruptive_innovation': 1,
 'Distance_education': 1,
 'E-Administration': 1,
 'E-commerce': 1,
 'E-democracy': 1,
 'E-procurement': 1,
 'Echo_chamber_(media)'

In [37]:
from wikitools import wiki
from wikitools import api

# create a Wiki object
site = wiki.Wiki("https://en.wikipedia.org/w/api.php")

# get outlinks of a wikipedia article through wiki api
def get_outlinks_from_api(title):
    p_id = -1
    outlinks = []

    if title == '' or title == ' ':
        return p_id, outlinks

    params = {'action':'query',
              'prop':'revisions',
              'titles': title,
              'rvprop':'content',
              'redirects':1
              }
    request = api.APIRequest(site, params)

    if very_verbose:
        print('    ' + 'query: ' + str(params))

    result = request.query()
    if int(list(result['query']['pages'].keys())[0]) < 1:
        print('ARTICLE NOT FOUND: {}'.format(title))
        log.write('{}\n'.format(title))
        return (p_id, outlinks)

    else:
        outlinks = []
        p_id = list(result['query']['pages'].keys())[0]
        rev = result['query']['pages'][p_id]['revisions'][0]
        content = rev['*']
        links = parse_text(content)
        for l in links:
            target = l.replace(' ', '_')
            outlinks.append(target)
        return (p_id, outlinks)


In [38]:
def get_outlinks(title):
    outlinks_filename = '{}_articles.txt'.format(title.replace('/','.'))
    outlinks_saved = os.path.join(link_folder, outlinks_filename)

    # data saved in the disc
    is_new_page = False
    try:
        with open(outlinks_saved) as f:
            outlinks_checked = f.read().splitlines()
    except IOError as e:
        outlinks_checked = []
        
    if outlinks_checked:
        if verbose:
            print('{} saved links from: {}'.format(len(outlinks_checked), title))

    if not outlinks_checked:
        # get data through wiki API
        is_new_page = True

        print('title: {}'.format(title))
        # p is the page_id
        (p, outlinks) = get_outlinks_from_api(title)
        
        (redirects, outlinks_checked) = check_redirects(outlinks)

    return (is_new_page,outlinks_checked)

In [44]:
def list2params(alist):
    chunk_len = 50
    if len(alist) < 1:
        return ''
    s = ['']
    l = 0
    i = 0

    chunks = [alist[x:x+chunk_len]
              for x in range(0, len(alist), chunk_len)
              ]

    return chunks

In [45]:
# Check redirect in Wikipedia articles/links
def check_redirects(titles):

    redirects = {}
    links = {}
    duplicates = 0

    title_lists = list2params(titles)
    for title_list in title_lists:

        params = {'action':'query', 'titles':title_list, 'redirects':1} #, 'pllimit': 500, 'redirects':1}
        request = api.APIRequest(site, params)
        if very_verbose: print ('   ' + 'query: ' + str(params))
        result = request.query()

        if very_verbose: print(result)

        if 'redirects' in result['query']:
            for redir in result['query']['redirects']:
                redirects[redir['from']] = redir['to']

        for page in result['query']['pages']:
            if page != '-1' and 'ns' in result['query']['pages'][page]:
                if result['query']['pages'][page]['ns'] == 0:
                    link = result['query']['pages'][page]['title'].replace(' ', '_')
                    if link in links:
                        duplicates += 1
                    links[link] = page

    missing = len(titles) - (len(links) + duplicates)
    if very_verbose and missing != 0:
        print('%d missing redirects (%d titles,  %d found, %d duplicates)' %(missing, len(titles), len(links), duplicates))
    return redirects,links

In [56]:
import re

# Regular expressions to find URLs in text
linkSimpleP = re.compile(r'\[\[(.+?)[][|{}/#]')
linkGreedyP = re.compile(r'\[\[([^]^[^}^{^#^/^|]+)')
linkP = re.compile(r'\[\[([^]^[^}^{^#^/^]+?)\s*(?:/[^]^[]*?)?\s*(?:\|[^]^[]*?)?(?:\}\})?\s*\]\]')

def parse_text(content):
    links = {}
    rough_links = re.findall(linkP, content) #get all links

    title_lists = list2params(rough_links)

    for title_list in title_lists:

        params = {'action':'query',
                  'titles':title_list,
                  'redirects':1
                  }
        request = api.APIRequest(site, params)
        
        if very_verbose:
            print ('   ' + 'query: ' + str(params))
    
        result = request.query()

        if very_verbose:
            print ('   ' + 'result: ' + str(result))

        if result['query'].get('pages', None):
            for page in result['query']['pages']:
                if page != '-1' and 'ns' in result['query']['pages'][page]: # Filter just useful links. What is ns?
                    if result['query']['pages'][page]['ns'] == 0:
                        link = result['query']['pages'][page]['title'] #.replace(' ', '_')
                        links[link] = 1

    return links

In [57]:
def extract_network(seed_file):
    # list containing all outlinks of seed pages
    links_to_seedlinks = []
    # read the list of seed defined by the user
    seeds_list = load_seed(seed_file)

    for seed in set(seeds_list.keys()):
        print("--> new seed article: {}\n".format(seed))

        (is_new_page, outlinks_checked) = get_outlinks(seed) # get outlink for current seed
        links_to_seedlinks.append(outlinks_checked) # append this outlinks to the list of all seeds' outlinks

        if is_new_page: ## data not already saved in disk, save it
            page_list_filename = seed.replace('/','.') + '_articles.txt'
            page_list_file = codecs.open(os.path.join(link_folder, page_list_filename),
                                         'w', 'utf-8'
                                         )

            ##########
            # update degrees for seed and current target outlink
            ##########
            if seed in first_step.keys():
                first_step[seed]['out_WP'] = len(outlinks_checked)

        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if seed != target:
                net_file.write( seed + '\t' + target  + '\n') # write edge as pair of nodes (source, target)
                if is_new_page:
                    page_list_file.write(target + '\n')
                try:
                    #### 17.01
                    first_step[seed]['out_degree'] += 1
                    #print('seed outdegree ', seed.decode('utf8'), first_step[seed.decode('utf8')]['out_degree'])
                    if(target in seeds_list):
                        first_step[seed]['links_to_seed'] += 1
                    ####
                except:
                    if(target in seeds_list):
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 1,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                    else:
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 0,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                try:
                    first_step[target]['links_from_seed'] += 1
                    first_step[target]['in_degree'] += 1
                    if(target in seeds_list):
                        first_step[target]['seed'] = True
                except:
                    if(target in seeds_list):
                        first_step[target] = {'seed': True,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }
                    else:
                        first_step[target] = {'seed': False,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }

        if is_new_page:
            page_list_file.close()

    #from the list of outlinks keep only those that are no seeds and remove duplicates
    links_to_seedlinks = list(itertools.chain.from_iterable(links_to_seedlinks))
    links_to_seedlinks = list(set(links_to_seedlinks) - set(seeds_list.keys()) )

    ##########
    # Start new iteraction over outlinks of the seed articles
    ##########
    olink_index = 1 #just a counter
    for title in links_to_seedlinks:
        print("\n\n outlink " + str(olink_index) + " of: " + str(len(links_to_seedlinks)) + "\n\n")
        olink_index += 1
        (is_new_page, outlinks_checked) = get_outlinks(title) # get outlinks of current articles

        if is_new_page: ## data not already saved in disk, save it
            page_list_file_title = codecs.open(link_folder + title.replace('/','.') + '_articles.txt', 'w', 'UTF-8')

            ##########
            # update degress of source and target
            ##########
            first_step[title]['out_WP'] = len(outlinks_checked)
        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if is_new_page:
                page_list_file_title.write(target + '\n')
            if target in seeds_list.keys( ) and target != title:
                net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                first_step[title]['links_to_seed'] += 1
                first_step[title]['out_degree'] += 1
                first_step[target]['in_degree'] += 1
            else:
                if target in links_to_seedlinks and target != title:
                    net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                    first_step[title]['out_degree'] += 1
                    first_step[target]['in_degree'] += 1
        # note that I write inside each if because I write only if the target is either a seed or a outlink of a seed (since we
        # only want the first step degree net)
        ##########
        ##########
        if is_new_page:
            page_list_file_title.close()

    net_file.close()
    # Write degrees in a file
    with open(res_folder + '/first_step_degrees_' + seeds_no_ext +'.csv','wb') as f:
        w = csv.writer(f, delimiter ="\t")
        degrees = first_step.values()[0].keys()
        w.writerow(['Page'] + degrees)
        for page in first_step.keys():
            w.writerow([page] + [first_step[page][degree] for degree in degrees])


In [None]:
extract_network(seed_file)

--> new seed article: Amazon_(company)

1 saved links from: Amazon_(company)
--> new seed article: Wealth_concentration

1 saved links from: Wealth_concentration
--> new seed article: Usability

1 saved links from: Usability
--> new seed article: Algorithmic_bias

1 saved links from: Algorithmic_bias
--> new seed article: Network_transparency

title: Network_transparency
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Network_transparency', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['computer network', 'Transparency (human–computer interaction)', 'cloud storage', 'cloud computing', 'X Window System', 'LWN.net', 'centralized database system', 'storage system', 'Distributed Database Management System', 'computer network', 'database management system', 'Internet Layer', 'Internet Layer', 'Data independence', 'Replication transparency', 'Category:Telecommunications', 'Category:Data management'], 'redirects': 1}
   result: {'batchcomplete': 

   result: {'batchcomplete': '', 'query': {'pages': {'16508527': {'pageid': 16508527, 'ns': 0, 'title': 'Digital footprint'}}}}
   query: {'action': 'query', 'titles': ['E-authentication', 'Federated identity', 'Informational self-determination', 'Privacy by design', 'Category:Identity management', 'Category:Identity', 'Category:Digital technology', 'Category:Federated identity', 'Category:Computer access control'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'30841684': {'pageid': 30841684, 'ns': 14, 'title': 'Category:Computer access control'}}}}
   query: {'action': 'query', 'titles': ['Digital_footprint'], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': 'Digital_footprint', 'to': 'Digital footprint'}], 'pages': {'16508527': {'pageid': 16508527, 'ns': 0, 'title': 'Digital footprint'}}}}
--> new seed article: Point-to-multipoint_communication

title: Point-to-multipoint_communication
    query: {'action': 'query', 'prop': 'revisions', 

   result: {'batchcomplete': '', 'query': {'pages': {'18806314': {'pageid': 18806314, 'ns': 14, 'title': 'Category:Articles containing video clips'}}}}
   query: {'action': 'query', 'titles': ['Artificial_intelligence', 'Missile_guidance', 'Salience_(neuroscience)'], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': 'Salience_(neuroscience)', 'to': 'Salience (neuroscience)'}], 'pages': {'4217714': {'pageid': 4217714, 'ns': 0, 'title': 'Salience (neuroscience)'}}}}
2 missing redirects (3 titles,  1 found, 0 duplicates)
--> new seed article: Quantum_network

title: Quantum_network
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Quantum_network', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['quantum computing', 'quantum communication', 'qubits', 'quantum gates', 'qubit', 'quantum computing', 'Computer cluster', 'computer cluster', 'quantum communication', 'qubits', 'internet', 'quantum entanglement', 'quantum key distri

   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'telecommuting', 'to': 'Telecommuting'}], 'pages': {'158555': {'pageid': 158555, 'ns': 0, 'title': 'Telecommuting'}}}}
   query: {'action': 'query', 'titles': ['Plain old telephone service', 'Verizon', 'Northeastern United States', 'Telstra', 'Bharat Sanchar Nigam Limited', 'Reliance Communications', 'Bharti Airtel', 'Life Insurance Corporation of India', 'State Bank of India', 'Nippon Telegraph and Telephone', 'Japan', "FLET's", "FLET's", 'ADSL', 'cable Internet access', 'fiber to the home', 'PSTN', 'IP network', 'United Kingdom', 'BT Group', 'BT Highway', 'universal serial bus', 'France Telecom', 'Private branch exchange', 'Point of sale', 'File:DBP 1988 1368 ISDN.jpg', 'Germany', 'Deutsche Telekom', ':de:Network Termination for ISDN Basic rate Access', 'terminal adapter', 'ADSL', 'VDSL', 'VoIP', 'VDSL2', 'MSAN', 'G.992.3 Annex J', 'OTE', 'Greece', 'Germany', 'PDF', 'Norway', 'Denmark', 'Germany', 'Switzerland', 'Japa

   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'information revolution', 'to': 'Information revolution'}], 'pages': {'2100075': {'pageid': 2100075, 'ns': 0, 'title': 'Information revolution'}}}}
   query: {'action': 'query', 'titles': ['knowledge economy', 'Peter Drucker', 'Peter Drucker', 'Marc Porat', 'Marc Porat', 'OECD', 'Daniel Bell', 'Daniel Bell', 'Alain Touraine', 'Jean-François Lyotard', 'Jean-François Lyotard', 'Radovan Richta', 'Nico Stehr', 'Alvin Toffler', 'network society', 'Manuel Castells', 'Jan Van Dijk', 'Darin Barney', 'Darin Barney', 'Frank Webster (sociologist)', 'cultural capital', 'corporate capitalism', 'Christian Fuchs (sociologist)', 'Peter Glotz', 'Peter Glotz', 'John Bellamy Foster', 'Wolfgang Fritz Haug', 'Manuel Castells', 'Christian Fuchs (sociologist)', 'Nicholas Garnham', 'Nicholas Garnham', 'Antonio Negri', 'Michael Hardt', 'affective labor', 'David Harvey (geographer)', 'structural unemployment', 'social exclusion', 'deregulation',

   query: {'action': 'query', 'titles': ['eavesdropping', 'Signals intelligence', 'Trusted Computing', 'Nikola Tesla', 'radio controlled', 'Madison Square Garden', 'coherer', 'SIGSALY', 'Winston Churchill', 'Franklin D. Roosevelt', 'white noise', 'one-time pad', 'Code', 'Semaphore line', 'Encryption', 'Steganography', 'anonymity', 'Crowds', 'cellphone', 'Internet cafe', 'Anonymous proxies', 'routing', 'traffic analysis', 'trojan horse', 'keystroke logging', 'spyware', 'antivirus', 'firewall (computing)', 'adware', 'spyware', 'Proxomitron', 'Privoxy', 'computer security', 'Encryption', 'Backdoor (computing)', 'Man-in-the-middle attack', 'key size', 'Opportunistic encryption', 'Code talker', 'authentication', 'anonymity', 'eavesdropping', 'Information-theoretic security', 'Steganography', 'plausible deniability', 'Crowds', 'Tor (anonymity network)', 'I2P', 'Mixminion', 'anonymous P2P'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'anonymous P2P', 't

   result: {'batchcomplete': '', 'query': {'pages': {'4536496': {'pageid': 4536496, 'ns': 0, 'title': 'TSIG'}}}}
   query: {'action': 'query', 'titles': ['typeface', 'internationalized domain name', 'ISO 10646', 'phishing', 'forward-confirmed reverse DNS', 'exfiltrate', 'Proxy server', 'Tor (anonymity network)', 'VPN', 'Cloudflare', 'Internet Corporation for Assigned Names and Numbers', 'OpenNIC', 'WHOIS', 'Registration Data Access Protocol', 'ICANN', 'country code top-level domain', 'DENIC', 'Generic top-level domain', 'GoDaddy', 'Directi', 'VeriSign', 'Public Interest Registry', 'Request for Comments', 'Internet Engineering Task Force', 'Internet standard', 'Alternative DNS root', 'Comparison of DNS server software', 'DNS hijacking', 'DNS management software', 'DNS over HTTPS', 'DNS over TLS', 'Hierarchical namespace', 'IPv6 brokenness and DNS whitelisting', 'Multicast DNS', 'Public recursive name server', 'resolv.conf', 'Split-horizon DNS', 'List of DNS record types', 'List of manag

   result: {'batchcomplete': '', 'query': {'pages': {'1397': {'pageid': 1397, 'ns': 0, 'title': 'AOL'}}}}
   query: {'action': 'query', 'titles': ['Google Keep', 'note-taking', 'Boy Genius Report', 'Penske Media Corporation', 'Google Translate', 'Time (magazine)', 'YouTube', 'Mashable', 'Google+', 'Google Allo', 'Google Duo', 'Wired (website)', 'Condé Nast', 'TechCrunch', 'AOL', 'The Verge', 'Vox Media', 'Android (operating system)', 'mobile operating system', 'TechCrunch', 'AOL', 'Android Wear', 'The Verge', 'Vox Media', 'Android TV', 'Android Auto', 'TechCrunch', 'AOL', 'Internet of things', 'Android Things', 'The Verge', 'Vox Media', 'Google Chrome', 'Chrome OS', 'Nexus One', 'TechCrunch', 'AOL', 'Ars Technica', 'Condé Nast', 'Pixel (smartphone)', 'The Verge', 'Vox Media', 'Chromebook', 'Chromecast', 'The Verge', 'Vox Media', 'BBC News', 'BBC', 'Google Cardboard', 'virtual reality'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'virtual reality'

   result: {'batchcomplete': '', 'query': {'pages': {'732792': {'pageid': 732792, 'ns': 14, 'title': 'Category:Internet companies of the United States'}}}}
   query: {'action': 'query', 'titles': ['Category:Internet marketing companies', 'Category:Internet properties established in 1998', 'Category:Mobile phone manufacturers', 'Category:Multinational companies headquartered in the United States', 'Category:Online advertising', 'Category:Technology companies based in the San Francisco Bay Area', 'Category:Technology companies established in 1998', 'Category:University spin-offs', 'Category:Virtual reality companies', 'Category:Web portals', 'Category:Web service providers', 'Category:Webby Award winners', 'Category:World Wide Web', 'Category:Eyewear companies of the United States'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'55663249': {'pageid': 55663249, 'ns': 14, 'title': 'Category:Eyewear companies of the United States'}}}}
   query: {'action': 'query', 't

   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'eSports', 'to': 'ESports'}], 'redirects': [{'from': 'ESports', 'to': 'Esports'}], 'pages': {'564204': {'pageid': 564204, 'ns': 0, 'title': 'Esports'}}}}
   query: {'action': 'query', 'titles': ['NFL', 'Thursday Night Football', '2016 NFL season', 'Bloomberg L.P.', 'BuzzFeed', 'Cheddar (TV channel)', 'WME', 'Live Nation Entertainment', 'Major League Baseball', 'MTV', 'BET (TV channel)', 'MTV Video Music Awards', 'MTV Movie & TV Awards', 'BET Awards', 'NFL Network', 'PGA Tour', "The Players' Tribune", 'Ben Silverman', 'Howard T. Owens', 'The Verge', 'Stadium (sports network)', 'Silver Chalice', 'Sinclair Broadcast Group', 'WNBA', 'Alexa Internet', 'web traffic', 'Alexa Internet', 'Compete.com', 'Compete.com', 'statista.com', 'Jeremiah Owyang', 'The New York Times', 'comScore', "Shaquille O'Neal", 'Britney Spears', 'Ashton Kutcher', 'comScore', 'Sysomos', 'Sysomos', 'Gawker', 'Univision Communications', 'Business Insider'

   result: {'batchcomplete': '', 'query': {'pages': {'42556315': {'pageid': 42556315, 'ns': 0, 'title': 'Grabyo'}}}}
   query: {'action': 'query', 'titles': ['Facebook', 'Katy Perry', 'Justin Bieber', 'Barack Obama on social media', 'Barack Obama', 'U.S. President', 'Rihanna', 'Taylor Swift', 'Lady Gaga', 'Ellen DeGeneres', 'Cristiano Ronaldo', 'Association football', 'YouTube', 'Online video platform', 'Justin Timberlake', 'Jack Dorsey', 'Biz Stone', 'Noah Glass (Twitter)', 'selfie', '86th Academy Awards', 'Ellen DeGeneres', 'BBC News', 'Meryl Streep', 'Jared Leto', 'Jennifer Lawrence', 'Meryl Streep', 'Ellen DeGeneres', 'Bradley Cooper', "Peter Nyong'o Jr.", 'Channing Tatum', 'Julia Roberts', 'Kevin Spacey', 'Brad Pitt', "Lupita Nyong'o", 'Angelina Jolie', 'Lego', 'Matt Groening', 'The Simpsons', 'Barack Obama', 'United States presidential election, 2012', 'BBC News', 'Guinness World Records', 'Robert Downey Jr.', 'Guinness World Records', 'Caitlyn Jenner', 'Studio Ghibli', 'Castle i

   result: {'batchcomplete': '', 'query': {'pages': {'24877': {'pageid': 24877, 'ns': 0, 'title': "Pell's equation"}}}}
   query: {'action': 'query', 'titles': ["Grover's algorithm", "Grover's algorithm", "Grover's algorithm", "Grover's algorithm", 'Boolean satisfiability problem', 'Password cracking', 'encryption', 'Symmetric-key algorithm', 'Triple DES', 'Advanced Encryption Standard', 'Quantum simulator', 'collider', 'Adiabatic quantum computation', 'Quantum algorithm for linear systems of equations', 'John Preskill', 'quantum supremacy', 'Google', 'IBM', 'holographic principle', 'Born rule', 'David P. DiVincenzo', "DiVincenzo's criteria", 'decoherence', 'quantum decoherence', 'Nuclear magnetic resonance', 'MRI', 'pulse shaping', 'Quantum threshold theorem', 'topological quantum computer', 'anyon', 'braid theory', 'New Scientist', 'quantum circuit', 'quantum gate', 'One-way quantum computer', 'cluster state', 'Adiabatic quantum computation', 'quantum annealing', 'Hamiltonian (quantu

   result: {'batchcomplete': '', 'query': {'pages': {'171166': {'pageid': 171166, 'ns': 0, 'title': 'Nepal'}}}}
   query: {'action': 'query', 'titles': ['bitly', 'VentureBeat', 'VentureBeat', 'Google Trends', 'real-time web', 'Twitter', 'Facebook', 'Clixtr', 'Pixable', 'Mashable', 'Pinterest', 'cloud computing', 'Application software', 'LinkedIn', 'Monster.com', 'Foursquare (service)', 'Gowalla', 'GPS', 'iPhone', 'Yelp, Inc.', 'lead generation', 'Pew Research', 'Reuters', 'Fishbrain', 'fishing', 'Strava', 'cycling', 'Academia.edu', 'LinkedIn', 'Facebook', 'ResearchGate', 'European Southern Observatory', 'Digital Natives', 'Social networking sites', 'LinkedIn', 'Twitter', 'social capital', 'English, baby!', 'LiveMocha', 'Web 2.0', 'learning network', 'digital literacy', 'Tweeting', 'instant messaging', 'blogging', 'Henry Jenkins', 'participatory culture', 'James Gee', 'affinity spaces', 'Cyberbullying'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'32492747': {'

   result: {'batchcomplete': '', 'query': {'interwiki': [{'title': 'de:Ad-hoc-Netz', 'iw': 'de'}]}}
   query: {'action': 'query', 'titles': ['Order_One_Network_Protocol'], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': 'Order_One_Network_Protocol', 'to': 'Order One Network Protocol'}], 'pages': {'1807926': {'pageid': 1807926, 'ns': 0, 'title': 'Order One Network Protocol'}}}}
--> new seed article: EAP

title: EAP
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'EAP', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['Edgar Allan Poe', 'Literary criticism', 'Bureau of East Asian and Pacific Affairs', 'E.A. Patras', 'ESCP Europe', 'European Workers Party', 'European Association for Psychotherapy', 'University of California Education Abroad Program', 'Eastern Partnership', 'ICC East Asia-Pacific', 'Engineers Against Poverty', 'British Aerospace EAP', 'Electroactive polymers', 'Enterprise Architecture Planning', 'JBoss Ente

   query: {'action': 'query', 'titles': ['Standardization', 'standards organization', 'Commonwealth of Independent States', 'Federal Agency on Technical Regulating and Metrology', 'File:GOST coverpage.jpg', 'Soviet Union', 'Russian language', 'Gosstandart', 'World War II', 'standards organization', 'Commonwealth of Independent States', 'Commonwealth of Independent States', 'Russia', 'Belarus', 'Moldova', 'Kazakhstan', 'Azerbaijan', 'Armenia', 'Kyrgyzstan', 'Uzbekistan', 'Tajikistan', 'Georgia (country)', 'Turkmenistan', 'Interstate Council for Standardization, Metrology and Certification', 'International Organization for Standardization', 'Russian National Standards', 'Ukraine', 'UNIAN', "People's Commissariat", 'Soviet Union', 'Council of Ministers (Soviet Union)', 'File:Знак соответствия.svg', 'GOST 7.67', 'Country code', 'GOST 5284-84', 'Tushonka', 'Russian tube designations', 'GOST 7396', 'Russia', 'Commonwealth of Independent States', 'GOST 10859', 'ASCII', 'Unicode', 'ALGOL', 'Ro

   result: {'batchcomplete': '', 'query': {'pages': {'153059': {'pageid': 153059, 'ns': 0, 'title': 'Henry Cabot Lodge'}}}}
   query: {'action': 'query', 'titles': ['Darius I', 'Ancient Greece', 'Henry Cabot Lodge', 'Rowman & Littlefield', 'free market', 'East Province, Rwanda', 'new public management', 'Organisation for Economic Co-operation and Development', 'Power (sociology)', 'Pluralism (political philosophy)', 'representative government', 'citizens', 'constitution', 'Statute', 'Political party', 'legislature', 'advocacy groups', 'Central government', 'Conservatism', 'Gridlock (politics)', 'Political parties of minorities', 'Parliamentary system', 'Labour Party (UK)', 'Scottish Parliament', 'Scottish National Party', 'participatory democracy', 'globalization', 'Democracy and Political Ignorance', 'George Mason University', 'Ilya Somin', 'Federalism', 'foot voting', 'Great Migration (African American)', 'Jim Crow laws', 'European Union', 'subsidiarity', 'Centre Party (Finland)', 'C

   query: {'action': 'query', 'titles': ['-ise vs -ize', 'foreign relations', 'world', 'transportation', 'communication', 'trade', 'idea', 'culture', 'wiktionary:conflict', 'diplomacy', 'history of globalization', 'steam locomotive', 'steamship', 'jet engine', 'container ship', 'transport', 'telegraph', 'Internet', 'mobile phone', 'telecommunication', 'interdependence', 'Economy', 'History of globalization', 'modernity', 'Europe', 'Age of Discovery', 'New World', 'International Monetary Fund', 'trade', 'Financial transaction', 'capital (economics)', 'investment', 'Human migration', 'global warming', 'water pollution', 'air pollution', 'overfishing', 'business', 'Employment', 'natural environment', 'economic globalization', 'cultural globalization', 'political globalization', 'empire', 'Asia', 'Indian Ocean', 'Sociology', 'Martin Albrow', 'Anthony Giddens', 'social relation'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'social relation', 'to': 'So

   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'employment', 'to': 'Employment'}], 'pages': {'314993': {'pageid': 314993, 'ns': 0, 'title': 'Employment'}}}}
   query: {'action': 'query', 'titles': ['migrant worker', 'foreign worker', 'International Labour Organization', 'freedom of movement for workers in the European Union', 'London Youth Games', 'London', 'Financial Times', 'international education', 'international student', 'foreign student', 'immigration', 'transnational marriage', 'marriage', 'citizenship', 'mail', 'electric telegraph', 'transatlantic telegraph cable', 'Internet', 'Facebook', 'social networking service', 'List of virtual communities with more than 100 million active users', 'A.T. Kearney', 'Foreign Policy', 'Ireland', 'Belgium', 'Netherlands', 'Austria', 'Singapore', 'Denmark', 'Sweden', 'Portugal', 'Hungary', 'Finland', 'Singapore', 'Switzerland', 'United States', 'Ireland', 'Denmark', 'Canada', 'Netherlands', 'Australia', 'Austria', 'Sweden',

{'batchcomplete': '', 'query': {'normalized': [{'from': 'Energy_conservation', 'to': 'Energy conservation'}], 'pages': {'478933': {'pageid': 478933, 'ns': 0, 'title': 'Energy conservation'}}}}
11 missing redirects (12 titles,  1 found, 0 duplicates)
--> new seed article: Resilience_(network)

title: Resilience_(network)
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Resilience_(network)', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['computer network', 'Service (systems architecture)', 'Fault (technology)', 'normal operation', 'distributed processing', 'Cloud storage', 'video conferencing', 'instant messaging', 'online collaboration', 'survivability', 'Category:Computer networks'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'737003': {'pageid': 737003, 'ns': 14, 'title': 'Category:Computer networks'}}}}
--> new seed article: Virtual_community

title: Virtual_community
    query: {'action': 'query', 'prop': 'revi

   result: {'batchcomplete': '', 'query': {'pages': {'20598752': {'pageid': 20598752, 'ns': 0, 'title': 'Streisand effect'}}}}
   query: {'action': 'query', 'titles': ['Tiziana Cantone', 'Google Spain v AEPD and Mario Costeja González', 'Category:Human rights', 'Category:Internet and the European Union', 'Category:Internet privacy', 'Category:Freedom of expression', 'Category:Privacy controversies and disputes'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'39072007': {'pageid': 39072007, 'ns': 14, 'title': 'Category:Privacy controversies and disputes'}}}}
   query: {'action': 'query', 'titles': ['Jennifer_Granick', 'Yale_Law_Journal', 'Streisand_effect'], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': 'Streisand_effect', 'to': 'Streisand effect'}], 'pages': {'20598752': {'pageid': 20598752, 'ns': 0, 'title': 'Streisand effect'}}}}
2 missing redirects (3 titles,  1 found, 0 duplicates)
--> new seed article: Digital_citizen

title: Digit

   result: {'batchcomplete': '', 'query': {'pages': {'288400': {'pageid': 288400, 'ns': 0, 'title': 'Provenance'}}}}
   query: {'action': 'query', 'titles': ['Public-key cryptography', 'RADIUS', 'Reliance authentication', 'Secret sharing', 'Secure Remote Password protocol', 'Secure Shell', 'Security printing', 'SQRL', 'Strong authentication', 'Tamper-evident technology', 'TCP Wrapper', 'Time-based authentication', 'Two-factor authentication', 'Usability of web authentication systems', 'Woo–Lam', 'Wikipedia:External links', 'Wikipedia:Spam', 'National Institute of Standards and Technology', 'U.S. Department of Commerce', 'Category:Authentication', 'Category:Applications of cryptography', 'Category:Access control', 'Category:Packaging', 'Category:Notary', 'Category:Computer access control'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'30841684': {'pageid': 30841684, 'ns': 14, 'title': 'Category:Computer access control'}}}}
   query: {'action': 'query', 'titles':

   result: {'batchcomplete': '', 'query': {'pages': {'636268': {'pageid': 636268, 'ns': 0, 'title': 'Botnet'}}}}
   query: {'action': 'query', 'titles': ['malware', 'Computer and network surveillance', 'Telephone tapping', 'Fiber tapping', 'Port scan', 'Idle scan', 'Keystroke logging', 'Screen scraping', 'Backdoor (computing)', 'Denial-of-service attack', 'Spoofing attack', 'Man-in-the-middle attack', 'Man-in-the-browser', 'ARP poisoning', 'Ping flood', 'Ping of death', 'Smurf attack', 'Buffer overflow', 'Heap overflow', 'Stack overflow', 'Format string attack', 'kill chain', 'cyberspace', 'Election Commission of Pakistan', 'cyber security', 'East–West dichotomy', 'Akamai Technologies', "People's Liberation Army", 'cyberwarfare', 'United States', 'Ted Koppel', '2018 FIFA World Cup', ':Category:Computer security software companies', 'contingency plan', 'Common Vulnerabilities and Exposures', 'Computer emergency response team', 'social engineering (security)', 'Hacker (computer security)

   result: {'batchcomplete': '', 'query': {'pages': {'40988836': {'pageid': 40988836, 'ns': 0, 'title': 'IEEE 802.11ai'}}}}
   query: {'action': 'query', 'titles': ['IEEE 802.11aj', 'IEEE 802.11ax', 'IEEE 802.11mc', 'IEEE 802.11a', 'IEEE 802.11b', 'TU (Time Unit)', 'IEEE 802.1', 'cable modem', 'Digital Subscriber Line', 'Internet', 'wireless community network', 'University of California, Berkeley', '802.11', 'Wired Equivalent Privacy', 'Fluhrer, Mantin and Shamir attack', 'RC4', 'AT&T Corporation', 'Media Access Control', 'Wi-Fi Alliance', 'Wi-Fi Protected Access', 'Wi-Fi Protected Access', 'IEEE 802.11i', 'WPA2', 'Advanced Encryption Standard', 'RC4', 'RADIUS', 'EAP-TLS', 'IEEE 802.11w-2009', 'Wi-Fi Protected Setup', 'United States Computer Emergency Readiness Team', 'Apple Inc.', 'iOS', 'People counter', 'Comparison of wireless data standards', 'Fujitsu Ltd. v. Netgear Inc.', 'Gi-Fi', 'LTE-WLAN Aggregation', 'OFDM system comparison table', 'TU (time unit)', 'TV White Space Database',

   result: {'batchcomplete': '', 'query': {'normalized': [{'from': 'market abuse', 'to': 'Market abuse'}], 'pages': {'4141703': {'pageid': 4141703, 'ns': 0, 'title': 'Market abuse'}}}}
   query: {'action': 'query', 'titles': ['conspicuous consumption', 'social status', 'social stratification', 'social status', 'Adam Smith', 'The Theory of Moral Sentiments', 's:The Theory of Moral Sentiments', 'Juliet Schor', 'Robert H. Frank', 'Stavros Niarchos', 'Aristotle Onassis', 'World Bank', 'International Monetary Fund', 'Oded Galor', 'market imperfection', 'Robert Barro', 'International Monetary Fund', 'Joseph Stiglitz', 'aggregate demand', 'Branko Milanovic', 'Kuznets curve', 'Thomas Piketty', 'Capital in the Twenty-First Century', 'Harvard University Press', 'Walter Scheidel', 'Princeton University Press', 'Organisation for Economic Co-operation and Development', 'Branko Milanovic', 'European sovereign-debt crisis', 'Human development (humanity)', 'poverty reduction', 'Millennium Development 

   result: {'batchcomplete': '', 'query': {'redirects': [{'from': 'Oracle v. Google', 'to': 'Oracle America, Inc. v. Google, Inc.'}], 'pages': {'36012401': {'pageid': 36012401, 'ns': 0, 'title': 'Oracle America, Inc. v. Google, Inc.'}}}}
   query: {'action': 'query', 'titles': ['copyrighted', 'Wired (magazine)', 'fair use', 'United States Court of Appeals for the Federal Circuit', 'Bloomberg Businessweek', 'Advanced SCSI programming interface', 'SCSI', 'Cocoa (API)', 'Carbon (API)', 'Macintosh', 'DirectX', 'Microsoft Windows', 'EHLLAPI', 'List of Java APIs', 'Open Database Connectivity', 'Microsoft Windows', 'OpenAL', 'OpenCL', 'OpenGL', 'OpenMP', 'Server Application Programming Interface', 'Simple DirectMedia Layer', 'API testing', 'API writer', 'Calling convention', 'Comparison of application virtual machines', 'Common Object Request Broker Architecture', 'Document Object Model', 'Double-chance function', 'Foreign function interface', 'Interface (computing)', 'Interface control docum

   query: {'action': 'query', 'titles': ['United States', 'freedom of information', 'Journal of Alternative Perspectives in the Social Sciences', 'privacy', 'data protection', 'Legal burden of proof', 'UNESCO', 'Access to Information Day', 'UN General Assembly', 'Albania', 'Australia', 'Freedom of Information Act 1982', 'Azerbaijan', 'Caretaker Government of Bangladesh', 'Right to Information Act', 'Access to information in Bangladesh', 'Vision 2021', 'Election promise', 'Bangladesh Awami League', 'Bangladeshi general election, 2008', 'Belize', 'Brazil', 'Bulgaria', 'Canada', 'Access to Information Act', 'Pierre Trudeau', 'Information Commissioner of Canada', 'Privacy Act (Canada)', 'privacy', 'Crown copyright', 'Privacy Commissioner of Canada', 'personal information', 'Privacy Act (Canada)', 'Access to Information Act', 'Alberta', 'Manitoba', 'Freedom of Information and Protection of Privacy Act (Nova Scotia)', 'Ontario', 'Saskatchewan', 'Quebec', 'Coordination of Access to Informatio

{'batchcomplete': '', 'query': {'normalized': [{'from': 'Chord_(peer-to-peer)', 'to': 'Chord (peer-to-peer)'}], 'pages': {'322132': {'pageid': 322132, 'ns': 0, 'title': 'Chord (peer-to-peer)'}}}}
--> new seed article: Robotic_process_automation

title: Robotic_process_automation
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Robotic_process_automation', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['WP:Disruptive editing', 'business process automation', 'software', 'artificial intelligence', 'workflow', 'automation', 'software developer', 'application programming interfaces', 'scripting language', 'graphical user interface', 'graphical user interface testing', 'programming by demonstration', 'email', 'bookkeeping', 'screen scraping', 'Xchanging', 'virtual environment', 'virtualization', 'Harvard Business Review', 'layoff', 'Business Process Outsourcing', 'University of Oxford', 'TEDx', 'University College London', 'Category:Business softw

   result: {'batchcomplete': '', 'query': {'pages': {'10429397': {'pageid': 10429397, 'ns': 14, 'title': 'Category:Unix variants'}}}}
   query: {'action': 'query', 'titles': ['Prentice_Hall', 'Twm', "Samizdat:_And_Other_Issues_Regarding_the_'Source'_of_Open_Source_Code"], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': "Samizdat:_And_Other_Issues_Regarding_the_'Source'_of_Open_Source_Code", 'to': "Samizdat: And Other Issues Regarding the 'Source' of Open Source Code"}], 'pages': {'976081': {'pageid': 976081, 'ns': 0, 'title': "Samizdat: And Other Issues Regarding the 'Source' of Open Source Code"}}}}
2 missing redirects (3 titles,  1 found, 0 duplicates)
--> new seed article: 5G

title: 5G
    query: {'action': 'query', 'prop': 'revisions', 'titles': '5G', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['File:5th generation mobile network (5G) logo.jpg', 'List of mobile phone generations', '4G', '3G', '2G', 'ITU', 'gigabit', '3GPP',

   result: {'batchcomplete': '', 'query': {'pages': {'42627538': {'pageid': 42627538, 'ns': 0, 'title': 'Database Workbench'}}}}
   query: {'action': 'query', 'titles': ['DBeaver', 'DBEdit', 'HeidiSQL', 'LibreOffice', 'Microsoft Access', 'PostgreSQL', 'Navicat', 'OpenOffice.org Base', 'phpMyAdmin', ' Sequel Pro', 'Sourceforge', 'SQLBuddy', 'SQLyog', 'syntax highlighting', 'TOAD (software)', 'Webmin', 'command-line interface', 'command line', 'Perl', 'Linux', 'CentOS', 'Debian', 'Fedora (operating system)', 'Ubuntu (operating system)', 'programming language', 'application programming interface', 'library (computing)', 'Visual Studio', 'C Sharp (programming language)', 'Visual Basic', 'ODBC', 'MySQL Connector', 'Active Server Pages', 'Adobe ColdFusion', 'HTSQL', 'Uniform resource locator', 'Fork (software development)', 'MariaDB', 'Application programming interface', 'MySQL Workbench', 'XtraDB', 'InnoDB', 'Aria (storage engine)', 'Percona Server', 'Percona', 'XtraDB', 'InnoDB', 'Drizzle 

   result: {'batchcomplete': '', 'query': {'pages': {'37545': {'pageid': 37545, 'ns': 0, 'title': 'Palm OS'}}}}
   query: {'action': 'query', 'titles': ['Handspring (company)', 'Treo 180', 'Palm OS', 'Treo 180g', 'Graffiti (Palm OS)', 'NTT DoCoMo', 'i-mode', 'mobile internet', 'cHTML', 'HTML', 'mobile payment', 'near-field communication', '1seg', 'mobile television', 'BlackBerry', 'Danger Hiptop', 'T-Mobile', 'Windows Mobile', 'BlackBerry', 'Research In Motion', 'Symbian', 'Psion (company)', 'Europe', 'Nokia Eseries', 'Nokia Nseries', 'iPhone (1st generation)', 'T9 (predictive text)', 'QWERTY', 'virtual keyboard', 'Handwriting recognition', 'Apple Computer', 'iPhone (original)', 'capacitive sensing', 'multi-touch', 'Wireless Application Protocol', 'iTunes', 'Google Maps', '3G', 'Apple Store', 'iPhone 3G', 'App Store (iOS)', 'mobile app', 'Android (operating system)', 'Engadget', 'HTC Dream', 'CNET', 'CBS Interactive', 'Form factor (mobile phones)', 'Microsoft', 'Windows Phone'], 'redir

   result: {'batchcomplete': '', 'query': {'pages': {'8712750': {'pageid': 8712750, 'ns': 0, 'title': 'E-reader'}}}}
   query: {'action': 'query', 'titles': ['Fossbytes', 'Linux Magazine', 'OSNews', 'Category:Smartphones', 'Category:Cloud clients', 'Category:Consumer electronics', 'Category:Information appliances', 'Category:Mobile computers', 'Category:Personal computing', 'Category:Portable computers'], 'redirects': 1}
   result: {'batchcomplete': '', 'query': {'pages': {'1192465': {'pageid': 1192465, 'ns': 14, 'title': 'Category:Portable computers'}}}}
   query: {'action': 'query', 'titles': ['IBM_Simon', 'Palm_OS', 'Windows_Phone', 'Rollable_display', 'Infrared_blaster', 'Ubuntu', 'Xbox_Live', 'Safaricom', 'Nokia', 'E-reader'], 'redirects': 1}
{'batchcomplete': '', 'query': {'pages': {'8712750': {'pageid': 8712750, 'ns': 0, 'title': 'E-reader'}}}}
9 missing redirects (10 titles,  1 found, 0 duplicates)
--> new seed article: Carrier_Ethernet

title: Carrier_Ethernet
    query: {'act

In [52]:
api.APIRequest(site, params)

NameError: name 'params' is not defined