# Extract a network of pages from Wikipedia

In the following example we query MediaWiki's [API](https://www.mediawiki.org/wiki/API:Main_page) to build a network starting from a list of "seed" pages.

In [17]:
# the wikipedia edition we will work ok {lang}.wikipedia.org
lang = 'en'

Some options to regulate the verbosity of the output, ignore them for the moment.

In [18]:
verbose = True
very_verbose = True
is_new_page = False

Our list of seed articles is containe in `privacy_seed_keywords.txt` in the current directory.
We can look at it with `cat`.

In [19]:
! cat privacy_seed_keywords.txt

cat: privacy_seed_keywords.txt: No such file or directory


In [20]:
import os
import pathlib

# the seed file is ./privacy_seed_keywords.txt
seed_file = pathlib.Path('./privacy_seed.txt')

# get the seed file without extension
seed_filename_noext = os.path.splitext(seed_file.name)[0]

We now set up the directory structure where we will save the results from the API, it will look like this:
```
data
├── links
│   ├── articles.txt
│   └── ...
└── results
    ├── extract_network.log
    ├── <seed_name>.csv
    └── network_<seed_name>.csv
```

In [21]:
# directory containing the seed articles (./data)
data_folder = pathlib.Path('data')
# directory containing the outlinks of each article (./data/links)
link_folder = data_folder/'links'
# directory containing the resulting networks (./data/results)
results_folder = data_folder/'results'

os.makedirs(link_folder, exist_ok=True)
os.makedirs(results_folder, exist_ok=True)

In [22]:
# open the log file ./data/results/extract_network.log
log_filename = 'extract_network.log'
log_file = (results_folder/log_filename).open('a')

# otpen the file containing the network ./data/results/<seed>.csv
net_filename = 'network_' + seed_filename_noext + '.csv'
net_file = (results_folder/net_filename).open('w')

We read our seed file in a dictionary, ignoring empty lines and comments (lines starting with #)

In [23]:
def load_seed(file_name):
    """
    Load the seed file, ignoring empty lines and comments (lines starting with #)
    """
    dic = {}
    infile = pathlib.Path(file_name).open('r')
    for line in infile:
        el = line.strip('\n')
        if el and el[0] != '#':
            el = el.replace(' ', '_')
            dic[el] = 1
    return dic

In [24]:
load_seed(seed_file)

{'Internet_privacy': 1,
 'Internet_security': 1,
 'General_Data_Protection_Regulation': 1,
 'Right_to_be_forgotten': 1}

In [25]:
from wikitools import wiki
from wikitools import api

# create a Wiki object
site = wiki.Wiki("https://{lang}.wikipedia.org/w/api.php".format(lang=lang))

# get outlinks of a wikipedia article through wiki api
def get_outlinks_from_api(title):
    p_id = -1
    outlinks = []

    if title == '' or title == ' ':
        return p_id, outlinks

    params = {'action':'query',
              'prop':'revisions',
              'titles': title,
              'rvprop':'content',
              'redirects':1
              }
    request = api.APIRequest(site, params)

    if very_verbose:
        print('    ' + 'query: ' + str(params))

    result = request.query()
    if int(list(result['query']['pages'].keys())[0]) < 1:
        print('ARTICLE NOT FOUND: {}'.format(title))
        log.write('{}\n'.format(title))
        return (p_id, outlinks)

    else:
        outlinks = []
        p_id = list(result['query']['pages'].keys())[0]
        rev = result['query']['pages'][p_id]['revisions'][0]
        content = rev['*']
        links = parse_text(content)
        for l in links:
            target = l.replace(' ', '_')
            outlinks.append(target)
        return (p_id, outlinks)


In [26]:
def get_outlinks(title):
    outlinks_filename = '{}_articles.txt'.format(title.replace('/','.'))
    outlinks_saved = link_folder/outlinks_filename

    # data saved in the disc
    is_new_page = False
    try:
        with outlinks_saved.open('a') as f:
            outlinks_checked = f.read().splitlines()
    except IOError as e:
        outlinks_checked = []
        
    if outlinks_checked:
        if verbose:
            print('{} saved links from: {}'.format(len(outlinks_checked), title))

    if not outlinks_checked:
        # get data through wiki API
        is_new_page = True

        print('title: {}'.format(title))
        # p is the page_id
        (p, outlinks) = get_outlinks_from_api(title)
        
        (redirects, outlinks_checked) = check_redirects(outlinks)

    return (is_new_page,outlinks_checked)

In [27]:
def split_param_list(alist):
    chunk_len = 50
    if len(alist) < 1:
        return ''
    s = ['']
    l = 0
    i = 0

    chunks = [alist[x:x+chunk_len]
              for x in range(0, len(alist), chunk_len)
              ]

    return chunks

In [28]:
# Check redirect in Wikipedia articles/links
def check_redirects(titles):

    redirects = {}
    links = {}
    duplicates = 0

    title_lists = split_param_list(titles)
    for title_list in title_lists:

        params = {'action':'query', 'titles':title_list, 'redirects':1} #, 'pllimit': 500, 'redirects':1}
        request = api.APIRequest(site, params)
        if very_verbose: print ('   ' + 'query: ' + str(params))
        result = request.query()

        if very_verbose: print(result)

        if 'redirects' in result['query']:
            for redir in result['query']['redirects']:
                redirects[redir['from']] = redir['to']

        for page in result['query']['pages']:
            if page != '-1' and 'ns' in result['query']['pages'][page]:
                if result['query']['pages'][page]['ns'] == 0:
                    link = result['query']['pages'][page]['title'].replace(' ', '_')
                    if link in links:
                        duplicates += 1
                    links[link] = page

    missing = len(titles) - (len(links) + duplicates)
    if very_verbose and missing != 0:
        print('%d missing redirects (%d titles,  %d found, %d duplicates)' %(missing, len(titles), len(links), duplicates))
    return redirects,links

In [29]:
import re

# Regular expressions to find URLs in text
linkSimpleP = re.compile(r'\[\[(.+?)[][|{}/#]')
linkGreedyP = re.compile(r'\[\[([^]^[^}^{^#^/^|]+)')
linkP = re.compile(r'\[\[([^]^[^}^{^#^/^]+?)\s*(?:/[^]^[]*?)?\s*(?:\|[^]^[]*?)?(?:\}\})?\s*\]\]')

def parse_text(content):
    links = {}
    rough_links = re.findall(linkP, content) #get all links

    title_lists = split_param_list(rough_links)

    for title_list in title_lists:

        params = {'action': 'query',
                  'titles': title_list,
                  'redirects': 1
                  }
        request = api.APIRequest(site, params)
        
        if very_verbose:
            print ('   ' + 'query: ' + str(params))
    
        result = request.query()

        if very_verbose:
            print ('   ' + 'result: ' + str(result))

        # check that we have received pages as result
        if result['query'].get('pages', None):
            for page in result['query']['pages']:
                if page != '-1' and ('ns' in result['query']['pages'][page]):
                    # Filter just useful links, articles are in ns == 0
                    if result['query']['pages'][page]['ns'] == 0:
                        link = result['query']['pages'][page]['title']
                        links[link] = 1

    return links

In [30]:
import itertools
import csv

csv_fields = [
'seed', 
'links_from_seed',
'links_to_seed',
'in_degree',
'out_degree',
'out_WP'
]

def extract_network(seed_file):

    # list containing all outlinks of seed pages
    links_to_seedlinks = []

    # read the list of seed defined by the user
    seeds_list = load_seed(seed_file)

    first_step = {}

    for seed in set(seeds_list.keys()):
        print("--> new seed article: {}\n".format(seed))

        (is_new_page, outlinks_checked) = get_outlinks(seed) # get outlink for current seed
        links_to_seedlinks.append(outlinks_checked) # append this outlinks to the list of all seeds' outlinks

        if is_new_page: ## data not already saved in disk, save it
            page_list_filename = seed.replace('/','.') + '_articles.txt'
            page_list_file = (link_folder/page_list_filename).open('a+')

            ##########
            # update degrees for seed and current target outlink
            ##########
            if seed in first_step.keys():
                first_step[seed]['out_WP'] = len(outlinks_checked)

        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if seed != target:
                # write edge as pair of nodes (source, target)
                net_file.write(seed + '\t' + target  + '\n')
                if is_new_page:
                    page_list_file.write(target + '\n')
                try:
                    first_step[seed]['out_degree'] += 1
                    if(target in seeds_list):
                        first_step[seed]['links_to_seed'] += 1
                except:
                    if(target in seeds_list):
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 1,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                    else:
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 0,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                try:
                    first_step[target]['links_from_seed'] += 1
                    first_step[target]['in_degree'] += 1
                    if(target in seeds_list):
                        first_step[target]['seed'] = True
                except:
                    if(target in seeds_list):
                        first_step[target] = {'seed': True,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }
                    else:
                        first_step[target] = {'seed': False,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }

        if is_new_page:
            page_list_file.close()

    # from the list of outlinks keep only those that are no seeds and remove duplicates
    links_to_seedlinks = list(itertools.chain.from_iterable(links_to_seedlinks))
    links_to_seedlinks = list(set(links_to_seedlinks) - set(seeds_list.keys()) )

    ##########
    # Start new iteraction over outlinks of the seed articles
    ##########
    olink_index = 1 #just a counter
    for title in links_to_seedlinks:
        print("\n\n outlink " + str(olink_index) + " of: " + str(len(links_to_seedlinks)) + "\n\n")
        olink_index += 1
        (is_new_page, outlinks_checked) = get_outlinks(title) # get outlinks of current articles

        if is_new_page: ## data not already saved in disk, save it
            page_list_file_title_name = title.replace('/','.') + '_articles.txt'
            page_list_file_title = (link_folder/page_list_file_title_name).open('w')

            ##########
            # update degress of source and target
            ##########
            first_step[title]['out_WP'] = len(outlinks_checked)
        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if is_new_page:
                page_list_file_title.write(target + '\n')
            if target in seeds_list.keys( ) and target != title:
                net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                first_step[title]['links_to_seed'] += 1
                first_step[title]['out_degree'] += 1
                first_step[target]['in_degree'] += 1
            else:
                if target in links_to_seedlinks and target != title:
                    net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                    first_step[title]['out_degree'] += 1
                    first_step[target]['in_degree'] += 1
        # note that I write inside each if because I write only if the target is either a seed or a outlink of a seed (since we
        # only want the first step degree net)
        ##########
        ##########
        if is_new_page:
            page_list_file_title.close()

    net_file.close()
    # Write degrees in a file
    output_filename = 'first_step_degrees_{}.csv'.format(seed_filename_noext)
    with (results_folder/output_filename).open('w') as outfile:
        writer = csv.writer(outfile, delimiter ="\t")
        writer.writerow(['Page'] + csv_fields)
        for page in first_step.keys():
            writer.writerow([page] + [first_step[page][degree] for degree in csv_fields])


In [31]:
extract_network(seed_file)

--> new seed article: Right_to_be_forgotten

title: Right_to_be_forgotten
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Right_to_be_forgotten', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['European Union', 'Argentina', 'stigmatize', 'human rights', 'freedom of expression', 'right to privacy', 'Internet', 'censorship', 'historical revisionism (negationism)', 'revenge porn', 'search engine', 'Rehabilitation of Offenders Act 1974', 'First Amendment to the United States Constitution', 'Google v González', 'European Union', 'Data Protection Directive', 'human rights law', 'General Data Protection Regulation', 'revenge porn', 'right to privacy', 'jurisdiction', 'European Commission', 'General Data Protection Regulation', 'Google', 'Data Protection Directive', 'YouTube', 'Google Groups', 'Twitter', 'European Court of Justice', 'Google', 'Google v González', 'Spanish Agency of data protection', 'Audiencia Nacional (Spain)', 'European Court of 

   query: {'action': 'query', 'titles': ['European Parliament', 'Council of the European Union', 'Data Protection Directive', 'regulation (European Union)', 'EU law', 'data protection', 'European Union', 'European Economic Area', 'international business', 'Data Protection Directive', 'personal data', 'pseudonymization', 'Data anonymization', 'consent', 'Data portability', 'data breach', 'directive (European Union)', 'Cloud computing', 'European Commission', 'Social networking service', 'IP address', 'mutual legal assistance treaty', 'one-stop shop', 'European Data Protection Board', 'Article 29 Data Protection Working Party', 'Charter of Fundamental Rights of the European Union', 'pseudonymization', 'privacy policy', 'algorithm', 'Data portability', 'algorithm', 'national data protection authority', 'European Union Agency for Network and Information Security', 'pseudonymisation', 'data anonymisation', 'Encryption software', 'Encryption', 'tokenization (data security)', 'data at rest', 

   result: {'batchcomplete': '', 'query': {'pages': {'54109227': {'pageid': 54109227, 'ns': 14, 'title': 'Category:Terms of service'}}}}
   query: {'action': 'query', 'titles': ['Digital_camera', 'Web_navigation', 'Jeff_Flake'], 'redirects': 1}
{'batchcomplete': '', 'query': {'normalized': [{'from': 'Jeff_Flake', 'to': 'Jeff Flake'}], 'pages': {'407926': {'pageid': 407926, 'ns': 0, 'title': 'Jeff Flake'}}}}
2 missing redirects (3 titles,  1 found, 0 duplicates)


 outlink 1 of: 4


title: Streisand_effect
    query: {'action': 'query', 'prop': 'revisions', 'titles': 'Streisand_effect', 'rvprop': 'content', 'redirects': 1}
   query: {'action': 'query', 'titles': ['Malibu, California', 'unintended consequences', 'Internet', 'Reactance (psychology)', 'Barbra Streisand', 'Malibu, California', 'cease and desist', 'illegal number', 'Mirror website', 'File sharing', 'London Free Press', 'Red Herring (magazine)', 'unintended consequences', 'Mike Masnick', 'Techdirt', 'National Public Radio', '

   result: {'batchcomplete': '', 'query': {'redirects': [{'from': 'United States Senate Foreign Relations Subcommittee on European Affairs', 'to': 'United States Senate Foreign Relations Subcommittee on Europe and Regional Security Cooperation'}], 'pages': {'7929507': {'pageid': 7929507, 'ns': 0, 'title': 'United States Senate Foreign Relations Subcommittee on Europe and Regional Security Cooperation'}}}}
   query: {'action': 'query', 'titles': ['United States Senate Foreign Relations Subcommittee on International Development and Foreign Assistance, Economic Affairs and International Environmental Protection, and Peace Corps', 'United States Senate Committee on the Judiciary', 'United States Senate Judiciary Subcommittee on Antitrust, Competition Policy and Consumer Rights', 'United States Senate Judiciary Subcommittee on Immigration, Refugees and Border Security', 'United States Senate Judiciary Subcommittee on Terrorism and Homeland Security', 'United States Senate Special Committee 

   result: {'batchcomplete': '', 'query': {'pages': {'26952279': {'pageid': 26952279, 'ns': 14, 'title': 'Category:Information economy'}}}}


In [32]:
! ls data

links  results
