# Extract a network of pages from Wikipedia

In the following example we query MediaWiki's [API](https://www.mediawiki.org/wiki/API:Main_page) to build a network starting from a list of "seed" pages.

In [1]:
# the wikipedia edition we will work on {lang}.wikipedia.org
lang = 'en'

Our list of seed articles is contained in `right_to_be_forgotten_seed.txt` in the `seeds` directory. There are other seed files in the `seeds` folder if you want to look them up. Now, let's take a look at `right_to_be_forgotten_seed.txt` with `cat`.

In [2]:
# we temporaily cd to the seeds dir and then print the contents of right_to_be_forgotten_seed.txt 
! ( cd 'seeds' && cat right_to_be_forgotten_seed.txt )

Right_to_be_forgotten


In [3]:
import os
import pathlib

# the seed file is ./right_to_be_forgotten_seed.txt
seed_file = pathlib.Path('seeds/right_to_be_forgotten_seed.txt')

# get the seed file without extension
seed_filename_noext = os.path.splitext(seed_file.name)[0]

We now set up the directory structure where we will save the results from the API, it will look like this:
```
data
├── links
│   ├── articles.txt
│   └── ...
└── results
    ├── extract_network.log
    ├── <seed_name>.csv
    └── network_<seed_name>.csv
```

In [4]:
# directory containing the seed articles (./data)
data_folder = pathlib.Path('data')
# directory containing the outlinks of each article (./data/links)
link_folder = data_folder/'links'
# directory containing the resulting networks (./data/results)
results_folder = data_folder/'results'

link_folder.mkdir(parents=True, exist_ok=True)
results_folder.mkdir(parents=True, exist_ok=True)

In [5]:
import logging

# setup the logging to file ./data/results/extract_network.log
log_name = 'extract_network.log'
log_filename = (results_folder/log_name)

logging.basicConfig(filename=log_filename, level=logging.DEBUG)

logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')


In [6]:
# open the file containing the network ./data/results/<seed>.csv
net_filename = 'network_' + seed_filename_noext + '.csv'
net_file = (results_folder/net_filename).open('w+')

We read our seed file in a dictionary, ignoring empty lines and comments (lines starting with #)

In [7]:
def load_seed(file_name):
    """
    Load the seed file, ignoring empty lines and comments (lines starting with #)
    """
    dic = {}
    infile = pathlib.Path(file_name).open('r')
    for line in infile:
        el = line.strip('\n')
        if el and el[0] != '#':
            el = el.replace(' ', '_')
            dic[el] = 1
    return dic

In [8]:
load_seed(seed_file)

{'Right_to_be_forgotten': 1}

In [9]:
from wikitools import wiki
from wikitools import api

# create a Wiki object
site = wiki.Wiki("https://{lang}.wikipedia.org/w/api.php".format(lang=lang))

# get outlinks of a wikipedia article through wiki api
def get_outlinks_from_api(title):
    p_id = -1
    outlinks = []

    if title == '' or title == ' ':
        return p_id, outlinks

    params = {'action': 'query',
              'prop': 'revisions', 
              'titles': title,
              'rvprop':'content',
              'redirects':1
              }
    request = api.APIRequest(site, params)

    logging.debug('query: {}'.format(params))

    result = request.query()
    if int(list(result['query']['pages'].keys())[0]) < 1:
        logging.warning('ARTICLE NOT FOUND: {}'.format(title))
        return (p_id, outlinks)

    else:
        outlinks = []

        p_id = list(result['query']['pages'].keys())[0]
        rev = result['query']['pages'][p_id]['revisions'][0]
        content = rev['*']

        links = parse_text(content)
        for l in links:
            target = l.replace(' ', '_')
            outlinks.append(target)

        return (p_id, outlinks)


In [10]:
def get_outlinks(title):
    outlinks_filename = '{}_articles.txt'.format(title.replace('/','.'))
    outlinks_saved = link_folder/outlinks_filename

    # data saved in the disc
    is_new_page = False
    try:
        with outlinks_saved.open('r') as f:
            outlinks_checked = f.read().splitlines()
    except IOError as e:
        outlinks_checked = []
        
    if outlinks_checked:
        logging.info('{} saved links from: {}'.format(len(outlinks_checked), title))
    else:
        # get data through wiki API
        is_new_page = True

        # p is the page_id
        (p, outlinks) = get_outlinks_from_api(title)
        
        (redirects, outlinks_checked) = check_redirects(outlinks)

    return (is_new_page, outlinks_checked)

In [11]:
def split_param_list(alist):
    chunk_len = 50
    if len(alist) < 1:
        return ''
    s = ['']
    l = 0
    i = 0

    chunks = [alist[x:x+chunk_len]
              for x in range(0, len(alist), chunk_len)
              ]

    return chunks

In [12]:
# Check redirect in Wikipedia articles/links
def check_redirects(titles):
    redirects = {}
    links = {}
    duplicates = 0

    title_lists = split_param_list(titles)
    for title_list in title_lists:

        params = {'action':'query',
                  'titles': '|'.join(title_list).strip('|'),
                  'redirects': 1
                  }
        request = api.APIRequest(site, params)

    
        logging.debug('query: {}'.format(params))

        result = request.query()
        logging.debug('result: {}'.format(result))

        if 'redirects' in result['query']:
            for redir in result['query']['redirects']:
                redirects[redir['from']] = redir['to']

        for page in result['query']['pages']:
            if page != '-1' and 'ns' in result['query']['pages'][page]:
                if result['query']['pages'][page]['ns'] == 0:
                    link = result['query']['pages'][page]['title'].replace(' ', '_')
                    if link in links:
                        duplicates += 1
                    links[link] = page

    missing = len(titles) - (len(links) + duplicates)
    if missing != 0:
        logging.debug('{} missing redirects '
                      '({} titles,  {} found, {} duplicates)'
                      .format(missing, len(titles), len(links), duplicates)
                      )
    return redirects,links

In [13]:
import re

# Regular expressions to find URLs in text
linkSimpleP = re.compile(r'\[\[(.+?)[][|{}/#]')
linkGreedyP = re.compile(r'\[\[([^]^[^}^{^#^/^|]+)')
linkP = re.compile(r'\[\[([^]^[^}^{^#^/^]+?)\s*(?:/[^]^[]*?)?\s*(?:\|[^]^[]*?)?(?:\}\})?\s*\]\]')

def parse_text(content):
    links = {}
    rough_links = re.findall(linkP, content) #get all links

    title_lists = split_param_list(rough_links)

    for title_list in title_lists:
        params = {'action': 'query',
                  'titles': '|'.join(title_list).strip('|'),
                  'format': 'json',
                  'redirect': 1
                  }
        request = api.APIRequest(site, params)
        
        logging.info('query: {}'.format(params))
    
        result = request.query()

        logging.debug('result: {}'.format(result))

        # check that we have received pages as result
        if result['query'].get('pages', None):
            for page in result['query']['pages']:
                if page != '-1' and ('ns' in result['query']['pages'][page]):
                    # Filter just useful links, articles are in ns == 0
                    if result['query']['pages'][page]['ns'] == 0:
                        link = result['query']['pages'][page]['title']
                        links[link] = 1

    return links

In [14]:
import itertools
import csv

csv_fields = [
'seed', 
'links_from_seed',
'links_to_seed',
'in_degree',
'out_degree',
'out_WP'
]

def extract_network(seed_file):

    # list containing all outlinks of seed pages
    links_to_seedlinks = []

    # read the list of seed defined by the user
    seeds_list = load_seed(seed_file)

    first_step = {}

    for seed in set(seeds_list.keys()):
        print("--> new seed article: {}\n".format(seed))

        (is_new_page, outlinks_checked) = get_outlinks(seed) # get outlink for current seed
        links_to_seedlinks.append(outlinks_checked) # append this outlinks to the list of all seeds' outlinks

        if is_new_page: ## data not already saved in disk, save it
            page_list_filename = seed.replace('/','.') + '_articles.txt'
            page_list_file = (link_folder/page_list_filename).open('a+')

            ##########
            # update degrees for seed and current target outlink
            ##########
            if seed in first_step:
                first_step[seed]['out_WP'] = len(outlinks_checked)

        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if seed != target:
                # write edge as pair of nodes (source, target)
                net_file.write(seed + '\t' + target  + '\n')
                if is_new_page:
                    page_list_file.write(target + '\n')
                try:
                    first_step[seed]['out_degree'] += 1
                    if(target in seeds_list):
                        first_step[seed]['links_to_seed'] += 1
                except:
                    if target in seeds_list:
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 1,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                    else:
                        first_step[seed] = {'seed': True,
                                            'links_from_seed': 0,
                                            'links_to_seed': 0,
                                            'in_degree': 0,
                                            'out_degree': 1,
                                            'out_WP': len(outlinks_checked)
                                            }
                try:
                    first_step[target]['links_from_seed'] += 1
                    first_step[target]['in_degree'] += 1
                    if(target in seeds_list):
                        first_step[target]['seed'] = True
                except:
                    if target in seeds_list:
                        first_step[target] = {'seed': True,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }
                    else:
                        first_step[target] = {'seed': False,
                                              'links_from_seed': 1,
                                              'links_to_seed': 0,
                                              'in_degree': 1,
                                              'out_degree': 0,
                                              'out_WP': 0
                                              }

        if is_new_page:
            page_list_file.close()

    # from the list of outlinks keep only those that are no seeds and remove duplicates
    links_to_seedlinks = list(itertools.chain.from_iterable(links_to_seedlinks))
    links_to_seedlinks = list(set(links_to_seedlinks) - set(seeds_list.keys()) )

    ##########
    # Start new iteraction over outlinks of the seed articles
    ##########
    olink_index = 1 # just a counter
    for title in links_to_seedlinks:
        print("  -> outlink {} ({} of {})"
              .format(title, olink_index, len(links_to_seedlinks))
             )

        olink_index += 1
        (is_new_page, outlinks_checked) = get_outlinks(title) # get outlinks of current articles

        if is_new_page: ## data not already saved in disk, save it
            page_list_file_title_name = title.replace('/','.') + '_articles.txt'
            page_list_file_title = (link_folder/page_list_file_title_name).open('w')

            ##########
            # update degress of source and target
            ##########
            first_step[title]['out_WP'] = len(outlinks_checked)
        for target in outlinks_checked: # outlinks_checked contains the outlinks of the current keyword
            if is_new_page:
                page_list_file_title.write(target + '\n')
            if target in seeds_list.keys( ) and target != title:
                net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                first_step[title]['links_to_seed'] += 1
                first_step[title]['out_degree'] += 1
                first_step[target]['in_degree'] += 1
            else:
                if target in links_to_seedlinks and target != title:
                    net_file.write( title + '\t' + target  + '\n') # write edge as pair of nodes
                    first_step[title]['out_degree'] += 1
                    first_step[target]['in_degree'] += 1
        # note that I write inside each if because I write only if the target is either a seed or a outlink of a seed (since we
        # only want the first step degree net)
        ##########
        ##########
        if is_new_page:
            page_list_file_title.close()

    net_file.close()
    # Write degrees in a file
    output_filename = 'first_step_degrees_{}.csv'.format(seed_filename_noext)
    with (results_folder/output_filename).open('w') as outfile:
        writer = csv.writer(outfile, delimiter ="\t")
        writer.writerow(['Page'] + csv_fields)
        for page in first_step.keys():
            writer.writerow([page] + [first_step[page][degree] for degree in csv_fields])


In [15]:
! ls data

links  results


In [16]:
extract_network(seed_file)

--> new seed article: Right_to_be_forgotten



in a future release, use the new queryGen function instead
for queries requring multiple requests


  -> outlink Right_to_privacy (1 of 106)
  -> outlink Cease_and_desist (2 of 106)
  -> outlink Twitter (3 of 106)
  -> outlink Child_prodigy (4 of 106)
  -> outlink Electronic_Frontier_Foundation (5 of 106)
  -> outlink The_New_Yorker (6 of 106)
  -> outlink Conservative_Party_(UK) (7 of 106)
  -> outlink Deutschlandradio (8 of 106)
  -> outlink Rehabilitation_of_Offenders_Act_1974 (9 of 106)
  -> outlink Historical_negationism (10 of 106)
  -> outlink Hamburg (11 of 106)
  -> outlink Max_Schrems (12 of 106)
  -> outlink Martin_v._Hearst_Corporation (13 of 106)
  -> outlink Federal_Constitutional_Court (14 of 106)
  -> outlink Commission_nationale_de_l'informatique_et_des_libertés (15 of 106)
  -> outlink Google_Groups (16 of 106)
  -> outlink Accountability (17 of 106)
  -> outlink Walter_Sedlmayr (18 of 106)
  -> outlink Larry_Page (19 of 106)
  -> outlink William_James_Sidis (20 of 106)
  -> outlink Consumer_Watchdog (21 of 106)
  -> outlink Audiencia_Nacional (22 of 106)
  -> outli

The result is contained in `./data/results/network_right_to_be_forgotten_seed.csv`. Wa can visualize the first ten rows.

In [17]:
! head ./data/results/network_right_to_be_forgotten_seed.csv

Right_to_be_forgotten	Argentina
Right_to_be_forgotten	Baidu
Right_to_be_forgotten	Cease_and_desist
Right_to_be_forgotten	Censorship
Right_to_be_forgotten	Child_prodigy
Right_to_be_forgotten	Consumer_Watchdog
Right_to_be_forgotten	Data_Protection_Directive
Right_to_be_forgotten	David_Weprin
Right_to_be_forgotten	Delhi_High_Court
Right_to_be_forgotten	Deutschlandradio


If you want to see a visualization of a bigger network visit [this page](https://ngi4eu.github.io/engineroom-data-sprint-notebooks/).