In [188]:
import requests
import os
import sys
import xml.sax
import gc
import json
import bz2
import subprocess
import re
import mwparserfromhell 
import tqdm 
import mwcomposerfromhell

from timeit import default_timer as timer

from bs4 import BeautifulSoup
from itertools import chain
from tensorflow import keras
from keras.utils import get_file
from multiprocessing import Pool 
from multiprocessing.dummy import Pool as Threadpool
from itertools import chain
from functools import partial


In [189]:
# Root listing of all English-Wikipedia dump runs
base_url = 'https://dumps.wikimedia.org/enwiki/'

# Fetch the index page. Fail loudly on HTTP errors or a hanging connection
# instead of silently parsing an error page further down.
index_response = requests.get(base_url, timeout = 60)
index_response.raise_for_status()
index = index_response.text
soup_index = BeautifulSoup(index, 'html.parser')

# Collect the target of every link on the index page. Besides the dated dump
# directories this also contains navigation entries such as '../' and 'latest/'.
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]

['../',
 '20230620/',
 '20230701/',
 '20230720/',
 '20230801/',
 '20230820/',
 '20230901/',
 '20230920/',
 'latest/']

In [190]:
# Dump run to process (a dated directory below base_url)
dump_url = base_url + '20230620/'

# Retrieve the html of the dump page. Old dump runs are removed from the
# server over time, so check the status explicitly: otherwise an error page
# would be parsed and the file list below would silently come out empty.
dump_response = requests.get(dump_url, timeout = 60)
dump_response.raise_for_status()
dump_html = dump_response.text

'<!DOCTYPE '

In [191]:
# Parse the dump page's HTML so its elements can be searched
soup_dump = BeautifulSoup(markup = dump_html, features = 'html.parser')

In [192]:
# (file name, [size, unit]) for every 'pages-articles' entry on the dump page
files = []

for entry in soup_dump.find_all('li', {'class': 'file'}):
    label = entry.text
    # Skip list entries that are not article dumps
    if 'pages-articles' not in label:
        continue
    # First whitespace-separated token is the file name, the rest is the size
    name, *size_parts = label.split()
    files.append((name, size_parts))

[('enwiki-20230620-pages-articles-multistream.xml.bz2', ['20.6', 'GB']),
 ('enwiki-20230620-pages-articles-multistream-index.txt.bz2', ['240.0', 'MB']),
 ('enwiki-20230620-pages-articles-multistream1.xml-p1p41242.bz2',
  ['258.4', 'MB']),
 ('enwiki-20230620-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB']),
 ('enwiki-20230620-pages-articles-multistream2.xml-p41243p151573.bz2',
  ['345.2', 'MB']),
 ('enwiki-20230620-pages-articles-multistream-index2.txt-p41243p151573.bz2',
  ['638', 'KB']),
 ('enwiki-20230620-pages-articles-multistream3.xml-p151574p311329.bz2',
  ['374.6', 'MB']),
 ('enwiki-20230620-pages-articles-multistream-index3.txt-p151574p311329.bz2',
  ['819', 'KB']),
 ('enwiki-20230620-pages-articles-multistream4.xml-p311330p558391.bz2',
  ['417.6', 'MB']),
 ('enwiki-20230620-pages-articles-multistream-index4.txt-p311330p558391.bz2',
  ['1.3', 'MB']),
 ('enwiki-20230620-pages-articles-multistream5.xml-p558392p958045.bz2',
  ['448.0', 'MB']),
 ('enwiki-2023062

In [193]:
# Keep only the partitioned, non-multistream article dumps
files_to_download = [
    name
    for name, _size in files
    if '.xml-p' in name and 'multistream' not in name
]

['enwiki-20230620-pages-articles1.xml-p1p41242.bz2',
 'enwiki-20230620-pages-articles2.xml-p41243p151573.bz2',
 'enwiki-20230620-pages-articles3.xml-p151574p311329.bz2',
 'enwiki-20230620-pages-articles4.xml-p311330p558391.bz2',
 'enwiki-20230620-pages-articles5.xml-p558392p958045.bz2',
 'enwiki-20230620-pages-articles6.xml-p958046p1483661.bz2',
 'enwiki-20230620-pages-articles7.xml-p1483662p2134111.bz2',
 'enwiki-20230620-pages-articles8.xml-p2134112p2936260.bz2',
 'enwiki-20230620-pages-articles9.xml-p2936261p4045402.bz2',
 'enwiki-20230620-pages-articles10.xml-p4045403p5399366.bz2',
 'enwiki-20230620-pages-articles11.xml-p5399367p6899366.bz2',
 'enwiki-20230620-pages-articles11.xml-p6899367p7054859.bz2',
 'enwiki-20230620-pages-articles12.xml-p7054860p8554859.bz2',
 'enwiki-20230620-pages-articles12.xml-p8554860p9172788.bz2',
 'enwiki-20230620-pages-articles13.xml-p9172789p10672788.bz2',
 'enwiki-20230620-pages-articles13.xml-p10672789p11659682.bz2',
 'enwiki-20230620-pages-articles

In [195]:
# Request the first partition of the 20230620 dump in streaming mode, i.e.
# the body is only transferred once it is read from `response`
url = "https://dumps.wikimedia.org/enwiki/20230620/enwiki-20230620-pages-articles1.xml-p1p41242.bz2"
response = requests.get(url = url, stream = True)

In [196]:
# Local path of every partition, in the order of files_to_download
data_paths = []
# (partition id, size in MB, page-id span) for every partition
file_info = []
# Directory in which keras' get_file caches downloads by default
# (~/.keras/datasets/). The trailing '' keeps the trailing path separator
# that the string concatenations in this notebook rely on.
keras_home = os.path.join(os.path.expanduser('~'), '.keras', 'datasets', '')

# Iterate through each file
for file in files_to_download:
    path = keras_home + file
    
    # Check to see if the path exists (if the file is already downloaded)
    if not os.path.exists(path):
        print('Downloading')
        # If not, download the file; get_file returns the path it saved to,
        # which is authoritative even if keras uses a non-default cache dir
        path = get_file(origin = dump_url + file)

    data_paths.append(path)

    # Record the statistics for every file, freshly downloaded or cached
    # Find the file size in MB
    file_size = os.stat(path).st_size / 1e6
    
    # Span between the last and first page id encoded in the file name
    # ('...xml-p<first>p<last>.bz2'), used as the number of articles
    file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
    file_info.append((file.split('-')[-1], file_size, file_number))

In [197]:

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects philosopher infoboxes from a dump.

    For every ``<page>`` element the character data of its ``<title>``,
    ``<text>`` and ``<timestamp>`` elements is captured and passed to
    ``process_article``; every truthy result is appended to
    ``_philosophers``. ``_article_count`` counts all closed ``<page>``
    elements, whether or not they matched.
    """

    # Elements whose character data is captured for each page
    _TRACKED_TAGS = ('title', 'text', 'timestamp')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element currently captured
        self._values = {}          # tag name -> text for the current page
        self._current_tag = None   # tracked element currently open, if any
        self._philosophers = []    # results returned by process_article
        self._article_count = 0    # number of <page> elements seen
        self._non_matches = []

    def characters(self, content):
        """Collect character data that belongs to a tracked element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when a tracked element opens."""
        if name in self._TRACKED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store captured text; process the article when its page closes."""
        if name == self._current_tag:
            # The parser may deliver one text node in several chunks (e.g.
            # around entity references), so the chunks must be concatenated
            # without a separator to reproduce the original text.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text between elements is not buffered
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._article_count += 1
            # Only process pages for which every tracked element was seen
            if all(tag in self._values for tag in self._TRACKED_TAGS):
                # Search through the page to see if the page is a philosopher
                philosopher = process_article(**self._values, template = 'Infobox philosopher')
                # Append to the list of philosophers
                if philosopher:
                    self._philosophers.append(philosopher)
            # Never carry values of this page over to the next one
            self._values = {}

In [240]:
def getLink(string = '[[link]]') -> 'tuple[str, str]':
    """Turn a wikilink such as ``[[Target#Section|label]]`` into (name, url).

    `string` may be a plain string or any object whose ``str()`` is the
    wikilink markup. The label after ``|`` and the section after ``#`` are
    dropped and surrounding whitespace is removed from the target.

    Returns a tuple ``(name, link)`` where `name` is the page title and
    `link` is the English-Wikipedia URL with spaces replaced by underscores.
    """
    # Target part only: before any '|', inside the brackets, before any '#'
    target = str(string).split('|')[0].split('[[')[-1].split(']]')[0].split('#')[0]
    name = target.strip()
    link = 'https://en.wikipedia.org/wiki/' + name.replace(' ', '_')
    return (name, link)

In [275]:
teste = []
def process_article(title, text, timestamp, template = 'Infobox philosopher'):
    """Extract the influence links from an article's `template` infobox.

    The article `text` is parsed as wikicode and the first template whose
    name equals `template` (case-insensitively) is inspected. Returns
    ``{title: content}`` where `content` maps each of the parameters
    'influences', 'influenced' and 'influenced by' that is present to a list
    of ``getLink`` results for the wikilinks in its value. Returns None when
    the article has no such template. `timestamp` is accepted but not used.
    """
    influence_fields = ('influences', 'influenced', 'influenced by')

    parsed = mwparserfromhell.parse(text)

    # filter_templates only pre-selects candidates; keep exact name matches
    infoboxes = []
    for candidate in parsed.filter_templates(matches = template):
        if candidate.name.strip_code().strip().lower() == template.lower():
            infoboxes.append(candidate)

    if not infoboxes:
        return None

    # Only the first matching infobox is used
    content = {}
    for param in infoboxes[0].params:
        field = param.name.strip_code().strip()
        if field in influence_fields:
            content[field] = [getLink(wikilink)
                              for wikilink in param.value.filter_wikilinks()]

    return {title: content}

In [276]:
# Object for handling xml
handler = WikiXmlHandler()

# Number of decompressed lines fed to the parser
count = 0

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Decompress the first partition with bzcat and stream it line by line into
# the SAX parser. The context managers close the input file and wait for the
# bzcat process even if the cell is interrupted.
with open(data_paths[0], 'rb') as compressed, \
     subprocess.Popen(['bzcat'],
                      stdin = compressed,
                      stdout = subprocess.PIPE) as bzcat:
    for i, line in enumerate(bzcat.stdout):
        parser.feed(line)

        count += 1
        # Progress report every 100000 lines
        if(count % 100000 == 0):
            print(f'Searched through {handler._article_count}.')

print(f'Searched through {handler._article_count} articles to find {len(handler._philosophers)} philosophers.')

Searched through 375.
Searched through 776.
Searched through 1207.
Searched through 1702.
Searched through 2102.
Searched through 2488.
Searched through 2837.
Searched through 3241.
Searched through 3554.
Searched through 4003.
Searched through 4394.
Searched through 4887.
Searched through 5243.
Searched through 5689.
Searched through 6021.
Searched through 6416.
Searched through 6844.
Searched through 7160.
Searched through 7566.
Searched through 7896.
Searched through 8310.
Searched through 8691.
Searched through 9038.
Searched through 9366.
Searched through 9769.
Searched through 10124.
Searched through 10448.
Searched through 10780.
Searched through 11326.
Searched through 11630.
Searched through 11998.
Searched through 12333.
Searched through 12631.
Searched through 13022.
Searched through 13398.
Searched through 13780.
Searched through 14077.
Searched through 14481.
Searched through 14903.
Searched through 15253.
Searched through 15665.
Searched through 16121.
Searched through 16

In [286]:
# Display the results collected by the handler while parsing: a list of
# {title: content} dicts as returned by process_article
handler._philosophers

[{'Aristotle': {'influences': [('Plato',
     'https://en.wikipedia.org/wiki/Plato'),
    ('Socrates', 'https://en.wikipedia.org/wiki/Socrates'),
    ('Heraclitus', 'https://en.wikipedia.org/wiki/Heraclitus'),
    ('Parmenides', 'https://en.wikipedia.org/wiki/Parmenides'),
    ('Empedocles', 'https://en.wikipedia.org/wiki/Empedocles'),
    ('Phaleas of Chalcedon',
     'https://en.wikipedia.org/wiki/Phaleas_of_Chalcedon'),
    ('Hippodamus of Miletus',
     'https://en.wikipedia.org/wiki/Hippodamus_of_Miletus'),
    ('Hippias', 'https://en.wikipedia.org/wiki/Hippias')],
   'influenced': [('Averroism', 'https://en.wikipedia.org/wiki/Averroism'),
    ('Avicennism', 'https://en.wikipedia.org/wiki/Avicennism'),
    ('Neo-Aristotelianism (literature)',
     'https://en.wikipedia.org/wiki/Neo-Aristotelianism_(literature)'),
    ('Maimonideanism', 'https://en.wikipedia.org/wiki/Maimonideanism'),
    ('Objectivism', 'https://en.wikipedia.org/wiki/Objectivism'),
    ('Peripatetics', 'https://en

In [298]:
# Directory that receives one output file per parsed partition
partition_dir = '/home/marialuiza/faculdade/ic/data/wiki/partitions/'
os.makedirs(partition_dir, exist_ok = True)

# Name the output after the partition id embedded in the dump file name
p_str = data_paths[0].split('-')[-1].split('.')[-2]
out_dir = partition_dir + f'{p_str}.json'

# Write one JSON document per line
with open(out_dir, 'w') as fout:
    fout.writelines(json.dumps(philosopher) + '\n'
                    for philosopher in handler._philosophers)

print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

2 files processed.

In [97]:
def find_philosophers(data_path, limit = None, save = True,
                      partition_dir = '/data/wiki/partitions/'):
    """Find all the philosopher articles in a compressed wikipedia XML dump.

    Parameters:
        data_path: path of a bz2-compressed dump partition; it is
            decompressed by an external `bzcat` process.
        limit: optional number of philosophers after which parsing stops.
        save: if true, the philosophers are written as newline-delimited
            JSON to `partition_dir`, in a file named after the partition id
            taken from `data_path`.
        partition_dir: output directory; created if it does not exist.

    Returns:
        The list of philosophers found so far when `limit` is reached (in
        that case nothing is saved), otherwise None.
    """

    # Object for handling xml
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Iterate through compressed file. The context managers close the input
    # file and reap the bzcat process on every exit path, including the
    # early return below.
    with open(data_path, 'rb') as compressed, \
         subprocess.Popen(['bzcat'],
                          stdin = compressed,
                          stdout = subprocess.PIPE) as bzcat:
        for line in bzcat.stdout:
            try:
                parser.feed(line)
            except StopIteration:
                break

            # Optional limit
            if limit is not None and len(handler._philosophers) >= limit:
                return handler._philosophers

    if save:
        os.makedirs(partition_dir, exist_ok = True)
        # Create file name based on partition name
        p_str = data_path.split('-')[-1].split('.')[-2]
        out_dir = os.path.join(partition_dir, f'{p_str}.ndjson')

        # Open the file
        with open(out_dir, 'w') as fout:
            # Write as json, one philosopher per line
            for philosopher in handler._philosophers:
                fout.write(json.dumps(philosopher) + '\n')

        print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

    # Memory management
    del handler
    del parser
    gc.collect()
    return None

In [98]:
# Full path of every downloaded dump partition in the keras cache directory
partitions = [
    keras_home + name
    for name in os.listdir(keras_home)
    if 'xml-p' in name
]
len(partitions), partitions[-1]

(64,
 '/home/marialuiza/.keras/datasets/enwiki-20230620-pages-articles21.xml-p37022433p38522432.bz2')

In [99]:
# Number of CPUs reported by the OS, for reference when sizing the worker pool
os.cpu_count()

8

In [100]:
# Create a pool of workers to execute processes. The context manager tears
# the worker processes down on every exit path, including an interrupted run.
with Pool(processes = 4) as pool:
    start = timer()

    # Map (service, tasks), applies function to each partition; blocks until
    # every partition has been processed
    results = pool.map(find_philosophers, partitions)

end = timer()
print(f'{end - start} seconds elapsed.')

KeyboardInterrupt: 

In [79]:
def read_data(file_path):
    """Read newline-delimited json data from `file_path`.

    Every non-blank line is parsed as one JSON document; blank lines are
    skipped. Returns the parsed documents as a list in file order.
    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """

    data = []

    # Iterate over the file lazily instead of loading all lines at once
    with open(file_path, 'r', encoding = 'utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line:
                data.append(json.loads(line))

    return data

In [None]:
# List of files to read in
saved_files = ['/data/wiki/partitions/' + x for x in os.listdir('/data/wiki/partitions/')]

# Create a threadpool for reading in files; the context manager shuts the
# worker threads down once all files have been read
with Threadpool(processes = 4) as threadpool:
    # Read in the files as a list of lists
    results = threadpool.map(read_data, saved_files)

# Flatten the list of lists to a single list
philosopher_list = list(chain.from_iterable(results))

In [None]:
# One path for both the existence check and the write, so they can never
# refer to different files
philosophers_path = '../data/found_books_philosophers.ndjson'

if not os.path.exists(philosophers_path):
    # Make sure the target directory exists before writing
    os.makedirs(os.path.dirname(philosophers_path), exist_ok = True)
    with open(philosophers_path, 'wt') as fout:
        # One JSON document per line
        for philosopher in philosopher_list:
             fout.write(json.dumps(philosopher) + '\n')
    print('Philosopher saved.')
else:
    print('Files already saved.')