# Downloading

This downloads as many wikipedia pages as you can stand, one at a time, and then runs some code over those xml dumps to turn them into a tabular format called 'parquet' that is much faster to work with than 

In [3]:
import urllib.request
from pathlib import Path

In [4]:
wikipedia_index = urllib.request.urlopen("https://dumps.wikimedia.org/enwiki/20211020/").read().decode("utf-8")

In [6]:
import re
pages = re.findall("/enwiki/20211020/enwiki-20211020-pages-articles-multistream[0-9]+.xml-p[0-9]+p[0-9]+.bz2", wikipedia_index)
for page in pages[:6]:
    link = "https://dumps.wikimedia.org/" + page
#    print("Downloading " + page)
    destination_file = Path(page.split("/")[-1])
    if destination_file.exists():
        # Download re-download.
        continue
    print("to ", destination_file)        
    urllib.request.urlretrieve(link, page.split("/")[-1])

In [7]:
import bz2

In [14]:
for p in Path(".").glob("*.bz2"):
    print(p)
    dest = p.with_suffix(".parquet")
    if dest.exists() or "pageviews" in dest.name:
        continue

    f = bz2.open(p)

    i = 0
    pages = []

    currently_flushing_to_text = False
    for line in f:
        if i % 10_000 == 0:
            print("line ", i, end = "\r")    
        i += 1
        if line == b'  <page>\n':
            title = None
            last_edit = None
            id = None
            redirect = None
            article_text = b""
        elif line == b'  </page>\n':        
            currently_flushing_to_text = False
            pages.append((title, last_edit, article_text.decode("utf-8", "ignore"), id, redirect))
            if (len(pages) % 100) == 0:
                print(f"                                      {len(pages)} pages", end = "\r")
        elif currently_flushing_to_text:
            if line.endswith(b'</text>\n'):
                currently_flushing_to_text = False
                # Drop the end-of-article code.
                line = line[:-9]
            article_text += line
        else:
            title_match = re.findall(b'^    <title>(.*)</title>\n', line)
            if title_match:
                title = title_match[0]
                continue
            redirect_match = re.findall(b'^    <redirect title="(.*)" />\n', line)
            if redirect_match:
                redirect = redirect_match[0]
                continue
            id_match = re.findall(b'^    <id>([0-9]+)</id>\n', line)
            if id_match:
                id = int(id_match[0])
                continue
            timestamp_match = re.findall(b'^      <timestamp>(.*)</timestamp>\n', line)
            if timestamp_match:
                last_edit = timestamp_match[0]
                continue
            textstart_match = re.findall(b'^      <text[^>]+>(.*)', line)
            if textstart_match:
                article_text = textstart_match[0]
                currently_flushing_to_text = True

    # Now we need to get it out into a more manageable format. For this I use the pyarrow library. This 
    # is a little more idiosyncratic--most people would use pandas foir this.

    import pyarrow as pa
    from pyarrow import parquet as pq
    titles, edit_dates, article_texts, ids, redirects = zip(*pages)
    tab = pa.table({
        "titles": pa.array(titles, pa.string()),
        "edited": pa.array(edit_dates, pa.string()),
        "id": pa.array(ids),
        "text": pa.array(article_texts, pa.string()),
        "redirects": pa.array(redirects, pa.string())
    })
    # Use some heavy-duty modern compression.
    pq.write_table(tab, dest, compression = "ZSTD", compression_level = 7)

enwiki-20211020-pages-articles-multistream6.xml-p958046p1483661.bz2
enwiki-20211020-pages-articles-multistream2.xml-p41243p151573.bz2


In [None]:
pip 

In [12]:
import duckdb

ModuleNotFoundError: No module named 'duckdb'

In [None]:
con = duckdb.connect(":memory:")


<duckdb.DuckDBPyConnection at 0x7f958039aa70>

In [146]:
#con.execute("SELECT titles FROM wiki WHERE text LIKE '%a city in%' LIMIT 10000").df()
con.execute("SELECT LEN(text) AS t, * FROM wiki WHERE LEN(text) > 370000 LIMIT 10").df()

Unnamed: 0,t,titles,edited,id,text,redirects
0,375115,Apple Inc.,2021-10-19T14:40:56Z,856,{{Short description|American multinational tec...,
1,375630,Italy,2021-10-18T17:10:27Z,14532,{{short description|Country in Southern Europe...,
2,372382,Foreign relations of India,2021-10-15T13:49:04Z,14604,{{Short description|Overview of the foreign re...,
3,402442,Pakistan,2021-10-15T09:05:39Z,23235,{{short description|Country in South Asia}}{{a...,
4,380887,2000s,2021-10-19T16:57:03Z,34579,{{Cleanup bare URLs|date=September 2021}}{{sho...,
5,389819,History of Australia,2021-10-19T10:10:32Z,39582,{{short description|Australian history}}{{pp-s...,
6,386344,List of Puerto Ricans,2021-10-19T14:55:47Z,87732,{{Short description|Wikipedia list article}}{{...,
7,459839,List of suicides,2021-10-17T20:06:12Z,143848,{{short description|Wikimedia list article}}{{...,
8,429162,List of municipalities in Michigan,2021-10-09T21:49:38Z,236034,{{short description|List}}[[File:Michigan in U...,
9,525586,Wikipedia:Upload log archive/May 2003,2016-11-05T13:38:54Z,239814,==[[Wikipedia:Upload log]] archive for May 200...,
