## Downloading Wikipedia Articles

##### This notebook downloads all English Wikipedia articles from the Wikimedia dump site.

### Finding Files to Download

In [2]:
import requests
from bs4 import BeautifulSoup
from timeit import default_timer as timer
import os

# Locate one Wikipedia dump snapshot on the Wikimedia dump site and collect
# the names of its partitioned `pages-articles` files.
base_url = 'https://dumps.wikimedia.org/enwiki/'
DUMP_DATE = '20240801'    # snapshot to download; change this to target another dump
REQUEST_TIMEOUT = 30      # seconds; requests waits indefinitely when no timeout is given

# Retrieve the index page, failing loudly on an HTTP error instead of
# parsing an error page as if it were the index
index_response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
index_response.raise_for_status()
index = index_response.text
soup_index = BeautifulSoup(index, 'html.parser')

# Find the link whose text is the requested dump date
dumps = [a['href'] for a in soup_index.find_all('a') if
         a.text == f'{DUMP_DATE}/']
if not dumps:
    # Without this guard the lookup below fails with an unexplained IndexError
    raise RuntimeError(f'Dump {DUMP_DATE} is not listed at {base_url}')

dumps_url = base_url + dumps[0]

# Retrieve the html of the dump page
dump_response = requests.get(dumps_url, timeout=REQUEST_TIMEOUT)
dump_response.raise_for_status()
dump_html = dump_response.text

# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Each entry is (file name, remaining whitespace-separated tokens of the list item)
files = []
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    if 'pages-articles' in text:
        parts = text.split()
        files.append((parts[0], parts[1:]))

# Keep only the partitioned files, i.e. names containing '.xml-p'
files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
print(f'There are {len(files_to_download)} files to download.')

There are 134 files to download.


## Downloading Files Using Keras

In [6]:
from keras.utils import get_file

# Download every selected dump file with Keras' `get_file`, collecting the
# value it returns for each one and timing the whole run.
data_paths = []

start = timer()
for file in files_to_download:
    # The file name is reused as the local name; the URL is the dump page URL + name
    local_path = get_file(fname=file, origin=dumps_url + file)
    data_paths.append(local_path)
end = timer()

print(f'{round(end - start)} total seconds elapsed.')

Downloading data from https://dumps.wikimedia.org/enwiki/20240801/enwiki-20240801-pages-articles27.xml-p65475910p66975909.bz2
[1m370919756/370919756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 0us/step
Downloading data from https://dumps.wikimedia.org/enwiki/20240801/enwiki-20240801-pages-articles27.xml-p66975910p68475909.bz2
[1m391441028/391441028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 0us/step
Downloading data from https://dumps.wikimedia.org/enwiki/20240801/enwiki-20240801-pages-articles27.xml-p68475910p69975909.bz2
[1m388200981/388200981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 0us/step
Downloading data from https://dumps.wikimedia.org/enwiki/20240801/enwiki-20240801-pages-articles27.xml-p69975910p71475909.bz2
[1m379624343/379624343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 0us/step
Downloading data from https://dumps.wikimedia.org/enwiki/20240801/enwiki-20240801-pages-articles27.xml-p71475910p72975909.bz2
[1m361059132/3

##### Files will be saved in <mark>~/.keras/datasets</mark>

#### The total download time was over 2 hours. The process could be made faster by downloading several files in parallel using multithreading or multiprocessing.