In [1]:
from google.colab import drive
 
drive.mount('/gdrive')
# the project's folder
%cd /gdrive/'My Drive'/wikipedia

Mounted at /gdrive
/gdrive/My Drive/wikipedia


In [2]:
import requests

# Parsing HTML
from bs4 import BeautifulSoup

# File system management
import os

In [4]:
base_url = 'https://dumps.wikimedia.org/enwiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]
dumps

['../',
 '20200920/',
 '20201001/',
 '20201020/',
 '20201101/',
 '20201120/',
 '20201201/',
 '20201220/',
 'latest/']

In [5]:
dump_url = base_url + '20201220/'
# Retrieve the html
dump_html = requests.get(dump_url).text
dump_html[:1000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n<head>\n        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n        <title>enwiki dump progress on 20201220</title>\n        <link rel="stylesheet" type="text/css" href="/dumps.css" />\n        <style type="text/css">\n                .siteinfo {\n                        text-align: center;\n                }\n                li {\n                        list-style-type: none;\n                        padding: 0.5em 1.5em 0.5em 1.5em;\n                        background: #fff;\n                        margin-bottom: 1em;\n                }\n                li li {\n                        background-color: white;\n                        box-shadow: none;\n                        border-top: none;\n                        padding: 0px;\n                        margin-bot

In [6]:
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find li elements with the class file
soup_dump.find_all('li', {'class': 'file'}, limit = 10)

[<li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-multistream.xml.bz2">enwiki-20201220-pages-articles-multistream.xml.bz2</a> 17.8 GB</li>,
 <li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-multistream-index.txt.bz2">enwiki-20201220-pages-articles-multistream-index.txt.bz2</a> 217.8 MB</li>,
 <li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2">enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2</a> 233.7 MB</li>,
 <li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-multistream-index1.txt-p1p41242.bz2">enwiki-20201220-pages-articles-multistream-index1.txt-p1p41242.bz2</a> 221 KB</li>,
 <li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2">enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2</a> 315.7 MB</li>,
 <li class="file"><a href="/enwiki/20201220/enwiki-20201220-pages-articles-mult

In [7]:

import bz2
import subprocess

In [8]:
files = []

# Search through all files
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Select the relevant files
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
        
files[:5]

[('enwiki-20201220-pages-articles-multistream.xml.bz2', ['17.8', 'GB']),
 ('enwiki-20201220-pages-articles-multistream-index.txt.bz2', ['217.8', 'MB']),
 ('enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2',
  ['233.7', 'MB']),
 ('enwiki-20201220-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB']),
 ('enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2',
  ['315.7', 'MB'])]

In [9]:
files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
files_to_download[-5:]

['enwiki-20201220-pages-articles25.xml-p60025656p61525655.bz2',
 'enwiki-20201220-pages-articles25.xml-p61525656p62585850.bz2',
 'enwiki-20201220-pages-articles26.xml-p62585851p63975909.bz2',
 'enwiki-20201220-pages-articles27.xml-p63975910p65475909.bz2',
 'enwiki-20201220-pages-articles27.xml-p65475910p66163728.bz2']

In [11]:
import sys
from keras.utils import get_file
keras_home ='/gdrive/My Drive/wikipedia/final/.keras/datasets/'

In [12]:
data_paths = []
file_info = []
i=0
# Iterate through each file
for file in files_to_download:
  path = keras_home + file
  if not os.path.exists(keras_home + file):
    print('Downloading')
    i=i+1
    if i% 20 == 0:
      print('i',i)
    # If not, download the file
    data_paths.append(get_file(file,  dump_url+ file ))   
    file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
    file_info.append((file, file_articles))
  else:
    data_paths.append(path)
    file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
    file_info.append((file.split('-')[-1], file_number))

Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream3.xml-p151574p311329.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream4.xml-p311330p558391.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream5.xml-p558392p958045.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream6.xml-p958046p1483661.bz2
Downloading
Downloading data from https://dumps.wikimedia.org/enwiki/20201220/enwiki-20201220-pages-articles-multistream7.xml-p1483662p2

In [13]:
len(file_info)

118

In [14]:
sorted(file_info, key = lambda x: x[1], reverse = True)

[('enwiki-20201220-pages-articles-multistream11.xml-p5399367p6899366.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream12.xml-p7054860p8554859.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream13.xml-p9172789p10672788.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream14.xml-p11659683p13159682.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream15.xml-p14324603p15824602.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream15.xml-p15824603p17324602.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream16.xml-p17460153p18960152.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream16.xml-p18960153p20460152.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream17.xml-p20570393p22070392.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream17.xml-p22070393p23570392.bz2',
  1499999),
 ('enwiki-20201220-pages-articles-multistream18.xml-p23716198p25216197.bz2',
  1499999),
 ('enwiki-20201220-pages-a

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
file_df = pd.DataFrame(file_info, columns = ['file', 'articles']).set_index('file')


In [17]:
import bz2
import subprocess
import time

data_path = data_paths[16]
data_path

'/root/.keras/datasets/enwiki-20201220-pages-articles-multistream14.xml-p11659683p13159682.bz2'

In [18]:
%%timeit -n 3 -r 3

lines = []
for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
  lines.append(line)
  if i > 1e6:
    break

3 loops, best of 3: 4.74 s per loop


In [19]:
%%timeit -n 3 -r 3

lines = []
for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path), stdout = subprocess.PIPE).stdout):
  lines.append(line)
  if i > 1e6:
    break


3 loops, best of 3: 2.95 s per loop


In [20]:
lines = []

for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path),stdout = subprocess.PIPE).stdout):
  lines.append(line)
  if i > 5e5:
    break
lines[-102:-79]
        

[b'        <username>DESiegel</username>\n',
 b'        <id>186578</id>\n',
 b'      </contributor>\n',
 b'      <comment>/* top */Replacing links to draftspace with [[WP:AWB]]; may generatre red links, replaced: [[Draft: \xe2\x86\x92 [[</comment>\n',
 b'      <model>wikitext</model>\n',
 b'      <format>text/x-wiki</format>\n',
 b'      <text bytes="3183" xml:space="preserve">{{Infobox American football team\n',
 b'| name = Langley Rams\n',
 b'| logo = Langley Rams Logo.jpg\n',
 b'| logosize = 200px\n',
 b'| helmet =\n',
 b'| helmetsize = \n',
 b'| established = {{Start date|1948}}\n',
 b'| folded = \n',
 b'| location = [[Langley, British Columbia (city)|Langley, British Columbia]]\n',
 b'| stadium = [[McLeod Stadium]]\n',
 b'| coach = \n',
 b'| manager = \n',
 b'| league = [[Canadian Junior Football League]]\n',
 b'| division = B.C. Football Conference\n',
 b'| colours = royal blue and gold\n',
 b'| league_champs = \n',
 b'| div_champs =\n']

In [21]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
  def __init__(self):
    xml.sax.handler.ContentHandler.__init__(self)
    self._buffer = None
    self._values = {}
    self._current_tag = None
    self._pages = []
  def characters(self, content):
    if self._current_tag:
      self._buffer.append(content)
  def startElement(self, name, attrs):
    if name in ('title', 'text', 'timestamp'):
      self._current_tag = name
      self._buffer = []
  def endElement(self, name):
    if name == self._current_tag:
      self._values[name] = ' '.join(self._buffer)
    if name == 'page':
      self._pages.append((self._values['title'], self._values['text']))

In [22]:
# Content handler for Wiki XML
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

handler._pages

[]

In [24]:
lines[-100:-60]

[b'      </contributor>\n',
 b'      <comment>/* top */Replacing links to draftspace with [[WP:AWB]]; may generatre red links, replaced: [[Draft: \xe2\x86\x92 [[</comment>\n',
 b'      <model>wikitext</model>\n',
 b'      <format>text/x-wiki</format>\n',
 b'      <text bytes="3183" xml:space="preserve">{{Infobox American football team\n',
 b'| name = Langley Rams\n',
 b'| logo = Langley Rams Logo.jpg\n',
 b'| logosize = 200px\n',
 b'| helmet =\n',
 b'| helmetsize = \n',
 b'| established = {{Start date|1948}}\n',
 b'| folded = \n',
 b'| location = [[Langley, British Columbia (city)|Langley, British Columbia]]\n',
 b'| stadium = [[McLeod Stadium]]\n',
 b'| coach = \n',
 b'| manager = \n',
 b'| league = [[Canadian Junior Football League]]\n',
 b'| division = B.C. Football Conference\n',
 b'| colours = royal blue and gold\n',
 b'| league_champs = \n',
 b'| div_champs =\n',
 b'| sponsor =\n',
 b'| website = \n',
 b'}}\n',
 b'\n',
 b"The '''Langley Rams''' (formerly the '''South Surrey Rams'

In [27]:
handler._pages

[('1980 in Afghanistan',
  '{{Year in Afghanistan|1980}} \n The following lists events that happened during \'\'\'[[1980]] in [[Afghanistan]]\'\'\'. \n \n Karmal faces increasing friction within the Revolutionary Council and other wings of the government. One of the most striking evidences of Khalq-Parcham feuding comes when Karmal removes his deputy prime minister, Assadullah Sarwari, a prominent Khalqi, and three other Khalq followers from the scene by appointing them as ambassadors. Sarwari, who was once considered a potential Soviet choice to replace Karmal, is named envoy to [[Mongolia]] after a sojourn in the Soviet Union. There are reports of assassinations of Khalqis by Parchamites and vice versa, and bitter interparty fighting is said to have spread to army units and government agencies in various parts of the country. Karmal reshuffles his cabinet, promoting Sultan Ali Keshtmand, a trusted Parchamite colleague, to replace Sarwari as first deputy prime minister. \n \n ==Incumb

In [28]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'],stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    # Stop when 3 articles have been found
    if len(handler._pages) > 2:
      break
        
print([x[0] for x in handler._pages])

['1980 in Afghanistan', 'Memorial Stadium (Savannah)', 'List of the oldest living people']


In [29]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'],stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    # Stop when 50 articles have been found
    if len(handler._pages) > 50:
      break

In [30]:
!pip install mwparserfromhell

Collecting mwparserfromhell
[?25l  Downloading https://files.pythonhosted.org/packages/c6/00/03ccc2676e592f73ce455fd0343eb38d3779878332ba01ef4c0281a7d2a9/mwparserfromhell-0.6-cp36-cp36m-manylinux1_x86_64.whl (174kB)
[K     |█▉                              | 10kB 14.8MB/s eta 0:00:01[K     |███▊                            | 20kB 12.7MB/s eta 0:00:01[K     |█████▋                          | 30kB 9.7MB/s eta 0:00:01[K     |███████▌                        | 40kB 7.5MB/s eta 0:00:01[K     |█████████▍                      | 51kB 4.4MB/s eta 0:00:01[K     |███████████▎                    | 61kB 5.1MB/s eta 0:00:01[K     |█████████████▏                  | 71kB 5.0MB/s eta 0:00:01[K     |███████████████                 | 81kB 5.2MB/s eta 0:00:01[K     |████████████████▉               | 92kB 5.6MB/s eta 0:00:01[K     |██████████████████▊             | 102kB 5.4MB/s eta 0:00:01[K     |████████████████████▋           | 112kB 5.4MB/s eta 0:00:01[K     |█████████████████████

In [31]:
import mwparserfromhell 

print(handler._pages[6][0])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[6][1])

Portal:Spaceflight/Topics


In [32]:
print(type(wiki))
wiki[:100]

<class 'mwparserfromhell.wikicode.Wikicode'>


'{{Portal:Spaceflight/Tab header}} \n \n {{Portal:Spaceflight/box-header|Categories|Portal:Spaceflight/'

In [33]:
wikilinks = [x.title for x in wiki.filter_wikilinks()]
print(f'There are {len(wikilinks)} wikilinks.')
wikilinks[:5]

There are 0 wikilinks.


[]

In [34]:
external_links = [(x.title, x.url) for x in wiki.filter_external_links()]
print(f'There are {len(external_links)} external links.')
external_links[:5]

There are 0 external links.


[]

In [35]:
templates = wiki.filter_templates()
print(f'There are {len(templates)} templates.')
for template in templates:
  print(template.name)

There are 7 templates.
Portal:Spaceflight/Tab header
Portal:Spaceflight/box-header
Portal:Spaceflight/Topics/Categories
Box-footer
Portal:Spaceflight/box-header
Space exploration lists and timelines
Box-footer


In [36]:
infobox = wiki.filter_templates(matches = 'Infobox film')
infobox

[]

In [37]:
import re

def process_article(title, text, timestamp, template = 'Infobox film'):
  # Create a parsing object
  wikicode = mwparserfromhell.parse(text)
  # Search through templates for the template
  matches = wikicode.filter_templates(matches = template)
  # Filter out errant matches
  matches = [x for x in matches if x.name.strip_code().strip().lower() == template.lower()]
  if len(matches) >= 1:
    # Extract internal wikilinks
    wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
    # Extract external links
    exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]
    properties = {param.name.strip_code().strip(): param.value.strip_code().strip() for param in matches[0].params if param.value.strip_code().strip()}
    # Find approximate length of article
    text_length = len(wikicode.strip_code().strip())
    return (title, properties, wikilinks, exlinks, timestamp, text_length)

In [39]:
data_path=data_paths[20]
data_path

'/root/.keras/datasets/enwiki-20201220-pages-articles-multistream15.xml-p17324603p17460152.bz2'

In [40]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
  def __init__(self):
    xml.sax.handler.ContentHandler.__init__(self)
    self._buffer = None
    self._values = {}
    self._current_tag = None
    self._movies = []
    self._article_count = 0
    self._non_matches = []
  def characters(self, content):
    if self._current_tag:
      self._buffer.append(content)
  def startElement(self, name, attrs):
    if name in ('title', 'text', 'timestamp'):
      self._current_tag = name
      self._buffer = []
  def endElement(self, name):
    if name == self._current_tag:
      self._values[name] = ' '.join(self._buffer)
    if name == 'page':
      self._article_count += 1
      # Search through the page to see if the page is a book
      movie = process_article(**self._values, template = 'Infobox film')
      # Append to the list of books
      if movie:
        self._movies.append(movie)

In [41]:
# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path),stdout = subprocess.PIPE).stdout):
  parser.feed(line)
  # Stop when 3 articles have been found
  if len(handler._movies) > 2:
    break
        
print(f'Searched through {handler._article_count} articles to find 3 movies.')

Searched through 129 articles to find 3 movies.


In [42]:
handler._movies[1]

('The Dangerous Flirt',
 {'cinematography': 'Lucien Andriot < br > Maynard Rugg',
  'country': 'United States',
  'director': 'Tod Browning',
  'distributor': 'Film Booking Offices of America',
  'language': 'Silent < br > English intertitles',
  'name': 'The Dangerous Flirt',
  'producer': 'Gothic Pictures',
  'runtime': '6 reels 1,614 meters (5,297.014 feet)',
  'starring': 'Evelyn Brent < br > Edward Earle',
  'studio': 'Gothic Pictures',
  'writer': 'Julie Herne < br > Richard Schayer'},
 ['Tod Browning',
  'Julie Herne',
  'Richard Schayer',
  'Evelyn Brent',
  'Edward Earle',
  'Lucien Andriot',
  'Film Booking Offices of America',
  'reel#Motion picture terminology',
  'Silent film',
  'melodrama',
  'Tod Browning',
  'lost film',
  'Evelyn Brent',
  'Edward Earle',
  'Sheldon Lewis',
  'Pierre Gendron (actor)',
  'List of lost films',
  'Category:1924 films',
  'Category:1924 lost films',
  'Category:1924 romantic drama films',
  'Category:American films',
  'Category:American 

In [43]:
keras_home ='/gdrive/My Drive/wikipedia/final/.keras/datasets/'

In [44]:
data_path

'/root/.keras/datasets/enwiki-20201220-pages-articles-multistream14.xml-p13159683p14324602.bz2'

In [3]:
from timeit import default_timer as timer

In [44]:
from timeit import default_timer as timer

start = timer()
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Parse the entire file
for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path), stdout = subprocess.PIPE).stdout):
  if (i + 1) % 10000 == 0:
    print(f'Processed {i + 1} lines so far.', end = '\r')
  try:
    parser.feed(line)
  except StopIteration:
    break
    
end = timer()
movies = handler._movies

print(f'\nSearched through {handler._article_count} articles.')
print(f'\nFound {len(movies)} movies in {round(end - start)} seconds.')

Processed 2040000 lines so far.
Searched through 33714 articles.

Found 176 movies in 156 seconds.


In [45]:
data_paths

['/root/.keras/datasets/enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream3.xml-p151574p311329.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream4.xml-p311330p558391.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream5.xml-p558392p958045.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream6.xml-p958046p1483661.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream7.xml-p1483662p2134111.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream8.xml-p2134112p2936260.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream9.xml-p2936261p4045402.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream10.xml-p4045403p5399366.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream1

In [46]:
import json

# Save list of books
with open('/gdrive/MyDrive/wikipedia/data/movies1.ndjson', 'wt') as fout:
  for l in movies:
    fout.write(json.dumps(l) + '\n')

In [47]:

import json

# Save list of books
with open('/gdrive/My Drive/wikipedia/data/movies1.ndjson', 'wt') as fout:
  for l in movies:
    fout.write(json.dumps(l) + '\n')

In [48]:
movies_in = []

# Read in list of books
with open('/gdrive/My Drive/wikipedia/data/movies1.ndjson', 'rt') as fin:
  for l in fin.readlines():
    movies_in.append(json.loads(l))

In [49]:
len(movies_in)

176

In [50]:
movies_in[10]

['Junda Iman Gunda',
 {'assistant director': 'Deepak Roy  < br >  Manish Das -- >',
  'director': 'Chandra Mudoi \n < !--',
  'language': 'Assamese',
  'music': 'Dr Hitesh Baruah',
  'name': 'Junda Iman Gunda',
  'producer': 'Pranjal Bharali  < br >  Chabi Bhoralee',
  'released': '7 September 2007 < ref >  < /ref >',
  'screenplay': 'Chandra Mudoi',
  'starring': 'Bikram Rajkhowa  < br >  Angoorlata',
  'writer': 'Bhaben Borah'},
 ['Pranjal Bharali',
  'Angoorlata Deka',
  'Assamese language',
  'Assamese language',
  'romantic comedy',
  'Angoorlata Deka',
  'Debojit Saha',
  'Angoorlata Deka',
  'Film soundtrack',
  'Assamese language',
  'Suren Suror Putek',
  'Zubeen Garg',
  'Debojit Saha',
  'List of Assamese films of the 2000s',
  'Category:2007 films',
  'Category:2007 romantic comedy films',
  'Category:Indian films',
  'Category:Assamese-language films',
  'Category:Films set in Assam',
  'Category:Indian romantic comedy films',
  'Category:2000s Assamese-language films'],
 

In [51]:
import gc
import json

def find_movies(data_path, limit =None, save = True):
  handler = WikiXmlHandler()
  parser = xml.sax.make_parser()
  parser.setContentHandler(handler)
  # Iterate through compressed file
  for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    try:
      parser.feed(line)
    except StopIteration:
      break
    if limit is not None and len(handler._movies) >= limit:
      return handler._movies
  if save:
    partition_dir = '/gdrive/My Drive/wikipedia/final/'
    p_str = data_path.split('-')[-1].split('.')[-2]
    out_dir = partition_dir + f'{p_str}.ndjson'
    with open(out_dir, 'w') as fout:
      for movie in handler._movies:
        fout.write(json.dumps(movie) + '\n')
        
    print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')
  del handler
  del parser
  gc.collect()
  return None

In [52]:
from multiprocessing import Pool 
import tqdm 

# List of lists to single list
from itertools import chain

# Sending keyword arguments in map
from functools import partial
os.cpu_count()

4

In [None]:
files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
files_to_download

['enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2',
 'enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2',
 'enwiki-20201220-pages-articles-multistream3.xml-p151574p311329.bz2',
 'enwiki-20201220-pages-articles-multistream4.xml-p311330p558391.bz2',
 'enwiki-20201220-pages-articles-multistream5.xml-p558392p958045.bz2',
 'enwiki-20201220-pages-articles-multistream6.xml-p958046p1483661.bz2',
 'enwiki-20201220-pages-articles-multistream7.xml-p1483662p2134111.bz2',
 'enwiki-20201220-pages-articles-multistream8.xml-p2134112p2936260.bz2',
 'enwiki-20201220-pages-articles-multistream9.xml-p2936261p4045402.bz2',
 'enwiki-20201220-pages-articles-multistream10.xml-p4045403p5399366.bz2',
 'enwiki-20201220-pages-articles-multistream11.xml-p5399367p6899366.bz2',
 'enwiki-20201220-pages-articles-multistream11.xml-p6899367p7054859.bz2',
 'enwiki-20201220-pages-articles-multistream12.xml-p7054860p8554859.bz2',
 'enwiki-20201220-pages-articles-multistream12.xml-p8554860p91727

In [53]:
partitions = [file for file in data_paths if 'xml-p' in file]
len(partitions), partitions[-1]

(118,
 '/root/.keras/datasets/enwiki-20201220-pages-articles27.xml-p65475910p66163728.bz2')

In [51]:
a=[2,3,2,8]
a[1:]

[3, 2, 8]

In [54]:
partitions=partitions[43:]
print(len(partitions))

75


In [None]:
data_paths

['/root/.keras/datasets/enwiki-20201220-pages-articles-multistream1.xml-p1p41242.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream2.xml-p41243p151573.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream3.xml-p151574p311329.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream4.xml-p311330p558391.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream5.xml-p558392p958045.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream6.xml-p958046p1483661.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream7.xml-p1483662p2134111.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream8.xml-p2134112p2936260.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream9.xml-p2936261p4045402.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream10.xml-p4045403p5399366.bz2',
 '/root/.keras/datasets/enwiki-20201220-pages-articles-multistream1

In [None]:
len(data_paths)

118

In [None]:
# Create a pool of workers to execute processes
pool = Pool(processes = 32)

start = timer()

# Map (service, tasks), applies function to each partition
results = pool.map(find_movies, partitions)

pool.close()
pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')

In [5]:
def read_data(file_path):
  data = []
  # Open the file and load in json
  with open(file_path, 'r') as fin:
    for l in fin.readlines():
      data.append(json.loads(l))
  return data

In [6]:
from multiprocessing.dummy import Pool as Threadpool
from itertools import chain
from timeit import default_timer as timer
import json

start = timer()

# List of files to read in
saved_files = ['/gdrive/My Drive/wikipedia/final/' + x for x in os.listdir('/gdrive/My Drive/wikipedia/final/')]

# Create a threadpool for reading in files
threadpool = Threadpool(processes = 10)

# Read in the files as a list of lists
results = threadpool.map(read_data, saved_files)

# Flatten the list of lists to a single list
movie_list = list(chain(*results))

end = timer()

print(f'Found {len(movie_list)} movies in {round(end - start)} seconds.')

Found 151331 moviess in 10 seconds.


In [7]:
if not os.path.exists(os.getcwd() + '/data/movies_wiki.ndjson'):
  with open('/gdrive/My Drive/wikipedia/data/movies_wiki.ndjson', 'wt') as fout:
    for movie in movie_list:
      fout.write(json.dumps(movie) + '\n')
  print('movies saved.')
else:
    print('Files already saved.')

movies saved.
