In [1]:
import sys
import os

sys.path.insert(0, '../src') # add library code to path

In [2]:
from wikiparser import download_enwiki_zips

zip_outdir = "../data/raw/zips"

download_enwiki_zips(1, zip_outdir , False)

Downloading enwiki file: (1/1)


In [3]:
zips = os.listdir(zip_outdir)
zips

['enwiki-20200101-0.7z']

In [4]:
from wikiparser import extract_7zip

extract_outdir = "../data/raw/extracted"

for zip_file in os.listdir(zip_outdir ):
    if zip_file[-2:] == "7z":
        extract_7zip(zip_outdir + "/" + zip_file, extract_outdir)

Extracted file already exists: ../data/raw/extracted/enwiki-20200101-0


In [5]:
class Revision:
    def __init__(self, timestamp, revert, version, contributor, revision_id, revision_parentid, sha1):
        self.timestamp = timestamp
        self.revert = revert
        self.version = version
        self.contributor = contributor
        self.revision_id = revision_id
        self.revision_parentid = revision_parentid
        self.sha1 = sha1
        
    def __repr__(self):
        return [self.timestamp, self.revert, self.version, self.contributor, self.revision_id, self.revision_parentid]
        
    def __str__(self):
        try:
            return "^^^_" + self.timestamp + " " + str(self.revert) + " " + str(self.version) + " " + self.contributor
        except:
            print(self.timestamp)
            print(self.revert)
            print(self.version)
            print(self.contributor)
    
    def get_revision_id(self):
        return self.revision_id

In [12]:
#parse the file, calculate lightdump information, output to outfile
def parse_enwiki_to_lightdump(filepath, outfile, outdir, articles=[]):

    from lxml import etree

    context = etree.iterparse(filepath, tag='{http://www.mediawiki.org/xml/export-0.10/}page', encoding='utf-8')
    nsmap = {'ns': 'http://www.mediawiki.org/xml/export-0.10/'}

    article_count = len(articles) if len(articles) != 0 else -1
    
    revi_header = "^^^_"

    page_dicts = {}
        
    if not os.path.exists(outdir):
        os.makedirs(outdir, exist_ok=True)

    with open(outdir + "/" + outfile, 'w') as file:
        file.write("")

    for event, elem in context:

        page_title = '_'.join(elem.find('ns:title', nsmap).text.split())
        page_id = elem.find('ns:id', nsmap).text
        revisions = elem.findall('ns:revision', nsmap)

        rev_dicts = []
        
        if len(articles) != 0 and page_title not in articles:
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
            continue
        
#         print(page_title)
#         print(len(revisions))
        
        for revision in revisions:
            rev = {}
            
#             for item in revision:
#                 print(item, item.text)
            
            rev['revision_id'] = revision.find("ns:id", nsmap).text
            rev['revision_parentid'] = revision.find("ns:parentid", nsmap).text if revision.find("ns:parentid", nsmap) != None else None
            rev['timestamp'] = revision.find("ns:timestamp", nsmap).text
            rev['sha1'] = revision.find("ns:sha1", nsmap).text
            rev['text'] = revision.find("ns:text", nsmap).text

            print("Printing text for: " + page_title + " ===================================================")
            print(rev['text'])
            
            #getting contributer info
            contributor_info = revision.find("ns:contributor", nsmap)
            revision_contributor_username = contributor_info.find("ns:username", nsmap).text if contributor_info.find("ns:username", nsmap) != None else None
            revision_contributor_id = contributor_info.find("ns:id", nsmap).text if contributor_info.find("ns:id", nsmap) != None else None
            revision_contributor_ip = contributor_info.find("ns:ip", nsmap).text if contributor_info.find("ns:ip", nsmap) != None else None
            
            if revision_contributor_username != None:
                revision_contributor_username = "_".join(revision_contributor_username.split())
                rev['contributor'] = revision_contributor_username
            elif revision_contributor_ip != None:
                rev['contributor'] = revision_contributor_ip
            else:
                rev['contributor'] = "null"
                
            rev_dicts.append(rev)
            
        rev_dicts.sort(key=lambda x: x['timestamp'])

        version = 1
        page_results = []

        for i in range(len(rev_dicts)):

            temp_rev = Revision(rev_dicts[i]['timestamp'], 0, version, rev_dicts[i]['contributor'], rev_dicts[i]['revision_id'], rev_dicts[i]['revision_parentid'], rev_dicts[i]['sha1'])
            if rev_dicts[i]['revision_parentid'] == None:
                page_results.append(temp_rev)
                version += 1    
            else:
                # find the point we revert to
                min_ind = len(page_results) - 1
                while min_ind > 0:
                    if page_results[min_ind].sha1 == rev_dicts[i]['sha1']:
                        temp_rev.version = page_results[min_ind].version
                        temp_rev.revert = 1
                        page_results.append(temp_rev)
                        break
                    min_ind -= 1

                if min_ind == 0:
                    page_results.append(temp_rev)
                    version += 1
        
        if len(articles) == 0 or page_title in articles:
            article_count -= 1
    
            print("Writing {} {} revisions to lightdump.txt".format(page_title, len(page_results)))
            with open(outdir + "/" + outfile, 'a') as file:
                file.write(page_title.strip() + '\n')
                for i in range(len(page_results) - 1, -1, -1):
                    file.write(page_results[i].__str__() + "\n")
            
                    
        # release uneeded XML from memory
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        if article_count != -1 and article_count == 0:
            break

    del context

In [13]:

#parse enwiki to lightdump
file_to_parse = "../data/raw/extracted/enwiki-20200101-0"
lightdump_filename = "lightdump.txt"
temp_dir = "../data/temp"
articles = []

parse_enwiki_to_lightdump(file_to_parse , lightdump_filename, temp_dir, articles)

This subject covers

* AssistiveTechnology

* AccessibleSoftware

* AccessibleWeb

* LegalIssuesInAccessibleComputing


#REDIRECT [[Accessible Computing]]

#REDIRECT [[Accessible_computing]]
#REDIRECT [[Computer accessibility]]
#REDIRECT [[Computer accessibility]] {{R from CamelCase}}
#REDIRECT Computer accessibility {{R from CamelCase}}
#REDIRECT [[Computer accessibility]] {{R from CamelCase}}
- <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.4/ http://www.mediawiki.org/xml/export-0.4.xsd" version="0.4" xml:lang="en">
- <siteinfo>
  <sitename>Wikipedia</sitename> 
  <base>http://en.wikipedia.org/wiki/Main_Page</base> 
  <generator>MediaWiki 1.16wmf4</generator> 
  <case>first-letter</case> 
- <namespaces>
  <namespace key="-2" case="first-letter">Media</namespace> 
  <namespace key="-1" case="first-letter">Special</namespace> 
  <namespace key="0" case="first-let

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Anarchism 18885 revisions to lightdump.txt
#REDIRECT [[Afghanistan/History]]

#REDIRECT [[History of Afghanisatan]]
AfGhanistan was invaded and occupied by the [[Soviet Union]] in 1979. The USSR was forced to withdraw 10 years later by anti-communist mujahidin forces

supplied and trained by the US, Saudi Arabia, Pakistan, and others. Fighting subsequently continued among the various mujahidin factions, but the fundamentalist Islamic

Taliban movement has been able to seize most of the country. In addition to the continuing civil strife, the country suffers from enormous poverty, a crumbling infrastructure,

and widespread live mines. 


#REDIRECT [[History of Afghanistan]]
#REDIRECT [[History of Afghanistan]] {{R from CamelCase}}
#REDIRECT [[History of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AfghanistanHistory 6 revisions to lightdump.txt
Location: Southern Asia, north and west of Pakistan, east of Iran 



Geographic coordinates: 33 00 N, 6

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Autism 10276 revisions to lightdump.txt
In 1990 Albania ended 44 years of xenophobic communist rule and established a multiparty democracy. The transition

has proven difficult as corrupt governments have tried to deal with severe unemployment, the collapse of a fraudulent nationwide

investment scheme, widespread gangsterism, and massive refugee influxes from neighboring Kosovo. 


In 1990 AlbaniA ended 44 years of xenophobic CommunisT rule and established a multiparty DemocracY. The transition has proven difficult as corrupt governments have tried to deal with severe unemployment, the collapse of a fraudulent nationwide investment scheme, widespread gangsterism, and massive refugee influxes from neighboring KosovO. 


#REDIRECT [[Albania/History]]

#REDIRECT [[History of Albania]]
#REDIRECT [[History of Albania]] {{R from CamelCase}}
#REDIRECT [[History of Albania]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AlbaniaHistory 6 revisions to lightdump.txt
Popul

Economy - overview: An extremely poor country by European standards, Albania is making the difficult transition to a more

open-market economy. The economy rebounded in 1993-95 after a severe depression accompanying the collapse of the previous

centrally planned system in 1990 and 1991. However, a weakening of government resolve to maintain stabilization policies in the

election year of 1996 contributed to renewal of inflationary pressures, spurred by the budget deficit which exceeded 12%. The collapse

of financial pyramid schemes in early 1997 - which had attracted deposits from a substantial portion of Albania's population -

triggered severe social unrest which led to more than 1,500 deaths, widespread destruction of property, and an 8% drop in GDP. The

new government, installed in July 1997, has taken strong measures to restore public order and to revive economic activity and trade.

The economy continues to be bolstered by remittances of some 20% of the labor force that works 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Albedo 1182 revisions to lightdump.txt
Languages widespread throughout north africa and southwest asia.  Of particular note are Hebrew and Arabic, which have grown through their attachments to the major world religions of Judaism and IsLam.  The following language subfamilies are included:

* SemiticLanguages

* EgyptianLanguages

* BerberLanguages

* CushiticLanguages

* ChadicLanguages


#REDIRECT [[Afro-asiatic languages]]

#REDIRECT [[Afro-Asiatic languages]]
#REDIRECT [[Afro-Asiatic languages]] {{R from CamelCase}}
#REDIRECT [[Afroasiatic languages]] {{R from CamelCase}}
#REDIRECT [[Afroasiatic languages]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AfroAsiaticLanguages 6 revisions to lightdump.txt
#REDIRECT [[Artificial language]]

Man, you'd think someone would catch my typos!  Page moved to [[Artificial language]]. :(


#REDIRECT [[Constructed language]]

#REDIRECT [[Constructed language]] {{R from misspelling}}
#REDIRECT [[Constructed language]] {{R f

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing A 4710 revisions to lightdump.txt
This is where TimShell or some other person knowledgeable about the ideas of DavidFriedman and others like him should write about their ideas.



The discussion should be encyclopedic, I hope, and not polemic.




This is where TimShell or some other person knowledgeable about the ideas of DavidFriedman and others like him should write about their ideas.



The discussion should be encyclopedic, I hope, and not polemic.



/Talk


#REDIRECT [[Anarcho-capitalism]]

#REDIRECT [[Anarcho-capitalism]]{{R from CamelCase}}
#REDIRECT [[Anarcho-capitalism]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AnarchoCapitalism 5 revisions to lightdump.txt
Those who advocate AnarchoCapitalism.


#REDIRECT [[Anarcho-capitalists]]

#REDIRECT [[anarcho-capitalism]]
#REDIRECT [[anarcho-capitalism]]{{R from CamelCase}}
#REDIRECT [[anarcho-capitalism]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AnarchoCapitalists 5 revisions to ligh

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Alabama 9311 revisions to lightdump.txt
http://www.cnn.com/TRAVEL/CITY.GUIDES/regional/africa/africa.jpg



'''WesternAfrica'''



[[Benin]]



BurkinaFaso



[[Cameroon]]



[[Ghana]]



[[Mali]]



[[Nigeria]]



[[Senegal]]



[[Togo]]



'''CentralAfrica'''



'''SouthernAfrica'''



----



[http://flag.blackened.net/revolt/africa/accounts/chekov.html An Irish anarchist in Africa] provides a readable and compelling introduction to today's western Africa.


#REDIRECT [[Africa]]

#REDIRECT [[Africa]] {{R from CamelCase}}
Africa
#REDIRECT [[Africa]] {{R from CamelCase}}
#REDIRECT [[Ten Little Niggers]]
#REDIRECT [[Africa]] {{R from CamelCase}}
#REDIRECT [[Africa]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AfricA 8 revisions to lightdump.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Achilles 7133 revisions to lightdump.txt
#REDIRECT [[Statistics/Applied]]


#REDIRECT [[Applied statistics]]

#REDIRECT [[Statistics]]
#REDIRECT [[Statistics]]{{R from CamelCase}}
#REDIRECT [[Statistics]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AppliedStatistics 5 revisions to lightdump.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Academy_Award_for_Best_Production_Design 1197 revisions to lightdump.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Academy_Awards 8002 revisions to lightdump.txt
#REDIRECT [[Action film]]

#REDIRECT [[Action movie]]

#REDIRECT [[Action movies]]

*[[Heat Film]]

*[[The Rock]]

*[[Con Air]]

*[[The Matrix]]

*[[True Lies]]

*[[Die Hard]]

*[[Terminator]]

*[[Rambo]]


Action [[film]]s usually involve a fairly straightforward story of good guys versus bad guys, where most disputes are resolved by using physical force.  Who exactly the good guys are differs from film to film, but usually they are patriotic and rather conservative Americans, whereas the bad guys are usually either criminals or agents of foreign powers.  In the 1980s and before, they were very often [[communist]]s, which brings some action films fairly close to [[Propaganda Film]]s. 



Action films also constitute very good examples for [[Feminist Film Theory]], because in them, the separation between the physical male who controls the scene and the look and the female, who is almost always the object of the look is very clear. 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



'''International Atomic Time''' ('''TAI''', from the French name '''{{lang|fr|''Temps Atomique International''}}'''<ref>Temps atomique 1975</ref>) is a high-precision atomic [[coordinate time|coordinate]] [[time standard]] based on the notional passage of [[proper time]] on [[Earth]]'s [[geoid]].<ref>{{Cite web|url=http://adsabs.harvard.edu/cgi-bin/nph-bib_query?bibcode=1986CeMec..38..155G |title=Is the International Atomic Time TAI a coordinate time or a proper time? |publisher=Adsabs.harvard.edu |date= |accessdate=8 May 2013}}</ref> It is the basis for [[Coordinated Universal Time]] (UTC), which is used for civil timekeeping all over the Earth's surface, and for [[Terrestrial Time]], which is used for astronomical calculations. {{as of|2015|6|30}} when another [[leap second]] was added,<ref name="Bulletin C 49">{{cite web|url=http://hpiers.obspm.fr/eoppc/bul/bulc/bulletinc.49|first=Danie|last=Gambis|title=Bulletin C 49|publisher=[[IERS]]|location=Paris|date=5 January 2015|accessdate=

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Writing Altruism 2238 revisions to lightdump.txt
:[[NASCAR]]

:[[Formula One]]

:[[Rallies]]: -- [[Monte Carlo Rally]] -- [[Paris-Dakkar]]


:[[NASCAR]]

:[[Formula One]]

:[[Rallies]]: -- [[Monte Carlo Rally]] -- [[Paris-Dakkar]]

:[[Touring Cars]]: -- [[V8 Supercars Australia]] -- [[BTCC]] -- [[DTM]] -- [[STCC]]

:[[CART]]


#REDIRECT [[Auto racing]]

#REDIRECT [[Auto racing]]{{R from CamelCase}}
#REDIRECT [[Auto racing]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}
Writing AutoRacing 5 revisions to lightdump.txt


KeyboardInterrupt: 