# 20. Read JSON documents

In [1]:
# grep -i '"title": "United Kingdom"' data/enwiki-country.json > data/enwiki-united-kingdom.json

# sed 's/\\n/\n/g' data/enwiki-united-kingdom.json > data/uk-prettier.txt

_FILE_PATH = "data/enwiki-united-kingdom.json"

def _read_data(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

_uk_content = []
with open(_FILE_PATH) as _input_file:
    for data in _read_data(_input_file):
        _uk_content.append(data)

_uk_text = ''.join(_uk_content)

# 21. Lines with category names

In [2]:
import re
        
def _find_expressions(pattern: str, text: str = _uk_text) -> None:
    return re.finditer(pattern, text)
                
def _display_expressions(expressions: list) -> None:
    for expression in expressions:
        print(expression.group(0))

In [3]:
_CATEGORY_PATTERN = r"\[\[Category:.*?\]\]"
_categories = _find_expressions(_CATEGORY_PATTERN)

# 22. Category names

In [4]:
_category_names = []

for category in _categories:
    category_right_part = category.group(0).partition("Category:")[2]
    category_name = category_right_part.partition("]")[0]
    _category_names.append(category_name)
    
for name in _category_names:
    print(name)

United Kingdom| 
British Islands
Countries in Europe
English-speaking countries and territories
G7 nations
Group of Eight nations
G20 nations
Island countries
Northern European countries
Former member states of the European Union
Member states of NATO
Member states of the Commonwealth of Nations
Member states of the Council of Europe
Member states of the Union for the Mediterranean
Member states of the United Nations
Priority articles for attention after Brexit
Western European countries


# 23. Section structure 

In [5]:
_SECTION_PATTERN = r"={2,6}.*?={2,6}"
_sections = _find_expressions(_SECTION_PATTERN)

for section in _sections:
    section = re.subn(r"=", "", section.group(0))
    name = section[0]
    level = int(section[1]/2)-1
    print(f"Section: {name}\tLevel: {level}")

Section: Etymology and terminology	Level: 1
Section: History	Level: 1
Section: Background	Level: 2
Section: Treaty of Union	Level: 2
Section: From the union with Ireland to the end of the First World War	Level: 2
Section: Between the World Wars	Level: 2
Section: Since the Second World War	Level: 2
Section: Geography	Level: 1
Section: Climate	Level: 2
Section: Administrative divisions	Level: 2
Section: Dependencies	Level: 1
Section: Politics	Level: 1
Section: Government	Level: 2
Section: Devolved administrations	Level: 2
Section: Law and criminal justice	Level: 2
Section: Foreign relations	Level: 2
Section: Military	Level: 2
Section: Economy	Level: 1
Section: Overview	Level: 2
Section: Science and technology	Level: 2
Section: Transport	Level: 2
Section: Energy	Level: 2
Section: Water supply and sanitation	Level: 2
Section: Demographics	Level: 1
Section: Ethnic groups	Level: 2
Section: Languages	Level: 2
Section: Religion	Level: 2
Section: Migration	Level: 2
Section: Education	Level: 2
S

# 24. Media references

In [6]:
_MEDIA_REF_PATTERN = r"\[\[File\:.*?\]\]" 
_media_refs = _find_expressions(_MEDIA_REF_PATTERN)
_display_expressions(_media_refs)

[[File:Royal Coat of Arms of the United Kingdom.svg|x100px]]
[[File:Royal Coat of Arms of the United Kingdom (Scotland).svg|x100px]]
[[File:United States Navy Band - God Save the Queen.ogg]]
[[File:Europe-UK (orthographic projection).svg|frameless]]
[[File:Europe-UK.svg|upright=1.15|frameless]]
[[File:United Kingdom (+overseas territories and crown dependencies) in the World (+Antarctica claims).svg|frameless|upright=1.15]]
[[File:Stonehenge, Condado de Wiltshire, Inglaterra, 2014-08-12, DD 18.JPG|thumb|[[Stonehenge]]
[[File:Bayeux Tapestry WillelmDux.jpg|thumb|left|The [[Bayeux Tapestry]]
[[File:State House- 1620 - St Geo - Bermuda.jpg|thumb|The [[State House, Bermuda|State House]]
[[File:Treaty of Union.jpg|thumb|The [[Treaty of Union]]
[[File:Royal Irish Rifles ration party Somme July 1916.jpg|thumb|left|alt=Black-and-white photo of two dozen men in military uniforms and metal helmets sitting or standing in a muddy trench.|Infantry of the [[Royal Ulster Rifles|Royal Irish Rifles]]
[

# 25. Infobox

In [20]:
_INFOBOX_PATTERN = r"\{\{Infobox country(.*?)\}\}\\n\\n"
_infobox = _find_expressions(_INFOBOX_PATTERN)
_infobox_dict = {}

'''
    Create a dictionnary from the infobox "A = B" with the following regex expression:
        (1) : B = "<!--C-->*{{Y\n Z}}\n"
            OR
        (2) : B = "Y\n" 
    The final \n is not consumed.
'''
_REGEX = r"\\n\|\s(.*?)\s*=\s(((\<\!--.*?--\>)*\{\{.*?\\n .*?\}\})(?=\\n)|(.*?)(?=\\n))"
#BUG : if there is a single line containing {{Y}}, the text read the next lines until (1) is satisfied

for info in _infobox:
    fields = _find_expressions(_REGEX, info.group(0))
    for field in fields:
        _infobox_dict[field.group(1)] = field.group(2)

print(len(_infobox_dict))
for info in _infobox_dict:
    print(f"{info}\t:\t{_infobox_dict[info]}\n")

88
common_name	:	United Kingdom

linking_name	:	the United Kingdom<!--Note: \"the\" required here as this entry used to create wikilinks-->

conventional_long_name	:	United Kingdom of Great Britain and Northern Ireland

image_flag	:	Flag of the United Kingdom.svg

alt_flag	:	A flag featuring both cross and saltire in red, white and blue

other_symbol	:	[[File:Royal Coat of Arms of the United Kingdom.svg|x100px]][[File:Royal Coat of Arms of the United Kingdom (Scotland).svg|x100px]]

other_symbol_type	:	[[Royal coat of arms of the United Kingdom|Royal coats of arms]]:{{#tag:ref |The coat of arms on the left is used in England, Northern Ireland, and Wales; the version on the right is used in Scotland|group=note}}

national_anthem	:	\"[[God Save the Queen]]\"{{#tag:ref |There is no authorised version of the national anthem as the words are a matter of tradition; only the first verse is usually sung.<ref>{{cite web |title=National Anthem |url=https://www.royal.uk/national-anthem |website=O

# 26. Remove emphasis markups

In [21]:
for info in _infobox_dict:
    _infobox_dict[info] = re.sub(r"\'{2,5}(.*?)\'{2,5}", r"\1", _infobox_dict[info])
    print(f"{info}\t:\t{_infobox_dict[info]}\n")

common_name	:	United Kingdom

linking_name	:	the United Kingdom<!--Note: \"the\" required here as this entry used to create wikilinks-->

conventional_long_name	:	United Kingdom of Great Britain and Northern Ireland

image_flag	:	Flag of the United Kingdom.svg

alt_flag	:	A flag featuring both cross and saltire in red, white and blue

other_symbol	:	[[File:Royal Coat of Arms of the United Kingdom.svg|x100px]][[File:Royal Coat of Arms of the United Kingdom (Scotland).svg|x100px]]

other_symbol_type	:	[[Royal coat of arms of the United Kingdom|Royal coats of arms]]:{{#tag:ref |The coat of arms on the left is used in England, Northern Ireland, and Wales; the version on the right is used in Scotland|group=note}}

national_anthem	:	\"[[God Save the Queen]]\"{{#tag:ref |There is no authorised version of the national anthem as the words are a matter of tradition; only the first verse is usually sung.<ref>{{cite web |title=National Anthem |url=https://www.royal.uk/national-anthem |website=Offi

# 27. Remove internal links

In [22]:
for info in _infobox_dict:
    _infobox_dict[info] = re.sub(r"\[\[(.*?)\]\]", r"\1", _infobox_dict[info])
    print(f"{info}\t:\t{_infobox_dict[info]}\n")

common_name	:	United Kingdom

linking_name	:	the United Kingdom<!--Note: \"the\" required here as this entry used to create wikilinks-->

conventional_long_name	:	United Kingdom of Great Britain and Northern Ireland

image_flag	:	Flag of the United Kingdom.svg

alt_flag	:	A flag featuring both cross and saltire in red, white and blue

other_symbol	:	File:Royal Coat of Arms of the United Kingdom.svg|x100pxFile:Royal Coat of Arms of the United Kingdom (Scotland).svg|x100px

other_symbol_type	:	Royal coat of arms of the United Kingdom|Royal coats of arms:{{#tag:ref |The coat of arms on the left is used in England, Northern Ireland, and Wales; the version on the right is used in Scotland|group=note}}

national_anthem	:	\"God Save the Queen\"{{#tag:ref |There is no authorised version of the national anthem as the words are a matter of tradition; only the first verse is usually sung.<ref>{{cite web |title=National Anthem |url=https://www.royal.uk/national-anthem |website=Official web site of

# 28. Remove MediaWiki markups

In [None]:
def _remove_comments(text: str) -> str:
    return re.sub(r"\<\!\-\-.*?\-\-\>", "", text)

def _remove_website_links(text: str) -> str:
    return re.sub(r"\[(?:https?\:\/\/)(.*?)\]", r"\1", text)

def _remove_references(text: str) -> str:
    text = _remove_2markup(text, "ref.*?")
    text = _remove_1markup(text, "ref.*?")
    text = re.sub(r"\#REDIRECT", "", text)
    return re.sub(r"References\: \{\{reflist\}\}", "", text)

def _remove_variable(text: str) -> str:
    text = re.sub(r"\{\{[A-Z]+\}\}", "", text)
    return re.sub(r"\{\{(localurl|fullurl|ns|rp)\:.*?\}\}", "", text)
    
def _remove_citations(text: str) -> str:
    return re.sub(r"\{\{(cite|quote)(.*?)\}\}", r"\1", text)

def _remove_signature(text: str) -> str:
    return re.sub(r"\~\~\~\~", "", text)
    
def _remove_outdent_convert_about(text: str) -> str:
    return re.sub(r"\{\{(Outdent|convert|About)|.*?\}\}", "", text)

def _remove_2markup(text: str, MARK: str) -> str:
    return re.sub(r"\<" + re.escape(MARK) + "\>(.*?)\<\/" + re.escape(MARK) +"\>", r"\1", text)

def _remove_1markup(text: str, MARK: str) -> str:
    return re.sub(r"\<"+ re.escape(MARK) +"\/\>", "", text) 
    
for info in _infobox_dict:
    text = _infobox_dict[info]
    
    text = _remove_comments(text)
    text = _remove_website_links(text)
    text = _remove_references(text)
    text = _remove_variable(text)
    text = _remove_signature(text)
    text = _remove_outdent_convert_about(text)

    text = _remove_2markup(text, "s .*?")
    text = _remove_2markup(text, "u .*?")
    text = _remove_2markup(text, "blockquote")
    text = _remove_2markup(text, "p")
    text = _remove_1markup(text, "br")  
    
    _infobox_dict[info] = text
    print(f"{info}\t:\t{_infobox_dict[info]}\n")

# 29. Country flag

In [28]:
import requests

_URL = "https://en.wikipedia.org/w/api.php"

_PARAMS = {
    "action": "query",
    "format": "json",
    "prop": "imageinfo",
    'iiprop' : 'url',
    "titles": "File:"+_infobox_dict["image_flag"]
}

_S = requests.Session()
_R = _S.get(url=_URL, params=_PARAMS)
_DATA = _R.json()

_PAGES = _DATA["query"]["pages"]

for value in _PAGES.values():
    print(value["imageinfo"][0]["url"])

https://upload.wikimedia.org/wikipedia/en/a/ae/Flag_of_the_United_Kingdom.svg
