### Parsing a Text File to Find Wikipedia Links

This section covers the case where birth and death dates are also available, which helps disambiguate people who share similar names.

In [1]:
import re
from collections import Counter

def pre_process_list(filepath: str):
    """Parse a text list of people with lifespans into (name, birth, death) tuples.

    Each row is expected to look like ``Name (1815 – 1852) description``. The
    text before the first ``(`` is taken as the name, and the parenthetical that
    starts there is searched for ``<digits> <dash> <digits>``, where the dash is
    an en dash or a hyphen surrounded by whitespace.

    Args:
        filepath: Path to a UTF-8 encoded text file with one person per line.

    Returns:
        A list of ``(name, birth_year, death_year)`` tuples with integer years.
        Both years are ``-1`` when no lifespan is recognized in the
        parenthetical. Rows that contain no ``(`` or no ``)`` are printed with a
        ``MISSED`` prefix and left out of the result.
    """
    # Decode explicitly: the lifespan separator is frequently a non-ASCII en
    # dash, so the platform default encoding must not decide how it is read.
    with open(filepath, encoding="utf-8") as f:
        rows = f.readlines()

    life_pattern = re.compile(r"(\d+)\s+(–|-)\s+(\d+)")
    people = []
    for row in rows:
        if '(' not in row or ')' not in row:
            print("MISSED", row)
            continue

        bracket_open_index = row.index('(')
        # Search for the closing bracket *after* the opening one. Taking the
        # first ')' anywhere in the row produced an empty lifespan slice (and
        # lost the dates) whenever a stray ')' preceded the '('.
        bracket_closed_index = row.find(')', bracket_open_index)
        if bracket_closed_index == -1:
            # The only ')' sits before the '(': keep the row and scan whatever
            # follows the opening bracket.
            lifespan = row[bracket_open_index:]
        else:
            lifespan = row[bracket_open_index:bracket_closed_index + 1]
        name = row[:bracket_open_index].strip()

        m = life_pattern.search(lifespan)
        if m:
            birth, death = int(m.group(1)), int(m.group(3))
        else:
            birth, death = -1, -1
        people.append((name, birth, death))

    return people

def pre_process_latinamerica_list(filepath: str):
    """Parse a ``name | country`` text list into (name, -1, -1) tuples.

    The first two ``|``-separated columns of each row are read as the person's
    name and country. This list carries no lifespan information, so both years
    are always ``-1``. A per-country tally is printed, most common first.

    Args:
        filepath: Path to a UTF-8 encoded text file with one person per line.

    Returns:
        A list of ``(name, -1, -1)`` tuples, one per well-formed row. Blank rows
        are skipped silently; non-blank rows without a ``|`` separator are
        printed with a ``MISSED`` prefix and skipped.
    """
    with open(filepath, encoding="utf-8") as f:
        rows = f.readlines()

    people = []
    countries = []
    for row in rows:
        if not row.strip():
            # Blank separator line: nothing to parse.
            continue
        columns = row.split('|')
        if len(columns) < 2:
            # No country column; indexing columns[1] would raise IndexError.
            print("MISSED", row)
            continue
        name = columns[0].strip()
        countries.append(columns[1].strip())
        people.append((name, -1, -1))

    # Plain loop instead of a list comprehension used only for its side effect.
    for country_count in Counter(countries).most_common():
        print(country_count)
    return people


In [15]:
from utils.utils_wiki import get_wikipedia_article, save_wikipedia_page
import time, os
from typing import List, Tuple
from urllib.parse import unquote

def create_ready_filelist(people: List[Tuple], output_list: str, output_dir: str, delay_seconds: float = 5):
    """Look up each person on Wikipedia, save the page, and write a "ready" list.

    For every ``(name, birth, death)`` tuple, ``get_wikipedia_article`` is called
    with the years passed as ``query_restrictions``. When it returns a page, the
    page is saved through ``save_wikipedia_page`` under ``output_dir`` and a
    ``"<title> | <url>"`` line is appended to ``output_list``.

    Args:
        people: Iterable of ``(name, birth_year, death_year)`` tuples.
        output_list: Path of the ready list to (over)write, encoded as UTF-8.
        output_dir: Directory for the saved pages; created if missing.
        delay_seconds: Pause after each lookup, whether or not a page was found.
            Defaults to the previously hard-coded 5 seconds (presumably a
            rate-limit courtesy towards the Wikipedia API — confirm).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    # Page titles are frequently non-ASCII, so do not rely on the platform
    # default encoding when writing them out.
    with open(output_list, "w", encoding="utf-8") as f:
        for name, birth, death in people:
            page = get_wikipedia_article(name, query_restrictions={'birth_year': birth, 'death_year': death})
            if page:
                file_stem = page.title.replace(' ', '_').lower()
                save_wikipedia_page(page, output_path=f"{output_dir}/{file_stem}.txt", include_metadata=True, include_sections=True, include_infobox=True)
                f.write(f"{page.title} | {page.url}\n")
            time.sleep(delay_seconds)

def get_person_name(wiki_link: str):
    """Turn a wiki URL into a readable page name.

    The text after the last ``/`` is percent-decoded and its underscores are
    replaced by spaces. Returns ``None`` when the link contains no ``/``.
    """
    _, separator, last_segment = wiki_link.rpartition("/")
    if not separator:
        # rpartition yields an empty separator when "/" never occurs.
        return None
    return unquote(last_segment).replace("_", " ")

def get_pages_from_ready(filepath: str, output_dir: str, delay_seconds: float = 5):
    """Download and save the Wikipedia pages listed in a "ready" file.

    Each line is expected to look like ``"<title> | <url>"``. The second
    ``|``-separated field is converted to a page name with ``get_person_name``,
    looked up with ``get_wikipedia_article``, and, when a page is returned,
    saved under ``output_dir`` through ``save_wikipedia_page``.

    Args:
        filepath: Path to the UTF-8 encoded ready list.
        output_dir: Directory for the saved pages; created if missing.
        delay_seconds: Pause after each lookup. Defaults to the previously
            hard-coded 5 seconds (presumably a rate-limit courtesy towards the
            Wikipedia API — confirm).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    wiki_titles = []
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            elems = line.split("|")
            if len(elems) < 2:
                # Blank lines are ignored; other separator-less lines are
                # reported. Indexing elems[1] here would raise IndexError.
                if line.strip():
                    print("MISSED", line)
                continue
            person_wiki_name = get_person_name(elems[1].strip())
            if not person_wiki_name:
                # The link had no "/" (None) or ended in "/" (empty name), so
                # there is nothing meaningful to query.
                print("MISSED", line)
                continue
            wiki_titles.append(person_wiki_name)
    for title in wiki_titles:
        page = get_wikipedia_article(title)
        if page:
            file_stem = page.title.replace(' ', '_').lower()
            save_wikipedia_page(page, output_path=f"{output_dir}/{file_stem}.txt", include_metadata=True, include_sections=True, include_infobox=True)
        time.sleep(delay_seconds)

# Dataset runs. The commented-out calls are kept as a record of how the other
# datasets are processed; only the uncommented call at the bottom executes
# when this cell is run.

# women_famous = pre_process_list("resources/15_famous_women.txt")
# create_ready_filelist(women_famous, "resources/15_famous_women.ready.txt", "data/wikipedia/top_women")

# get_pages_from_ready("resources/12_activists_lgbtq.ready.txt", "data/wikipedia/top_lgbtq")

# get_pages_from_ready("resources/50_famous_mexico.ready.txt", "data/wikipedia/top_mexico")

# top_100_people = pre_process_list("resources/top_100_world_most_influential.txt")
# create_ready_filelist(top_100_people, "resources/top_100_world_most_influential.ready.txt", "data/wikipedia/top100")

# Active run: fetch the pages listed in the Latin America ready file.
get_pages_from_ready(
    filepath="resources/100_famous_latinamerica.ready.txt",
    output_dir="data/wikipedia/top_latinamerica",
)

Options: {'Moctezuma I', 'Isabel Moctezuma', 'Moctezuma II'}
Ordered Options Compund Metric: [RankedArticle(wikipage_title='Moctezuma II', queried_name='Moctezuma II', lev_similarity=1.0, token_overlap=1.0, dates_confidence=-1), RankedArticle(wikipage_title='Moctezuma I', queried_name='Moctezuma II', lev_similarity=0.9565217391304348, token_overlap=0.5, dates_confidence=-1), RankedArticle(wikipage_title='Isabel Moctezuma', queried_name='Moctezuma II', lev_similarity=0.6428571428571428, token_overlap=0.5, dates_confidence=-1)]

Retrieving page for Moctezuma II
Wiki Life Data = (1460 - 1520)
Page Chosen! Confidence Score = 1
Options: {'Malinche (volcano)', 'La Malinche', 'La Llorona'}
Ordered Options Compund Metric: [RankedArticle(wikipage_title='La Malinche', queried_name='La Malinche', lev_similarity=1.0, token_overlap=1.0, dates_confidence=-1), RankedArticle(wikipage_title='Malinche (volcano)', queried_name='La Malinche', lev_similarity=0.5517241379310345, token_overlap=0.5, dates_con

KeyboardInterrupt: 

: 

### Parsing a Wikipedia Page with a Table

This section covers the Wikipedia page "List of women explorers and travelers" ([link](https://en.wikipedia.org/wiki/List_of_women_explorers_and_travelers)), where the people are listed in a table.