In [12]:
import requests
from bs4 import BeautifulSoup as bs
import re
from functools import lru_cache
import json

In [4]:
cache = {}
def hashable_cache(f):
    def inner(url, session):
        if url not in cache:
            cache[url] = get_first_paragraph(url, session)
        return cache[url]
    return inner

In [5]:
#@hashable_cache
@lru_cache(maxsize = None)
def get_first_paragraph(wikipedia_url, session_param):
#     print(wikipedia_url)
    req= session_param.get(wikipedia_url) # requests changed by session_param
    content = req.text
    soup = bs(content, 'html')
    
    #remove all the text link
    for a in soup.findAll('a', href=True):
        a.extract()
        
    paragraphs = soup.find_all('p')
    first_paragraph_index = 0
    i = 0
    for paragraph in soup.find_all("p"):   
        if paragraph.find('b') != None:
            first_paragraph_index = i            
            break
        i+=1
    
    first_paragraph = paragraphs[first_paragraph_index].text
    
    if wikipedia_url.startswith('https://en.'): #to reduce conflict with other language characters
        first_paragraph = re.sub(r'[();{}[\]]+', "", first_paragraph)
        sanitized_paragraph = ' '.join(first_paragraph.strip().split())
    else:
        sanitized_paragraph = first_paragraph
   
    return sanitized_paragraph

In [6]:
def get_leaders():
    global countries
    
    root_url = "https://country-leaders.herokuapp.com"
    cookie_url = root_url + "/cookie"
    country_url = root_url + "/countries"
    leaders_url = root_url + "/leaders"

    req_cookies = requests.get(cookie_url)
    cookies=req_cookies.cookies
   
    req_countries = requests.get(country_url, cookies = cookies)
    countries = req_countries.text
#     print(countries)
    
    countries = countries.strip('[, ]')
    countries = countries.split(",")    
    
    session = requests.Session()
    leaders_per_country = {}
    for country in countries:
        country = country.replace('\"', "")
        param = {'country': country}
        
        req_leaders = requests.get(leaders_url, cookies =cookies, params = param)
        
        if req_leaders.status_code == 403:
            cookies=req_cookies.cookies
            req_leaders = requests.get(leaders_url, cookies =cookies, params = param)
            
        content = req_leaders.text    
        content = content.strip('[, ]')
        list_leaders_currentcountry = content.split('}')
#         print(".......... \nleaders info started here \n")
        clean_leader_info_percountry = []
        for leader_info in list_leaders_currentcountry:
            leader_info_clean = leader_info.strip('{, }')
            list_leader_info = leader_info_clean.split(',')
            leader_fname = ""
            leader_lname = ""
            wiki_url = ""
            
            leader_info_dict={}
            
            for info in list_leader_info:                
                if 'wikipedia' in info:
                    wiki_split = info.split(':')
                    wiki_url = (wiki_split[1] + ":" + wiki_split[2]).replace("\"", "")
                elif "first_name" in info:
                    leader_fname = info.split(':')[1].replace("\"", "")
                    
                elif "last_name" in info:
                    leader_lname = info.split(':')[1].replace("\"", "")
                    
                
                if leader_fname == "" or leader_lname == "" or wiki_url == "":
                    continue
                else:
                    break #has to break out from this loop because no neet to travel to all information
            
            #break
            try: 
                first_paragraph = get_first_paragraph(wiki_url, session)  
            except:
                first_paragraph = "first paragraph could not be extracted. Either link not found or link has error"
                  
            print(leader_fname + " " + leader_lname + ",  " + wiki_url )
            print(first_paragraph)
            leader_info_dict['first_name'] = leader_fname
            leader_info_dict['last_name'] = leader_lname
            leader_info_dict['wikipedia_url'] = wiki_url
            leader_info_dict['first_paragraph'] = first_paragraph
            
            clean_leader_info_percountry.append(leader_info_dict) #list of leaders info for current country under the loop
        leaders_per_country[country] = clean_leader_info_percountry
    return leaders_per_country     



In [7]:
# %%time
leaders_per_country = get_leaders()

George Washington,  https://en.wikipedia.org/wiki/George_Washington
George Washington February 22, 1732 – December 14, 1799 was an American military officer, statesman, and who served as the first from 1789 to 1797. Appointed by the as commander of the , Washington led the forces to victory in the and served as the president of the of 1787, which created the and the American federal government. Washington has been called the "" for his manifold leadership in the formative days of the country.
Barack Obama,  https://en.wikipedia.org/wiki/Barack_Obama
Barack Hussein Obama II born August 4, 1961 is an American politician who served as the 44th from 2009 to 2017. A member of the , he was the first African-American president of the United States. Obama previously served as a from from 2005 to 2008 and as an from 1997 to 2004.
Abraham Lincoln,  https://en.wikipedia.org/wiki/Abraham_Lincoln
Abraham Lincoln February 12, 1809 – April 15, 1865 was an American lawyer and statesman who served as t

James Monroe,  https://en.wikipedia.org/wiki/James_Monroe
James Monroe April 28, 1758 – July 4, 1831 was an American statesman, lawyer, diplomat and who served as the fifth from 1817 to 1825. A member of the , Monroe was the last president of the and the coincided with the , concluding the era of American politics. He is perhaps best known for issuing the , a policy of opposing European colonialism in the while effectively asserting U.S. dominance, empire, and hegemony in the hemisphere. He also served as governor of , a member of the , U.S. ambassador to and , the seventh , and the eighth .
John Adams,  https://en.wikipedia.org/wiki/John_Quincy_Adams
John Quincy Adams July 11, 1767 – February 23, 1848 was an American statesman, diplomat, lawyer, and who served as the sixth , from 1825 to 1829. He previously served as the eighth from 1817 to 1825. During his long diplomatic and political career, Adams also served as an ambassador, and as a member of the representing in both chambers. H

Ulysses Grant,  https://en.wikipedia.org/wiki/Ulysses_S._Grant
Ulysses S. Grant born Hiram Ulysses Grant April 27, 1822 – July 23, 1885 was an American military officer and politician who served as the 18th from 1869 to 1877. As , he led the to victory in the in 1865 and thereafter briefly served as . Later, as president, Grant was an effective civil rights executive who signed the bill that created the and worked with to protect during .
William McKinley,  https://en.wikipedia.org/wiki/William_McKinley
William McKinley January 29, 1843 – September 14, 1901 was the 25th , serving from 1897 until . He was president during the of 1898, raised to boost American industry, and rejected the expansionary of , keeping the nation on the .
Stephen Cleveland,  https://en.wikipedia.org/wiki/Grover_Cleveland
Stephen Grover Cleveland March 18, 1837 – June 24, 1908 was an American lawyer and politician who served as the 22nd and 24th from 1885 to 1889 and from 1893 to 1897. Cleveland is the only pres

Georges Theunis,  https://nl.wikipedia.org/wiki/Georges_Theunis
Georges Emile Pierre Léonard Theunis (,   – ,  ) was een   van de .

Aloïs Van de Vyvere,  https://nl.wikipedia.org/wiki/Aloys_Van_de_Vyvere
Aloys Jean Maria Joseph Van de Vyvere (,   - ,  ) was een    en .

Prosper Poullet,  https://nl.wikipedia.org/wiki/Prosper_Poullet
Prosper Antoine Marie Joseph burggraaf Poullet (,   -  ) was een  .

Henri Jaspar,  https://nl.wikipedia.org/wiki/Henri_Jaspar
Henri Jaspar (,   - ,  ) was een   , die twee keer premier van België was.

Jules None,  https://nl.wikipedia.org/wiki/Jules_Renkin
Jules Laurent Jean Louis Renkin (,   - ,  ) was een   .

Paul Van Zeeland,  https://nl.wikipedia.org/wiki/Paul_van_Zeeland
Paul Guillaume van Zeeland (,   – ,  ) was een  advocaat, econoom, katholiek politicus en staatsman.

Achille Van Acker,  https://nl.wikipedia.org/wiki/Achiel_Van_Acker
Achille Honoré (Achiel) Van Acker   ( / ) (,   – aldaar,  ) was een   politicus. Hij was viermaal .

Camille Huys

Sophie Wilmès,  https://nl.wikipedia.org/wiki/Sophie_Wilm%C3%A8s
Sophie Wilmès (,  ) is een   van de . Van 27 oktober 2019 tot 1 oktober 2020 was ze de eerste vrouwelijke  van België. Sinds 1 oktober 2020 is ze vicepremier en minister van Buitenlandse Zaken en Buitenlandse Handel. Om familiale redenen legde ze op 21 april 2022 haar bevoegdheden als minister tijdelijk neer. Op 15 juli 2022 legde ze haar functie definitief neer.

 ,  
first paragraph could not be extracted. Either link not found or link has error
 ,  
first paragraph could not be extracted. Either link not found or link has error
 ,  
first paragraph could not be extracted. Either link not found or link has error
 ,  
first paragraph could not be extracted. Either link not found or link has error
 ,  
first paragraph could not be extracted. Either link not found or link has error


## Saving the information created



Make a function `save()` to call this code easily.

In [13]:
def save():
    for country in countries:
        try:
            country = country.replace('\"', "")
            file_name = "C:/BeCode/LocalRepos/Wikipedea_Scrapper/" + country + "_" + "leaders.json"
            json_file = open(file_name, 'w')
            json_file.write(json.dumps(leaders_per_country.get(country)))
            json_file.close()
        except IOError:
            print("cant write the file content in the country: " + country)
        else:
            print("file successfully written")
            
    

In [14]:
def read_leaders_info(country='us'):
    try:
        file_name = "C:/BeCode/LocalRepos/Wikipedea_Scrapper/" + country + "_" + "leaders.json"
        file_json = open(file_name, 'r')
        data = json.load(file_json)
        file_json.close()
    except IOError:
        print("problem with reading file, check if it exists")
    else:
        return data
    

In [15]:
save()

file successfully written
file successfully written
file successfully written
file successfully written
file successfully written


In [16]:
read_leaders_info()

[{'first_name': 'George',
  'last_name': 'Washington',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/George_Washington',
  'first_paragraph': 'George Washington February 22, 1732 – December 14, 1799 was an American military officer, statesman, and who served as the first from 1789 to 1797. Appointed by the as commander of the , Washington led the forces to victory in the and served as the president of the of 1787, which created the and the American federal government. Washington has been called the "" for his manifold leadership in the formative days of the country.'},
 {'first_name': 'Barack',
  'last_name': 'Obama',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'first_paragraph': 'Barack Hussein Obama II born August 4, 1961 is an American politician who served as the 44th from 2009 to 2017. A member of the , he was the first African-American president of the United States. Obama previously served as a from from 2005 to 2008 and as an from 1997 to 2004.'},
 {'