In [29]:
import requests
from bs4 import BeautifulSoup as bs
import re
from functools import lru_cache

In [33]:
cache = {}
def hashable_cache(f):
    def inner(url, session):
        if url not in cache:
            cache[url] = f(url, session)
        return cache[url]
    return inner

In [37]:
#@hashable_cache
@lru_cache(maxsize = None)
def get_first_paragraph(wikipedia_url, session_param):
#     print(wikipedia_url)
    req= session_param.get(wikipedia_url) # requests changed by session_param
    content = req.text
    soup = bs(content, 'html')
    
    #remove all the text link
    for a in soup.findAll('a', href=True):
        a.extract()
        
    paragraphs = soup.find_all('p')
    first_paragraph_index = 0
    i = 0
    for paragraph in soup.find_all("p"):   
        if paragraph.find('b') != None:
            first_paragraph_index = i            
            break
        i+=1
    
    first_paragraph = paragraphs[first_paragraph_index].text
    
    if wikipedia_url.starts_with('https://en.'): #to reduce conflict with other language characters
        first_paragraph = re.sub(r'[();{}[\]]+', "", first_paragraph)
        sanitized_paragraph = ' '.join(first_paragraph.strip().split())
        sanitized_paragraph = re.sub(r'II', 'II,',sanitized_paragraph)
    else:
        sanitized_paragraph = first_paragraph
   
    return sanitized_paragraph

In [35]:
def get_leaders():
    global countries
    
    root_url = "https://country-leaders.herokuapp.com"
    cookie_url = root_url + "/cookie"
    country_url = root_url + "/countries"
    leaders_url = root_url + "/leaders"

    req_cookies = requests.get(cookie_url)
    cookies=req_cookies.cookies
   
    req_countries = requests.get(country_url, cookies = cookies)
    countries = req_countries.text
#     print(countries)
    
    countries = countries.strip('[, ]')
    countries = countries.split(",")    
    
    session = requests.Session()
    leaders_per_country = {}
    for country in countries:
        country = country.replace('\"', "")
        param = {'country': country}
        
        req_leaders = requests.get(leaders_url, cookies =cookies, params = param)
        
        if req_leaders.status_code == 403:
            cookies=req_cookies.cookies
            req_leaders = requests.get(leaders_url, cookies =cookies, params = param)
            
        content = req_leaders.text    
        content = content.strip('[, ]')
        list_leaders_currentcountry = content.split('}')
#         print(".......... \nleaders info started here \n")
        clean_leader_info_percountry = []
        for leader_info in list_leaders_currentcountry:
            leader_info_clean = leader_info.strip('{, }')
            list_leader_info = leader_info_clean.split(',')
            leader_fname = ""
            leader_lname = ""
            wiki_url = ""
            
            leader_info_dict={}
            
            for info in list_leader_info:                
                if 'wikipedia' in info:
                    wiki_split = info.split(':')
                    wiki_url = (wiki_split[1] + ":" + wiki_split[2]).replace("\"", "")
                elif "first_name" in info:
                    leader_fname = info.split(':')[1].replace("\"", "")
                    
                elif "last_name" in info:
                    leader_lname = info.split(':')[1].replace("\"", "")
                    
                
                if leader_fname == "" or leader_lname == "" or wiki_url == "":
                    continue
                else:
                    break #has to break out from this loop because no neet to travel to all information
            
            #break
            try: 
                first_paragraph = get_first_paragraph(wiki_url, session)  
            except:
                first_paragraph = "first paragraph could not be extracted. Either link not found or link has error"
                  
            print(leader_fname + " " + leader_lname + ",  " + wiki_url )
            print(first_paragraph)
            leader_info_dict['first_name'] = leader_fname
            leader_info_dict['last_name'] = leader_lname
            leader_info_dict['wikipedia_url'] = wiki_url
            leader_info_dict['first_paragraph'] = first_paragraph
            
            clean_leader_info_percountry.append(leader_info_dict) #list of leaders info for current country under the loop
        leaders_per_country[country] = clean_leader_info_percountry
    return leaders_per_country     



In [36]:
# %%time
leaders_per_country = get_leaders()

Vladimir Putin,  https://ru.wikipedia.org/wiki/%D0%9F%D1%83%D1%82%D0%B8%D0%BD
Влади́мир Влади́мирович Пу́тин род. , , — государственный, и деятель. Действующий , председатель и с 7 мая 2012 года. Ранее занимал должность президента с 7 мая 2000 года по 7 мая 2008 года, также в 1999—2000 и 2008—2012 годах занимал должность . 1997.
Dmitry Medvedev,  https://ru.wikipedia.org/wiki/%D0%9C%D0%B5%D0%B4%D0%B2%D0%B5%D0%B4%D0%B5%D0%B2
Медве́дев — русская фамилия, образованная от прозвища «». Известна с как дворянская фамилия .
Boris Yeltsin,  https://ru.wikipedia.org/wiki/%D0%95%D0%BB%D1%8C%D1%86%D0%B8%D0%BD
Бори́с Никола́евич Е́льцин 1931-02-01, , , , — , , — и , государственный и политический деятель, первый всенародно избранный в ноябре 1991 — июне 1992 года одновременно возглавлял . С марта по май 1992 года исполнял обязанности .
 ,  
first paragraph could not be extracted. Either link not found or link has error
François Hollande,  https://fr.wikipedia.org/wiki/Fran%C3%A7ois_Hollande
Françoi

## Saving the information created



In [14]:
import json

Make a function `save()` to call this code easily.

In [15]:
def save():
    for country in countries:
        try:
            country = country.replace('\"', "")
            file_name = "C:/BeCode/LocalRepos/Wikipedea_Scrapper/" + country + "_" + "leaders.json"
            json_file = open(file_name, 'w')
            json_file.write(json.dumps(leaders_per_country.get(country)))
            json_file.close()
        except IOError:
            print("cant write the file content in the country: " + country)
        else:
            print("file successfully written")
            
    

In [16]:
def read_leaders_info(country='us'):
    try:
        file_name = "C:/BeCode/LocalRepos/Wikipedea_Scrapper/" + country + "_" + "leaders.json"
        file_json = open(file_name, 'r')
        data = json.load(file_json)
        file_json.close()
    except IOError:
        print("problem with reading file, check if it is available")
    else:
        return data
    

In [17]:
save()

file successfully written
file successfully written
file successfully written
file successfully written
file successfully written


In [19]:
read_leaders_info()

[{'first_name': 'George',
  'last_name': 'Washington',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/George_Washington',
  'first_paragraph': 'George Washington February 22, 1732 – December 14, 1799 was an American military officer, statesman, and who served as the first from 1789 to 1797. Appointed by the as commander of the , Washington led the forces to victory in the and served as the president of the of 1787, which created the and the American federal government. Washington has been called the "" for his manifold leadership in the formative days of the country.'},
 {'first_name': 'Barack',
  'last_name': 'Obama',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'first_paragraph': 'Barack Hussein Obama II, born August 4, 1961 is an American politician who served as the 44th from 2009 to 2017. A member of the , he was the first African-American president of the United States. Obama previously served as a from from 2005 to 2008 and as an from 1997 to 2004.'},
 {