# Data mining of OECD statistics for the USA and Spain

**Source: https://data.oecd.org/**

In [300]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib3
import ssl
import re

**If the server doesn't support "RFC 5746 secure renegotiation" and the client uses OpenSSL 3, legacy server connections have to be enabled explicitly with a custom SSL context, as in the cell below:**

Source: https://stackoverflow.com/questions/71603314/ssl-error-unsafe-legacy-renegotiation-disabled

In [301]:
class CustomHttpAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter whose connection pool is built with a caller-supplied SSL context."""

    def __init__(self, ssl_context=None, **kwargs):
        """Store ``ssl_context`` and pass the remaining keyword arguments to the base adapter."""
        # NOTE(review): kept ahead of the base initializer, presumably because
        # that initializer triggers init_poolmanager, which reads this attribute.
        self.ssl_context = ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        """Create the urllib3 pool manager, handing it the stored SSL context."""
        pool_options = {
            "num_pools": connections,
            "maxsize": maxsize,
            "block": block,
            "ssl_context": self.ssl_context,
        }
        self.poolmanager = urllib3.poolmanager.PoolManager(**pool_options)


def get_legacy_session():
    """Return a requests session whose 'https://' adapter uses an SSL context with option bit 0x4 set."""
    legacy_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
    # 0x4 is the OP_LEGACY_SERVER_CONNECT option bit.
    legacy_context.options = legacy_context.options | 0x4

    legacy_adapter = CustomHttpAdapter(legacy_context)
    legacy_session = requests.session()
    legacy_session.mount('https://', legacy_adapter)
    return legacy_session

In [302]:
# OECD pages to scrape, one per country
url_usa, url_spain = (
    "https://data.oecd.org/united-states.htm",
    "https://data.oecd.org/spain.htm",
)

In [303]:
# get the html data
# One session serves both requests and is closed when the block ends. A
# timeout keeps a stalled server from hanging the notebook, and a failed
# request raises instead of letting an error page be parsed as data.
with get_legacy_session() as session:
    response_usa = session.get(url_usa, timeout=30)
    response_usa.raise_for_status()
    html_data_usa = response_usa.text

    response_spain = session.get(url_spain, timeout=30)
    response_spain.raise_for_status()
    html_data_spain = response_spain.text

In [304]:
# Parse each downloaded page into a BeautifulSoup tree
soup_usa, soup_spain = (
    BeautifulSoup(markup, 'html')
    for markup in (html_data_usa, html_data_spain)
)

In [408]:
def table_reader(soup_object, country=str):
    """Extract every HTML table of a parsed page into headlines and DataFrames.

    Parameters
    ----------
    soup_object
        Parsed document; only ``find_all``/``find`` lookups and ``.text``
        reads are performed on it.
    country
        Label interpolated into the "Data for ..." and "Details for ..."
        column names.
        NOTE(review): the default is the ``str`` type itself, which looks like
        a misplaced annotation; it is kept so existing calls behave the same.

    Returns
    -------
    tuple
        ``(headline, table)``. ``headline`` is a list holding the text of the
        first span of each header row, in document order. ``table`` is a dict
        mapping the 1-based position of each table element to a DataFrame with
        the columns "Topic", "Data for {country}" and "Details for {country}".
    """
    columns = ["Topic", f"Data for {country}", f"Details for {country}"]
    table = {}
    headline = []

    # Tables are keyed from 1, while headline is an ordinary 0-based list.
    for i, table_i in enumerate(soup_object.find_all('table'), start=1):

        for row in table_i.find("thead").find_all('tr'):
            headline.append(row.find("span").text)
            table[i] = pd.DataFrame(columns=columns)

        for row in table_i.find("tbody").find_all('tr'):
            cells = row.find_all("td")
            topic = cells[0].find("a").find("span").text

            spans = cells[2].find_all('span')
            # A cell without spans carries no value; 0 is the placeholder.
            # The third span is checked on its own so that a cell with only
            # one or two spans keeps its value instead of raising IndexError.
            data = spans[0].text if spans else 0
            details = spans[2].text if len(spans) > 2 else 0

            # Append the row at the next free integer label.
            table[i].loc[len(table[i])] = [topic, data, details]

    return headline, table
   

In [409]:
# Read the headline list and the table dict for each country
headlines_usa, tables_usa = table_reader(soup_usa, country='USA')
headlines_spain, tables_spain = table_reader(soup_spain, country='Spain')

In [438]:
# headlines are a 0-based list, so index 11 is the title of the 12th table
headlines_spain[11]

' Society'

In [436]:
# tables are keyed from 1, so key 12 is the table titled by headlines_spain[11]
tables_spain[12]

Unnamed: 0,Topic,Data for Spain,Details for Spain
0,Fertility rates,1.36,\nTotal\nChildren/woman\n2020\nSpain\n
1,National population distribution,48.7,\nUrban regions\nPercentage\n2014\nSpain\n
2,Permanent immigrant inflows,349 794.0,\nTotal\nNumber\n2019\nSpain\n
3,Poverty rate,0.15,\nTotal\nRatio\n2019\nSpain\n
4,Social spending,28.1,\nPublic\n% of GDP\n2022\nSpain\n


In [127]:
def table_unificator(table1, table2):
    """Return table2 extended with every column of table1 except the first.

    The remaining columns of table1 are attached with ``DataFrame.join``;
    the result is a new DataFrame.
    """
    leading_column = table1.columns[[0]]
    remainder = table1.drop(columns=leading_column)
    return table2.join(remainder)

In [464]:
def pipeline(headlines_spain, headlines_usa, tables_spain, tables_usa):
    """Combine the per-headline tables of both countries into one DataFrame.

    For every entry of ``headlines_spain`` the Spain and USA tables stored
    under the matching 1-based key are merged with ``table_unificator`` and
    given a leading "Table" column that holds the headline in its first row
    and the string 'Nan' in the rows below. The merged tables are then
    stacked in headline order, each keeping its own row labels.

    Parameters
    ----------
    headlines_spain
        Sequence of headlines; entry ``i`` titles the tables under key ``i + 1``.
    headlines_usa
        Not used; kept so existing calls keep working.
    tables_spain, tables_usa
        Mappings from 1-based table position to DataFrame.

    Returns
    -------
    pandas.DataFrame
        The stacked tables, or an empty DataFrame when there are no headlines.
    """
    merged_tables = []

    for position, headline in enumerate(headlines_spain, start=1):
        # one common table for this headline
        table_z = table_unificator(tables_spain[position], tables_usa[position])

        # Title column: headline on the first row, 'Nan' filler below. The
        # slice keeps the column empty when the table itself has no rows, so
        # the insert does not fail on a length mismatch.
        n_rows = table_z.shape[0]
        title_column = ([headline] + ['Nan'] * (n_rows - 1))[:n_rows]
        table_z.insert(loc=0, column="Table", value=title_column)

        merged_tables.append(table_z)

    # pd.concat rejects an empty list, so that case is answered directly.
    if not merged_tables:
        return pd.DataFrame()

    # A single concat at the end avoids re-copying the growing frame per table.
    return pd.concat(merged_tables)

In [465]:
# Build the combined table; the bare `df` on the last line displays it in the notebook
df = pipeline(headlines_spain, headlines_usa, tables_spain, tables_usa)
df

Unnamed: 0,Table,Topic,Data for USA,Details for USA,Data for Spain,Details for Spain
0,Agriculture,Agricultural land,265 266.27,\nPermanent pasture\nThousand hectares\n2019\n...,7 495.98,\nPermanent pasture\nThousand hectares\n2021\n...
1,Nan,Agricultural support,10.5,\nProducer support (PSE)\n% of gross farm rece...,0,0
2,Nan,Crop production,3.61,\nWheat\nTonnes/hectare\n2030\nUnited States\n,0,0
3,Nan,Fish landings,4 161 177.0,\nNational landings in domestic ports\nTonnes\...,312 781.2,\nNational landings in domestic ports\nTonnes\...
4,Nan,Nutrient balance,30.5,\nNitrogen\nKilograms/hectare\n2019\nUnited St...,49.3,\nNitrogen\nKilograms/hectare\n2017\nSpain\n
0,Development,Distribution of net ODA,11 808.8,\nLeast developed countries\nMillion US dollar...,157.4,\nLeast developed countries\nMillion US dollar...
1,Nan,Grants by private agencies and NGOs,4 953.0,\nTotal\nMillion US dollars\n2021\nUnited Stat...,1.9,\nTotal\nMillion US dollars\n2021\nSpain\n
2,Nan,Net ODA,0.2,\nODA grant equivalent\n% of gross national in...,0.3,\nODA grant equivalent\n% of gross national in...
3,Nan,Private flows,159 212.2,\nTotal\nMillion US dollars\n2021\nUnited Stat...,3 799.9,\nTotal\nMillion US dollars\n2021\nSpain\n
4,Nan,Total official and private flows,229 521.9,\nTotal\nMillion US dollars\n2021\nUnited Stat...,7 022.0,\nTotal\nMillion US dollars\n2021\nSpain\n


In [466]:
# export df to the file "output.csv"; index=False leaves the row labels out of the file
df.to_csv("output.csv", index=False)