In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
ndcs = pd.read_csv("../Data/20240118_countries.csv")
ndcs.head(5)

Unnamed: 0,ISO,Country,NDC,Date,Climate.Watch.HTML.File
0,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
1,AGO,Angola,1,11/16/20,AGO-first_ndc-EN.html
2,AGO,Angola,2,5/31/21,AGO-revised_first_ndc-EN.html
3,ALB,Albania,1,9/21/16,ALB-first_ndc-EN.html
4,ALB,Albania,2,10/12/21,ALB-revised_first_ndc-EN.html


In [3]:
def extract_ndc(index):
    path_name = "../Data/20240117_ClimateWatch_AllData/NDC_text_HTML/ndc-master" + "/"

In [4]:
pd.read_csv("../Output/20240118_ndc_lines.csv").head(5)

Unnamed: 0,iso,country,ndc,date,html,elem,h1,h2,h3,h4,p,ol,ul,tab,hdr1,hdr2,hdr3,hdr4,line
0,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,10,2,1,0,0,1,0,0,0,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,21 September 2015
1,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,11,2,1,0,0,2,0,0,0,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,The Islamic Republic of Afghanistan hereby com...
2,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,13,2,1,0,0,0,0,0,1,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,Executive Summary Base Year: 2005 Target Years...
3,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,42,2,1,0,0,0,0,1,0,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,Adaptation: USD 10.785 billion\nMitigation: US...
4,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,45,2,1,0,0,3,0,0,0,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,\nFigure 1. Greenhouse Gas Emissions for Afgha...


In [5]:
def extract_ndc(file_path):
    path_name = "../Data/20240117_ClimateWatch_AllData/NDC_text_HTML/ndc-master/" + str(file_path)
    with open(path_name, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        
    # Extracting elements
    elements = soup.find_all()
    h1 = soup.find_all('h1')
    h2 = soup.find_all('h2')
    h3 = soup.find_all('h3')
    h4 = soup.find_all('h4')
    p = soup.find_all('p')
    ul = soup.find_all('ul')
    ol = soup.find_all('ol')
    tab = soup.find_all('table')
    
    # Creating a DataFrame to store the information
    struct = pd.DataFrame({
        'elem': range(len(elements)),
        'h1': [0]*len(elements),
        'h2': [0]*len(elements),
        'h3': [0]*len(elements),
        'h4': [0]*len(elements),
        'p': [0]*len(elements),
        'ol': [0]*len(elements),
        'ul': [0]*len(elements),
        'tab': [0]*len(elements)
    })
    
    # Marking the presence of tags
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in h1]), 'h1'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in h2]), 'h2'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in h3]), 'h3'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in h4]), 'h4'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in p]), 'p'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in ul]), 'ul'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in ol]), 'ol'] = 1
    struct.loc[struct['elem'].isin([i for i, e in enumerate(elements) if e in tab]), 'tab'] = 1
    
    # Cumulative sums for tags
    struct['h1'] = struct['h1'].cumsum()
    struct['h2'] = struct['h2'].cumsum()
    struct['h3'] = struct['h3'].cumsum()
    struct['h4'] = struct['h4'].cumsum()
    struct['p'] = struct['p'].cumsum() * struct['p']
    struct['ol'] = struct['ol'].cumsum() * struct['ol']
    struct['ul'] = struct['ul'].cumsum() * struct['ul']
    struct['tab'] = struct['tab'].cumsum() * struct['tab']
    
    # Filtering rows with at least one of the specified tags
    struct2 = struct[struct[['p', 'ol', 'ul', 'tab']].sum(axis=1) > 0]

    element_indexed = [(i, e) for i, e in enumerate(elements)]
    # tag_dict = {}
    # for i, e in element_indexed:
    #     soup = BeautifulSoup(str(e), 'html.parser')
    #     tags = soup.find_all()
    #     if len(tags) > 0:
    #         parent_tag = tags[0].name
    #         if tag_dict.get(parent_tag):
    #             tag_dict[parent_tag].add(i)
    #         else:
    #             tag_dict[parent_tag] = set([i])
    
    valid_indice = set(struct2.index)
    def clean_text(string):
        return re.sub(r'\n+', " ", string)
    struct2['line'] = [clean_text(e[1].get_text()) for e in element_indexed if e[0] in valid_indice]
    # Adding header texts

    struct2['hdr1'] = [BeautifulSoup(str(h1[i-1])).get_text() if i>0 else "" for i in struct2['h1']]
    struct2['hdr2'] = [BeautifulSoup(str(h2[i-1])).get_text() if i>0 else "" for i in struct2['h2']]
    struct2['hdr3'] = [BeautifulSoup(str(h3[i-1])).get_text() if i>0 else "" for i in struct2['h3']]
    struct2['hdr4'] = [BeautifulSoup(str(h4[i-1])).get_text() if i>0 else "" for i in struct2['h4']]
    # Adding additional information from the ndcs DataFrame
    struct2['iso'] = ndcs['ISO'].iloc[0]
    struct2['country'] = ndcs['Country'].iloc[0]
    struct2['ndc'] = ndcs['NDC'].iloc[0]
    struct2['date'] = ndcs['Date'].iloc[0]
    struct2['html'] = file_path
    
    return struct2

In [7]:
dfs = []
for file_path in ndcs["Climate.Watch.HTML.File"]:
    df = extract_ndc(file_path)
    dfs.append(df)

result_df = pd.concat(dfs, ignore_index=True)
result_df

Unnamed: 0,elem,h1,h2,h3,h4,p,ol,ul,tab,line,hdr1,hdr2,hdr3,hdr4,iso,country,ndc,date,html
0,10,2,1,0,0,1,0,0,0,21 September 2015,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
1,11,2,1,0,0,2,0,0,0,The Islamic Republic of Afghanistan hereby com...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
2,13,2,1,0,0,0,0,0,1,Executive Summary Base Year: 2005 Target Year...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
3,42,2,1,0,0,0,0,1,0,Adaptation: USD 10.785 billion Mitigation: US...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
4,45,2,1,0,0,3,0,0,0,Figure 1. Greenhouse Gas Emissions for Afghan...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,2920,8,30,19,17,954,0,0,0,Central Intelligence Agency – Country Profile ...,9. REFERENCES,Websites,6.5.1. Carbon Markets,International Climate Finance,AFG,Afghanistan,1,11/23/16,AGO-revised_first_ndc-EN.html
1157,2924,8,30,19,17,955,0,0,0,FAO Fishery Country Profile. Link:,9. REFERENCES,Websites,6.5.1. Carbon Markets,International Climate Finance,AFG,Afghanistan,1,11/23/16,AGO-revised_first_ndc-EN.html
1158,2926,8,30,19,17,956,0,0,0,http://omap.africanmarineatlas.org/BIOSPHERE/d...,9. REFERENCES,Websites,6.5.1. Carbon Markets,International Climate Finance,AFG,Afghanistan,1,11/23/16,AGO-revised_first_ndc-EN.html
1159,2929,8,30,19,17,0,21,0,0,GEF. Link: http://www.thegef.org,9. REFERENCES,Websites,6.5.1. Carbon Markets,International Climate Finance,AFG,Afghanistan,1,11/23/16,AGO-revised_first_ndc-EN.html
