In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Directory and file setup
directory_path = '/Users/twc/Research/20221022_Updated_NDCs/Data/20240117_ClimateWatch_AllData/NDC_text_HTML/ndc-master'
file_path = '/Users/twc/Research/20221022_Updated_NDCs/Data/20240118_countries.csv'
data = pd.read_csv(file_path)

# Parsed documents, one entry per row of `data` in row order.
# An entry is None when the row has no usable HTML file, so positions in this
# list stay aligned with the rows of `data`.
html_contents = []

# Load and parse HTML files
for filename in data['Climate.Watch.HTML.File']:
    soup = None
    # pandas reads a blank CSV cell as NaN (a float); os.path.join would raise
    # TypeError on it, so treat it like any other missing file.
    if not pd.isna(filename):
        full_path = os.path.join(directory_path, filename)
        # isfile (rather than exists) also rejects directories, which open()
        # cannot read.
        if os.path.isfile(full_path):
            with open(full_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'lxml')
    html_contents.append(soup)

# Function to find the closest preceding h1, h2, h3, h4 for any tag and replace line breaks
def find_preceding_tags(tag):
    """Return the text of the nearest preceding heading of each level.

    Walks backwards through the document from ``tag`` and records the first
    ``h1``, ``h2``, ``h3`` and ``h4`` it meets. Each level is looked up
    independently of the others.

    Args:
        tag: An element exposing ``previous_elements`` (e.g. a BeautifulSoup
            tag), yielding earlier nodes from nearest to farthest.

    Returns:
        A dict mapping heading name (``'h1'`` .. ``'h4'``) to that heading's
        text, stripped and with line breaks replaced by spaces. Levels with
        no preceding heading are absent from the dict.
    """
    heading_names = ('h1', 'h2', 'h3', 'h4')
    found_tags = {}
    # previous_elements is consumed lazily, so the walk really stops once all
    # four levels are found instead of first collecting every earlier tag.
    for element in tag.previous_elements:
        # Non-tag nodes (text, comments) may be yielded too; they carry no
        # heading name and are skipped by the membership test below.
        name = getattr(element, 'name', None)
        if name in heading_names and name not in found_tags:
            text = element.get_text(strip=True).replace('\n', ' ').replace('\r', ' ')
            found_tags[name] = text
            if len(found_tags) == len(heading_names):
                break
    return found_tags  # Returns a dictionary of the found headings

def extract_text(tag):
    """Flatten the text of ``tag`` into a single line.

    Tables are rendered as the space-joined text of their ``td`` cells and
    ordered/unordered lists as the space-joined text of their ``li`` items.
    Any other tag (e.g. ``p``) is rendered from its own text. In every case
    the text is stripped and line breaks are replaced by spaces.
    """
    def _flatten(node):
        stripped = node.get_text(strip=True)
        return stripped.replace('\n', ' ').replace('\r', ' ')

    # Container tags are rendered from their children rather than directly.
    child_selector = {'table': 'td', 'ol': 'li', 'ul': 'li'}.get(tag.name)
    if child_selector is None:
        return _flatten(tag)
    return ' '.join(_flatten(child) for child in tag.find_all(child_selector))

# Extracting data
# Build one record per <p>/<ol>/<ul>/<table> element of every parsed document,
# tagged with the metadata of the matching row in `data` and with the nearest
# preceding headings.
all_data = []
for i, soup in enumerate(html_contents):
    if soup is None:
        # No HTML was loaded for this row.
        continue

    # Row-level metadata is the same for every element of the document.
    document_fields = {
        'iso': data.loc[i, 'ISO'],
        'country': data.loc[i, 'Country'],
        'ndc': data.loc[i, 'NDC'],
        'date': data.loc[i, 'Date'],
        'html': data.loc[i, 'Climate.Watch.HTML.File'],
    }

    for tag in soup.find_all(['p', 'ol', 'ul', 'table']):
        tag_text = extract_text(tag)
        headings = find_preceding_tags(tag)
        record = dict(document_fields)
        record['text_type'] = tag.name
        record['text_content'] = tag_text
        for level in ('h1', 'h2', 'h3', 'h4'):
            # Absent heading levels are stored as None.
            record[level + '_text'] = headings.get(level)
        all_data.append(record)

# Create DataFrame
df_elements = pd.DataFrame(all_data)
print(df_elements)

elements_file_path = '/Users/twc/Research/20221022_Updated_NDCs/Output/20240416_02_NDC_elements.csv'
df_elements.to_csv(elements_file_path, index=False)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Assuming df_elements is already loaded with the necessary data

# Downloading the required resources for nltk.
# sent_tokenize needs the Punkt models: older NLTK releases load the 'punkt'
# package, while recent releases look for 'punkt_tab' instead. Both are
# requested so the tokenizer works on either.
# NOTE(review): on an NLTK version that does not know 'punkt_tab' the download
# is expected to just report an error and continue -- confirm on the target
# environment.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to split text into sentences
def split_into_sentences(text):
    """Split ``text`` into a list of sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The text to split. Non-string values (e.g. ``None``, or the NaN
            pandas uses for an empty cell when the data is re-read from CSV)
            are treated as having no text.

    Returns:
        A list of sentence strings; empty when there is nothing to split.
    """
    # sent_tokenize only accepts strings; a missing value yields no sentences,
    # the same outcome as an empty string.
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)

# Expanding the DataFrame to include a row per sentence
if df_elements.empty:
    # Nothing to tokenize; keep whatever columns exist so the CSV still gets
    # its header row.
    df_sentences = df_elements.copy()
else:
    # One list of sentences per element row.
    sentence_lists = df_elements['text_content'].map(split_into_sentences)
    # explode() repeats every other column once per sentence and keeps the
    # original index label on each resulting row. An element with no
    # sentences becomes a single NaN row, which dropna() removes so that such
    # elements contribute no rows.
    df_sentences = (
        df_elements.assign(text_content=sentence_lists)
        .explode('text_content')
        .dropna(subset=['text_content'])
    )

print(df_sentences)

sentences_file_path = '/Users/twc/Research/20221022_Updated_NDCs/Output/20240416_02_NDC_sentences.csv'
df_sentences.to_csv(sentences_file_path, index=False)