In [None]:
import os
import json
import pandas as pd
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from io import StringIO
from bs4 import BeautifulSoup
from datetime import datetime
# from openai import OpenAI

# client = OpenAI(
#   api_key="my-api-key"
# )
# def summarize_text(content):
#     response = client.chat.completions.create(
#     model="gpt-4o-mini",
#     store=True,
#     messages=[
#         {"role": "user", "content": f"Summarize the following content:\n\n{content}"}
#     ]
#     )
#     return response.choices[0].message.content.strip()

# Render a single PDF file into an HTML string using pdfminer
def extract_pdf_as_html(pdf_path):
    """Return the HTML markup pdfminer generates for the PDF at ``pdf_path``.

    The file is opened in binary mode and converted with default layout
    analysis parameters; the HTML is collected in an in-memory text buffer.

    Args:
        pdf_path: Filesystem path of the PDF to convert.

    Returns:
        The complete HTML output as a single string.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    with StringIO() as html_buffer, open(pdf_path, "rb") as pdf_stream:
        layout_options = LAParams()
        # codec=None is passed because the sink is a text buffer
        # (presumably this stops pdfminer from byte-encoding the output;
        # confirm against the pdfminer documentation).
        extract_text_to_fp(
            pdf_stream,
            html_buffer,
            laparams=layout_options,
            output_type="html",
            codec=None,
        )
        # Read the buffer before the ``with`` block closes it.
        rendered_html = html_buffer.getvalue()
    return rendered_html

# Function to parse HTML and extract structured data
def parse_html_to_data(html_content):
    """Extract article metadata and body text from pdfminer-generated HTML.

    The lookups rely on literal markers in the HTML: a span whose string
    contains "EnergyPolicy" (identifier), a text node containing
    "Availableonline" (publication date), CharisSIL-styled spans at 13px
    (title) and 10px (authors), and the spans lying between the first span
    containing "Introduction" and the first span containing "References"
    (body text).

    Args:
        html_content: HTML markup as a string.

    Returns:
        A dict with the keys "ID rada", "Datum", "Naslov", "Autori" and
        "Sadržaj". Fields that cannot be located hold a placeholder string
        ("ID not found", "Date not found", "Invalid date format", "N/A",
        "Content not found") instead of raising.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # --- Identifier -------------------------------------------------------
    id_span = soup.find("span", string=lambda text: text and "EnergyPolicy" in text)
    id_value = id_span.text.strip() if id_span is not None else "ID not found"
    # Keep only the part after the first ")" when there is one. Without this
    # guard, a missing span (or an identifier with no ")") raised IndexError
    # and aborted the whole run.
    id_parts = id_value.split(")")
    article_id = id_parts[1] if len(id_parts) > 1 else id_value

    # --- Publication date -------------------------------------------------
    date_strings = soup.find_all(string=lambda text: "Availableonline" in text)
    if date_strings:
        raw_date = date_strings[0].strip().replace("Availableonline", "")
        try:
            parsed_date = datetime.strptime(raw_date, "%d%B%Y")  # Parse "20December2022"
            formatted_date = parsed_date.strftime("%d %B %Y")    # Format to "20 December 2022"
        except ValueError:
            formatted_date = "Invalid date format"
    else:
        # Report the missing marker as such instead of feeding a placeholder
        # to strptime and mislabelling it as an invalid format.
        formatted_date = "Date not found"

    # --- Title ------------------------------------------------------------
    title_span = soup.find(
        'span',
        style=lambda s: s and "font-family: CharisSIL" in s and "font-size:13px" in s,
    )
    title = title_span.text.strip() if title_span is not None else "N/A"

    # --- Authors ----------------------------------------------------------
    authors_spans = soup.find_all(
        'span',
        style=lambda s: s and "font-family: CharisSIL" in s and "font-size:10px" in s,
    )
    author_names = [span.text.strip() for span in authors_spans if span.text.strip()]
    authors = ", ".join(author_names) if author_names else "N/A"

    # --- Content (from "Introduction" to "References") --------------------
    # Collect the spans once, in document order, and reuse the list for both
    # the marker lookups and the extraction pass.
    spans = soup.find_all('span')
    intro_span = next((span for span in spans if "Introduction" in span.text), None)
    references_span = next((span for span in spans if "References" in span.text), None)

    content = ""
    if intro_span is not None and references_span is not None:
        found_intro = False
        content_sections = []
        for span in spans:
            # Identity checks: bs4 Tag equality is a recursive structural
            # comparison, which is slow and would also match a different
            # span that merely has the same markup.
            if span is intro_span:
                found_intro = True
                continue
            if span is references_span:
                break
            if found_intro:
                content_sections.append(span.text.strip())

        content = " ".join(content_sections).strip()

    if not content:
        content = "Content not found"
    # Optional hook (currently disabled): a summarize_text(content) call could
    # populate a "Sažetak" field here.

    # Final data structure
    data = {
        "ID rada": article_id,
        "Datum": formatted_date,
        "Naslov": title,
        "Autori": authors,
        "Sadržaj": content,
    }
    return data


# Main function to process all PDFs in a folder
def process_pdfs_in_folder(folder_path):
    """Convert and parse every PDF found directly inside ``folder_path``.

    Each PDF is rendered to HTML with ``extract_pdf_as_html`` and parsed with
    ``parse_html_to_data``; the source filename is added to the parsed record
    under the "File" key. Subfolders are not descended into.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        A list with one parsed record (dict) per PDF, ordered by filename.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    all_data = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of the output records reproducible between runs and machines.
    for pdf_file in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, pdf_file)
        print(f"Processing: {pdf_path}")

        # Extract HTML content from the PDF
        html_content = extract_pdf_as_html(pdf_path)
        # Parse HTML content to structured data
        parsed_data = parse_html_to_data(html_content)
        parsed_data["File"] = pdf_file  # Add filename for reference
        all_data.append(parsed_data)

    return all_data

# Folder containing PDFs (relative to the current working directory).
# Every PDF directly inside it is converted and parsed into one record.
pdf_folder = "./20"  # Update this with the name of your folder
data = process_pdfs_in_folder(pdf_folder)

# Save extracted data to JSON and CSV.
# The output folder is created if missing; exist_ok=True makes re-runs safe.
output_folder = "./output_data"
os.makedirs(output_folder, exist_ok=True)

# Save as JSON. ensure_ascii=False writes non-ASCII characters as-is
# (the file is opened as UTF-8) instead of \uXXXX escapes.
json_path = os.path.join(output_folder, "data.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# Save as CSV: one row per parsed record, without the DataFrame index column.
df = pd.DataFrame(data)
csv_path = os.path.join(output_folder, "data.csv")
df.to_csv(csv_path, index=False, encoding="utf-8")

print("Extraction and saving complete!")



Processing: ./test/Wind-farm-energy-surplus-storage-solution-with-second-life-veh_2023_Energy-P.pdf
Extraction and saving complete!
