In [None]:
import os
import json
import pandas as pd
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from io import StringIO
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET


""" funkcije koja sprema podatke u XML format """
# Control characters that XML 1.0 forbids in character data (everything below
# U+0020 except tab, line feed and carriage return). Mapped to None so that
# str.translate() deletes them.
_XML_ILLEGAL_CONTROL_CHARS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_tag_name(key):
    """Turn an arbitrary dict key into a string usable as an XML element name.

    Characters other than letters, digits, '_', '-' and '.' are replaced by
    '_', and a leading '_' is added when the result would not start with a
    letter or underscore (e.g. "ID rada" -> "ID_rada", "2nd" -> "_2nd").
    """
    tag = "".join(
        ch if (ch.isalnum() or ch in "_-.") else "_" for ch in str(key)
    )
    if not tag or not (tag[0].isalpha() or tag[0] == "_"):
        tag = "_" + tag
    return tag


def _xml_text(value):
    """Return the element text for *value*: "N/A" for None, else str(value)
    with XML-illegal control characters removed."""
    if value is None:
        return "N/A"
    return str(value).translate(_XML_ILLEGAL_CONTROL_CHARS)


def save_as_xml(data, output_path):
    """Write a list of flat dicts to *output_path* as a UTF-8 XML document.

    The output has a <Documents> root with one <Document> child per entry and
    one grandchild element per key/value pair.

    ElementTree does not validate tag names or text, so keys and values are
    sanitized first; otherwise a key containing a space (such as "ID rada")
    or a value containing a control character would produce a file that XML
    parsers reject.

    Args:
        data: Iterable of dicts mapping field names to values.
        output_path: Destination file path.
    """
    root = ET.Element("Documents")

    for entry in data:
        document = ET.SubElement(root, "Document")

        for key, value in entry.items():
            element = ET.SubElement(document, _xml_tag_name(key))
            element.text = _xml_text(value)

    tree = ET.ElementTree(root)
    tree.write(output_path, encoding="utf-8", xml_declaration=True)
    print(f"Data successfully saved to {output_path}")

""" korištenje AI modela za kreiranje sažetka (ne radi jer se krediti za korištenje plaćaju) """
# from openai import OpenAI

# client = OpenAI(
#   api_key="my-api-key"
# )
# def summarize_text(content):
#     response = client.chat.completions.create(
#         model="gpt-4o-mini",
#         store=True,
#         messages=[
#             {"role": "user", "content": f"Summarize the following content:\n\n{content}"}
#         ],
#     )
#     return response.choices[0].message.content.strip()


""" funkcija koja pronalazi glavni naslov rada bazirano na style atributu """
def find_title(soup):
    """Locate the paper's main title using inline style attributes.

    Looks at every absolutely positioned "textbox" <div>, takes the first
    <span> inside it that is set in CharisSIL at 13px, and collects the
    stripped text of those spans.

    Args:
        soup: Parsed HTML document (BeautifulSoup object).

    Returns:
        The text of the SECOND matching span, or "N/A" when fewer than two
        spans match.
    """
    def is_text_box(style):
        return style and "position:absolute;" in style and "border: textbox 1px solid;" in style

    def is_title_font(style):
        return style and "font-family: CharisSIL" in style and "font-size:13px" in style

    candidates = []
    for box in soup.find_all('div', style=is_text_box):
        title_span = box.find('span', style=is_title_font)
        if not title_span:
            continue
        candidates.append(title_span.text.strip())

    # NOTE(review): index 1 is used on purpose by the original code; presumably
    # the first 13px span is not the title (e.g. a header line) - confirm.
    if len(candidates) > 1:
        return candidates[1]
    return "N/A"

""" izvlači pdf sadržaj u obliku HTML-a """
def extract_pdf_as_html(pdf_path):
    """Render the PDF at *pdf_path* to HTML and return it as a string.

    The file is opened in binary mode and passed to pdfminer's
    extract_text_to_fp with default layout parameters and HTML output; the
    result is collected in an in-memory text buffer.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The HTML markup written by pdfminer.
    """
    html_buffer = StringIO()
    layout_params = LAParams()

    with open(pdf_path, 'rb') as pdf_stream:
        # codec=None because the target is a text buffer (StringIO), not bytes.
        extract_text_to_fp(
            pdf_stream,
            html_buffer,
            laparams=layout_params,
            output_type='html',
            codec=None,
        )

    html = html_buffer.getvalue()
    return html

""" sve funckije izvlačenja podataka se dešavaju ovjde"""
def _extract_publication_date(soup):
    """Return the "Available online" date formatted as "DD Month YYYY".

    Returns "Date not found" when no text node contains "Availableonline",
    and "Invalid date format" when the text after that marker cannot be
    parsed as day + full month name + year (e.g. "12March2020").
    """
    date_strings = soup.find_all(
        string=lambda text: text and "Availableonline" in text
    )
    if not date_strings:
        # Return the sentinel directly; passing it through strptime would
        # always turn it into "Invalid date format".
        return "Date not found"

    raw_date = date_strings[0].strip().replace("Availableonline", "")
    try:
        # NOTE: %B (month name) is locale-dependent in strptime/strftime.
        return datetime.strptime(raw_date, "%d%B%Y").strftime("%d %B %Y")
    except ValueError:
        return "Invalid date format"


def _extract_body_text(soup):
    """Return the text of all spans between the "Introduction" span and the
    "References" span, joined by single spaces.

    The boundaries are the first span whose text contains "Introduction" and
    the first span whose text contains "References". Returns "" when either
    boundary is missing or when "References" comes first in the document.
    """
    spans = soup.find_all('span')
    intro_span = next((s for s in spans if "Introduction" in s.text), None)
    references_span = next((s for s in spans if "References" in s.text), None)
    if intro_span is None or references_span is None:
        return ""

    sections = []
    found_intro = False
    for span in spans:
        # Identity comparison: `==` on bs4 tags is a deep structural
        # comparison, which is slow and can match a different but
        # identical-looking tag.
        if span is intro_span:
            found_intro = True
            continue
        if span is references_span:
            break
        if found_intro:
            sections.append(span.text.strip())

    return " ".join(sections).strip()


def parse_html_to_data(html_content):
    """Extract paper metadata and body text from pdfminer-generated HTML.

    All data extraction for one document happens here.

    Args:
        html_content: HTML markup of one PDF (as produced by
            extract_pdf_as_html).

    Returns:
        dict with the keys:
            "ID rada": text between the first and second ")" of the first span
                containing "EnergyPolicy", or "Nepoznato" if unavailable.
            "Datum": publication date as "DD Month YYYY", or
                "Date not found" / "Invalid date format".
            "Naslov": title as returned by find_title.
            "Autori": comma-separated texts of CharisSIL 10px spans, or "N/A".
            "Sadržaj": text between "Introduction" and "References", or
                "Content not found".
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Paper UID: taken from the span that mentions the journal name.
    id_span = soup.find("span", string=lambda text: text and "EnergyPolicy" in text)
    id_value = id_span.text.strip() if id_span else "ID not found"
    id_parts = id_value.split(")")

    # Author list: every non-empty span set in CharisSIL at 10px.
    authors_spans = soup.find_all(
        'span',
        style=lambda s: s and "font-family: CharisSIL" in s and "font-size:10px" in s,
    )
    author_names = [
        name for name in (span.text.strip() for span in authors_spans) if name
    ]

    content = _extract_body_text(soup) or "Content not found"

    # A "Sažetak" (AI-generated summary) field was planned here but is
    # disabled because the summarization helper is commented out.
    return {
        "ID rada": id_parts[1] if len(id_parts) > 1 else "Nepoznato",
        "Datum": _extract_publication_date(soup),
        "Naslov": find_title(soup),
        "Autori": ", ".join(author_names) if author_names else "N/A",
        "Sadržaj": content,
    }


""" Procesiranje PDF dokumenata (input) """
def process_pdfs_in_folder(folder_path):
    """Parse every ``.pdf`` file directly inside *folder_path*.

    Each file is converted to HTML, parsed into a record dict, and tagged
    with its file name under the "File" key. A progress line is printed per
    file.

    Args:
        folder_path: Directory to scan (not recursive; only names ending in
            the lowercase suffix ".pdf" are processed).

    Returns:
        List of record dicts, in the order os.listdir yields the files.
    """
    pdf_names = [name for name in os.listdir(folder_path) if name.endswith('.pdf')]

    records = []
    for name in pdf_names:
        full_path = os.path.join(folder_path, name)
        print(f"Processing: {full_path}")

        record = parse_html_to_data(extract_pdf_as_html(full_path))
        record["File"] = name
        records.append(record)

    return records

# Script section: parse every PDF in the input folder and export the records
# as JSON, CSV, Parquet and XML into the output folder.
pdf_folder = "./20"
data = process_pdfs_in_folder(pdf_folder)

# Create the output folder if it does not exist yet.
output_folder = "./output_data"
os.makedirs(output_folder, exist_ok=True)

# Save as JSON (ensure_ascii=False writes non-ASCII characters unescaped)
json_path = os.path.join(output_folder, "data.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# Save as CSV
# NOTE: the CSV is written BEFORE 'ID rada' is coerced to a number below, so
# it keeps the original string values of that column.
df = pd.DataFrame(data)
csv_path = os.path.join(output_folder, "data.csv")
df.to_csv(csv_path, index=False, encoding="utf-8")

# Convert 'ID rada' to numeric; values that cannot be parsed become NaN.
df['ID rada'] = pd.to_numeric(df['ID rada'], errors='coerce')

# NaN value -> 0
if df['ID rada'].isna().any():
    print("Warning: Found non-numeric values in 'ID rada'. Filling with 0.")
    df['ID rada'] = df['ID rada'].fillna(0)

# Convert 'ID rada' to int
df['ID rada'] = df['ID rada'].astype(int)

# Parquet
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it does not exist
parquet_path = os.path.join(output_folder, "data.parquet")

# Best-effort export: any failure while writing Parquet is reported and the
# script continues with the XML export.
try:
    df.to_parquet(parquet_path, index=False, engine="pyarrow")
    print(f"Data successfully saved to {parquet_path}")
except Exception as e:
    print(f"Error saving to Parquet: {e}")

# XML (written from the original list of dicts, not from the DataFrame)
xml_path = os.path.join(output_folder, "data.xml")
save_as_xml(data, xml_path)
print("Extraction and saving complete!")



Processing: ./20/The-political-economy-of-coal-in-Poland--Drivers-and-barriers-_2020_Energy-P.pdf
Processing: ./20/The-impact-of-Nord-Stream-2-on-the-European-gas-market-barga_2020_Energy-Pol.pdf
Processing: ./20/Next-generation-energy-performance-certificates--End-user-nee_2022_Energy-Po.pdf
Processing: ./20/Modelling-thermal-insulation-investment-choice-in-the-EU-via-a-_2022_Energy-.pdf
Processing: ./20/Demand-charge-savings-from-solar-PV-and-energy-storage_2020_Energy-Policy.pdf
Processing: ./20/The-impact-of-electric-vehicles-on-the-future-European-electri_2022_Energy-P.pdf
Processing: ./20/Preferences-for-configurations-of-Positive-Energy-Districts---In_2022_Energy.pdf
Processing: ./20/Consumers--willingness-to-pay-for-second-generation-ethanol_2022_Energy-Poli.pdf
Processing: ./20/A-bigger-bang-for-the-buck--The-impact-of-risk-reduction-on-ren_2023_Energy-.pdf
Processing: ./20/-Someone-will-take-care-of-it---Households--understanding-of-thei_2020_Energ.pdf
Processing: ./20/The-ge