# In [None]:
import pandas as pd
import csv
import json
import re
from pypdf import PdfReader
import ollama
import time

# Input PDF whose page text is extracted and sent to the model.
PDF_PATH = "/Users/kemalgunay/first_two_pages.pdf"
# Output CSV; opened in write mode, so an existing file is overwritten.
CSV_PATH = "/Users/kemalgunay/Desktop/extracted_company_data.csv"
# Model name passed to ollama.chat().
MODEL_NAME = "llama3.1"

# ----------------------------------------------------------
# Extract text from the PDF
# ----------------------------------------------------------
def extract_raw_text(pdf_path):
    """Return the text of every page in the PDF at *pdf_path* as one string.

    Pages are joined with a single space. A page for which
    ``extract_text()`` yields a falsy value (``None`` or ``""``)
    contributes an empty string.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return " ".join(page_texts)

# ----------------------------------------------------------
# Clean the Llama response and convert it to JSON
# ----------------------------------------------------------
def _loads_or_none(candidate):
    """Return the decoded JSON value of *candidate*, or None if it is not valid JSON."""
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input.
        return None


def robust_json_parse(text):
    """Parse a model reply into a list of JSON records.

    Markdown code fences are stripped first. The result is always a list:

    * a reply that is a JSON array is returned as-is;
    * a JSON array embedded in surrounding prose (or wrapped in an object
      such as ``{"companies": [...]}``) is extracted and returned;
    * a reply that is a single JSON object is wrapped in a one-element list;
    * anything else (invalid JSON, scalars, non-string input) yields ``[]``.

    Args:
        text: Raw reply text from the model.

    Returns:
        A list of decoded JSON values, or ``[]`` when nothing usable is found.
    """
    if not isinstance(text, str):
        return []

    # Remove ```json / ``` fences the model may wrap around its answer.
    cleaned = re.sub(r"```json|```", "", text).strip()

    whole = _loads_or_none(cleaned)
    if isinstance(whole, list):
        return whole

    # Greedy match: from the first "[" to the last "]" in the reply.
    span = re.search(r"\[.*\]", cleaned, re.DOTALL)
    embedded = _loads_or_none(span.group(0)) if span else None
    if isinstance(embedded, list):
        looks_like_records = bool(embedded) and all(
            isinstance(item, dict) for item in embedded
        )
        # When the whole reply is itself an object, only prefer the embedded
        # list if it holds objects (the {"companies": [...]} shape).
        # Otherwise the span is just an inner field such as a phone list.
        # NOTE: a single object whose only list is a list of objects is
        # ambiguous and is still treated as the wrapped shape.
        if looks_like_records or not isinstance(whole, dict):
            return embedded

    if isinstance(whole, dict):
        return [whole]
    return []

# ----------------------------------------------------------
# Extract data with Llama (robust)
# ----------------------------------------------------------
def extract_with_llama(text_chunk, retries=2):
    """Ask the Ollama model to extract company records from *text_chunk*.

    A (Turkish) prompt instructs the model to return only a JSON list with
    the fields CompanyName, Address, PhoneNumbers, Email, Website,
    CoreActivity and Branches. The reply is parsed with
    ``robust_json_parse``; the first non-empty result is returned.

    Args:
        text_chunk: Slice of the PDF text to embed in the prompt.
        retries: Maximum number of attempts. With ``0`` no request is made.

    Returns:
        The parsed records, or ``[]`` if every attempt failed or produced
        an empty result.
    """
    PROMPT = f"""
    Bu metin bir iş rehberi sayfasıdır. 
    Her satır bir şirketi temsil eder. 
    Lütfen her şirketi JSON formatında çıkar. 
    Çıktı sadece JSON listesi olmalı (örnek: [{{...}}, {{...}}])

    Her şirket için şu alanları çıkar:
    - CompanyName
    - Address
    - PhoneNumbers (liste halinde)
    - Email
    - Website
    - CoreActivity
    - Branches (liste, her biri: {{"BranchLocation": "", "BranchPhoneNumber": ""}})

    Metin:
    ---
    {text_chunk}
    ---
    """

    for attempt in range(retries):
        try:
            response = ollama.chat(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": PROMPT}]
            )
            data = robust_json_parse(response['message']['content'])
        except Exception as e:
            # Deliberately broad: any client/transport failure is treated as
            # a failed attempt so one bad chunk cannot abort the whole run.
            print(f"LLM hatası (deneme {attempt+1}): {e}")
        else:
            if data:
                return data
            print(f"⚠️ Boş veya hatalı JSON — deneme {attempt+1}/{retries}")

        # Back off only when another attempt follows; sleeping after the
        # final failure would just delay the caller.
        if attempt + 1 < retries:
            time.sleep(1)
    return []

# ----------------------------------------------------------
# Main flow
# ----------------------------------------------------------
def _format_phone_numbers(value):
    """Render the model's ``PhoneNumbers`` value as one comma-separated string."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        # str() each item: the model may emit numbers instead of strings.
        return ", ".join(str(item) for item in value if item is not None)
    # A bare string (or number) is a single entry; str.join on a string
    # would split it into individual characters.
    return str(value)


def _company_to_row(company, max_branches):
    """Flatten one company dict into a CSV row keyed by output column name."""
    row = {
        "CompanyName": company.get("CompanyName", ""),
        "Address": company.get("Address", ""),
        "PhoneNumbers": _format_phone_numbers(company.get("PhoneNumbers")),
        "Email": company.get("Email", ""),
        "Website": company.get("Website", ""),
        "CoreActivity": company.get("CoreActivity", ""),
    }
    branches = company.get("Branches")
    if not isinstance(branches, list):
        # Covers a missing key, JSON null, or any other unexpected shape.
        branches = []
    valid_branches = [b for b in branches if isinstance(b, dict)]
    # Branches beyond max_branches have no column and are dropped.
    for position, branch in enumerate(valid_branches[:max_branches], start=1):
        row[f"BranchLocation_{position}"] = branch.get("BranchLocation", "")
        row[f"BranchPhoneNumber_{position}"] = branch.get("BranchPhoneNumber", "")
    return row


def main_scraper():
    """Run the pipeline: PDF text -> model extraction per chunk -> CSV file.

    Reads ``PDF_PATH``, slices its text into fixed-size character chunks,
    sends each chunk to ``extract_with_llama`` and writes one CSV row per
    returned company to ``CSV_PATH`` (overwriting any existing file).
    Chunks that yield no data and records that are not JSON objects are
    reported and skipped. Returns ``None``.
    """
    full_raw_text = extract_raw_text(PDF_PATH)
    if not full_raw_text.strip():
        print("❌ PDF boş veya okunamadı.")
        return

    chunk_size = 1500
    max_branches = 20  # number of BranchLocation_/BranchPhoneNumber_ column pairs

    # NOTE(review): slicing by character offset can cut a company entry in
    # two at a chunk boundary; both halves are then sent separately.
    chunks = [full_raw_text[i:i+chunk_size] for i in range(0, len(full_raw_text), chunk_size)]
    print(f"{len(chunks)} parça işlenecek...")

    fieldnames = [
        "CompanyName", "Address", "PhoneNumbers", "Email", "Website", "CoreActivity"
    ]
    for i in range(1, max_branches + 1):
        fieldnames += [f"BranchLocation_{i}", f"BranchPhoneNumber_{i}"]

    total = 0

    with open(CSV_PATH, "w", newline="", encoding="utf-8") as f:
        # Columns absent from a row are written as empty strings
        # (DictWriter's default restval).
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for i, chunk in enumerate(chunks):
            print(f"🧩 Parça {i+1}/{len(chunks)} işleniyor...")
            companies = extract_with_llama(chunk)
            if isinstance(companies, dict):
                # A single object instead of a list: treat it as one record.
                companies = [companies]
            if not companies or not isinstance(companies, list):
                print(f"🚫 Parça {i+1}: veri alınamadı.")
                continue

            for c in companies:
                if not isinstance(c, dict):
                    # The model sometimes returns strings/numbers in the list.
                    print(f"⚠️ Parça {i+1}: geçersiz kayıt atlandı.")
                    continue
                writer.writerow(_company_to_row(c, max_branches))
                total += 1

    print(f"\n✅ Tamamlandı: {total} şirket kaydedildi → {CSV_PATH}")

# ----------------------------------------------------------
# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main_scraper()
