# In [None]:
import os
import re
import json
import pdfplumber
import openai
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document
import time

# --- API Key Setup (unchanged) ---
# Prompt for the key unless a usable one is already exported. A blank or
# whitespace-only value is treated as missing: it can never authenticate, so
# accepting it would only make every later API request fail.
if not os.environ.get("DEEPSEEK_API_KEY", "").strip():
    os.environ["DEEPSEEK_API_KEY"] = getpass.getpass("Enter your Deepseek API Key: ")

# --- PDF Extraction (unchanged) ---
def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Each page contributes its extracted text (or an empty string when
    ``extract_text()`` returns a falsy value) followed by a newline; the
    combined string has leading and trailing whitespace removed.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The combined text, or ``None`` when opening or reading the file
        raises any exception (a warning is printed in that case).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    except Exception as err:
        print(f"⚠️ Unable to open or process PDF file {pdf_path}: {err}")
        return None
    return "".join(page_texts).strip()

# --- Text Splitting (Updated Defaults) ---
def split_text(text, chunk_size=25000, overlap=500):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split. Any falsy value (``None``, ``""``) yields
            an empty list without constructing a splitter.
        chunk_size: Passed to the splitter as ``chunk_size``; measured with
            ``len``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by the splitter's ``split_text`` method, or ``[]``
        for falsy input.
    """
    if text:
        # Literal (non-regex) separators, listed from paragraph breaks down
        # to the empty string. The punctuation entries include the trailing
        # space.
        boundaries = ["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        chunker = RecursiveCharacterTextSplitter(
            separators=boundaries,
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return chunker.split_text(text)
    return []

# --- Deepseek Processing (Updated max_tokens) ---
def _parse_json_response(response_text):
    """Parse the model reply as JSON, tolerating text around the object.

    Tries ``json.loads`` on the whole reply first. If that fails, falls back
    to the widest ``{...}`` span found in the reply (this drops markdown
    fences or other leading/trailing text) and parses that instead.

    Returns:
        The decoded JSON value, or ``None`` when no parse succeeds.
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"⚠️ Initial JSON parsing error: {e}")
        print("   Attempting cleanup and re-parse...")

    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not json_match:
        print(f"   Could not find valid JSON structure in response: {response_text[:500]}...")
        return None
    try:
        result = json.loads(json_match.group(0))
    except json.JSONDecodeError as e2:
        print(f"   Still failed after cleanup: {e2}")
        print(f"   Received content (first 500 chars): {response_text[:500]}...")
        return None
    print("   Successfully parsed after regex cleanup.")
    return result


def _sentences_from_payload(payload):
    """Return the ``valid_sentences`` list from a decoded reply, else ``[]``.

    Anything other than a dict whose ``valid_sentences`` value is a list is
    rejected, so callers can always ``extend`` with the result. (A bare
    string would otherwise be extended character by character.)
    """
    if not isinstance(payload, dict):
        if payload is not None:
            print(f"⚠️ Expected a JSON object but got {type(payload).__name__}, ignoring response.")
        return []
    sentences = payload.get("valid_sentences", [])
    if not isinstance(sentences, list):
        print(f"⚠️ 'valid_sentences' is {type(sentences).__name__}, expected a list, ignoring response.")
        return []
    return sentences


def gpt_process_text(text_chunk, max_retries=2, retry_wait=15):
    """Ask the DeepSeek chat model for the UHPC-relevant sentences in a chunk.

    Args:
        text_chunk: Text sent as the user message. Falsy or whitespace-only
            input returns ``[]`` without contacting the API.
        max_retries: How many times to retry after a rate-limit error before
            giving up on the chunk. Negative values are treated as 0.
        retry_wait: Seconds to sleep before each rate-limit retry.

    Returns:
        The list stored under ``"valid_sentences"`` in the model's JSON
        reply. Returns ``[]`` on any API error, on an unparseable reply, or
        when the reply does not have the expected shape; a diagnostic is
        printed in those cases and no exception is raised.
    """
    # Nothing to extract from a blank chunk, so skip the API call.
    if not text_chunk or (isinstance(text_chunk, str) and not text_chunk.strip()):
        return []

    prompt = f"""
    You are an AI assistant responsible for extracting sentences relevant to Ultra-High Performance Concrete (UHPC) from research papers. Your task is to identify, clean, and extract relevant sentences while preserving their original structure.

    **Sentence Selection and Cleaning Criteria:**
    1️⃣ Extract sentences that explicitly mention 'UHPC' or 'Ultra-High Performance Concrete' (case-insensitive).
    2️⃣ Extract sentences that discuss UHPC-related topics even if they do not contain 'UHPC'.
    3️⃣ Sentences should be relevant, not generic. Ignore vague statements like “Concrete has high strength.”
    4️⃣ Sentences should be at least 8 words long to ensure meaningful content.
    5️⃣ Exclude references, captions, formulas, and table data to prevent noisy or incomplete extractions.
    6️⃣ Do not modify, rewrite, or summarize extracted sentences. Preserve their original wording, structure, and punctuation.
    7️⃣ If a sentence contains references like [1] or (2023), remove only the reference markers, keeping the rest of the sentence intact.
    Return the cleaned and validated sentences as a JSON list. Ensure the JSON format is strictly adhered to.

    **✅ Output Format (Strict JSON Only):**
    ```json
    {{
        "valid_sentences": ["sentence1", "sentence2", "sentence3"]
    }}
    ```
    """
    try:
        client = openai.OpenAI(
            api_key=os.environ["DEEPSEEK_API_KEY"],
            base_url="https://api.deepseek.com/v1"
        )

        # Rate limits are retried a bounded number of times so the chunk is
        # not silently lost after the wait.
        attempts_allowed = max(0, max_retries) + 1
        for attempt in range(1, attempts_allowed + 1):
            try:
                response = client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": text_chunk}
                    ],
                    temperature=0.3,
                    max_tokens=7800,
                    response_format={"type": "json_object"}
                )
                break
            except openai.RateLimitError:
                if attempt == attempts_allowed:
                    print("❌ Deepseek API rate limit persisted after retries, skipping chunk.")
                    return []
                print(f"⏳ Deepseek API rate limit, waiting {retry_wait} seconds before retry...")
                time.sleep(retry_wait)

        choice = response.choices[0]
        # A reply cut off at max_tokens is usually invalid JSON; say so
        # up front so the parse errors that follow are understandable.
        if getattr(choice, "finish_reason", None) == "length":
            print("⚠️ Deepseek output hit max_tokens and was truncated; JSON may be incomplete.")

        gpt_output = (choice.message.content or "").strip()
        if not gpt_output:
            print("⚠️ Deepseek returned an empty response.")
            return []

        return _sentences_from_payload(_parse_json_response(gpt_output))

    except openai.APIConnectionError as e:
        print(f"❌ Deepseek API connection failed: {e}")
        return []
    except openai.APIStatusError as e:
        print(f"❌ Deepseek API status error: {e.status_code}")
        print(f"   Response: {e.response}")
        response_text = str(e.response).lower()
        if "maximum context length" in response_text:
            print("   Hint: Possibly exceeded maximum context length!")
        if "maximum number of tokens" in response_text:
            print("   Hint: Possibly exceeded maximum output tokens! Output may be truncated.")
        return []
    except Exception as e:
        print(f"❌ Deepseek processing failed: {e}")
        return []

# --- Saving to Doc (unchanged) ---
def save_sentences_to_doc(sentences, doc_path):
    """Write ``sentences`` as numbered paragraphs into a new Word document.

    A level-1 heading is added first. Each string item becomes a paragraph
    prefixed with its 1-based position in ``sentences``; non-string items are
    reported and skipped (their position number is not reused).

    Args:
        sentences: Expected to be a list. Any other type is reported and
            nothing is saved.
        doc_path: Destination passed to the document's ``save`` method.

    Returns:
        None. Any exception raised while building or saving the document is
        caught and printed.
    """
    try:
        document = Document()
        document.add_heading("Step1_Text Generation (DeepSeek - Optimized)", level=1)

        if not isinstance(sentences, list):
            print(f"❌ Expected a list of sentences, but got: {type(sentences)}")
            return

        written = 0
        for position, item in enumerate(sentences, start=1):
            if not isinstance(item, str):
                print(f"⚠️ Skipped non-string item at index {position}: {item}")
                continue
            document.add_paragraph(f"{position}. {item}")
            written += 1

        document.save(doc_path)
        print(f"✅ Successfully saved {written} sentences to {doc_path}")
    except Exception as err:
        print(f"❌ Failed to save Word document: {err}")

# --- Main Program Logic (unchanged structure, uses updated defaults) ---
def main(pdf_paths, output_doc_path):
    """Extract relevant sentences from each PDF and save them to one document.

    For every path that exists: extract its text, split it into chunks, send
    each chunk (except those longer than 30000 characters) to
    ``gpt_process_text`` and collect the returned sentences. A 1.5 second
    pause follows every processed chunk. Errors while handling one file are
    printed and that file is abandoned; the remaining files still run.

    Args:
        pdf_paths: Sized sequence of PDF file paths.
        output_doc_path: Destination .docx path. Its parent directory is
            created when missing. Nothing is written when no sentences were
            collected.
    """
    collected = []
    file_count = len(pdf_paths)

    for file_no, pdf_path in enumerate(pdf_paths, start=1):
        file_name = os.path.basename(pdf_path)
        print(f"\n🚀 Processing file {file_no}/{file_count}: {file_name}")

        if not os.path.exists(pdf_path):
            print(f"⚠️ File not found, skipping: {pdf_path}")
            continue

        try:
            text = extract_pdf_text(pdf_path)
            if not text:
                print(f"⚠️ Failed to extract text from {file_name} or file is empty, skipping.")
                continue

            print("📖 Splitting text (chunk_size=25000, overlap=500)...")
            pieces = split_text(text)
            piece_count = len(pieces)
            print(f"🔹 Text split into {piece_count} chunks.")

            if not pieces:
                print(f"⚠️ Text splitting produced empty result, skipping file {file_name}.")
                continue

            found_in_file = []
            for piece_no, piece in enumerate(pieces, start=1):
                # Safety margin slightly above the target chunk size.
                if len(piece) > 30000:
                    print(f"  ⚠️ Chunk {piece_no} has abnormal size ({len(piece)} chars), may cause issues. Skipping.")
                    continue

                print(f"🤖 Using Deepseek to process chunk {piece_no}/{piece_count} (max_tokens=7800)...")
                extracted = gpt_process_text(piece)
                if extracted:
                    found_in_file.extend(extracted)
                # Pause between requests.
                time.sleep(1.5)

            print(f"📄 Finished processing {file_name}, found {len(found_in_file)} relevant sentences.")
            collected.extend(found_in_file)

        except Exception as err:
            print(f"❌ Unexpected error while processing {file_name}: {err}")

    print(f"\n✅ All files processed, total number of extracted sentences: {len(collected)}")

    if not collected:
        print("⏹️ No relevant sentences extracted, output file will not be generated.")
        return

    print("📑 Saving results to document...")
    output_dir = os.path.dirname(output_doc_path)
    try:
        # An empty dirname means the current directory, which needs no
        # creation.
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"   Created output directory: {output_dir}")
        save_sentences_to_doc(collected, output_doc_path)
    except Exception as err:
        print(f"❌ Error creating output directory or saving file: {err}")


# --- Execution Block (Path generation unchanged) ---
if __name__ == "__main__":
    # TODO: set this to the directory that holds the numbered input files
    # (1.pdf, 2.pdf, ..., <num_files>.pdf). The value below is a placeholder.
    pdf_base_dir = "path/to/pdf_directory"
    num_files = 50
    if not os.path.isdir(pdf_base_dir):
        print(f"Error: Specified PDF base directory does not exist: {pdf_base_dir}")
        # Exit with a non-zero status so callers can detect the failure.
        # SystemExit also works where the interactive-only exit() helper
        # is not installed.
        raise SystemExit(1)
    pdf_files = [os.path.join(pdf_base_dir, f"{i}.pdf") for i in range(1, num_files + 1)]

    # TODO: set this to the .docx file the results should be written to.
    # The value below is a placeholder.
    output_doc = "path/to/output/extracted_sentences.docx"

    print(f"Will process {len(pdf_files)} PDF files from directory: {pdf_base_dir}")
    print("Using settings: chunk_size=25000, overlap=500, max_tokens=7800")
    print(f"Results will be saved to: {output_doc}")

    main(pdf_files, output_doc)