# In [None]:
import os
import json
import openai
import docx
from docx import Document
from typing import List, Dict, Any
import time

# Ask for the Deepseek API key only when the environment does not already provide one.
if "DEEPSEEK_API_KEY" not in os.environ:
    # getpass reads the secret without echoing it, so the key does not end up
    # visible on the terminal or saved in notebook cell output the way a plain
    # input() prompt would leave it.
    import getpass

    os.environ["DEEPSEEK_API_KEY"] = getpass.getpass("Enter your Deepseek API Key: ").strip()

def load_sentences_from_docx(doc_path: str) -> List[str]:
    """Return the non-blank paragraph texts of a Word document.

    Each paragraph of the document at ``doc_path`` is stripped of surrounding
    whitespace; paragraphs that are empty after stripping are dropped. The
    remaining texts are returned in document order.
    """
    document = docx.Document(doc_path)
    collected: List[str] = []
    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if text:
            collected.append(text)
    return collected

def _collect_kept_sentences(gpt_output: str) -> List[str]:
    """Parse one batch reply and return the sentences the model marked as worth keeping.

    Malformed replies (invalid JSON, missing/non-list "results", non-object
    entries) are reported and skipped instead of raising, so one bad entry does
    not discard the rest of the batch.
    """
    try:
        payload = json.loads(gpt_output)
    except json.JSONDecodeError as e:
        print(f"⚠️ JSON parsing error: {e}\nReturned content:\n{gpt_output[:200]}...")
        return []

    entries = payload.get("results") if isinstance(payload, dict) else None
    if not isinstance(entries, list):
        print(f"⚠️ Returned format does not match expectation: {gpt_output[:100]}...")
        return []

    kept = []
    for entry in entries:
        if not isinstance(entry, dict):
            # A stray string/number in the array carries no judgment to act on.
            continue
        if entry.get("is_kg_worthy"):
            # Fall back to the original text when the model kept the sentence
            # but did not supply a refined version.
            refined_sentence = entry.get("refined_sentence") or entry.get("original_sentence")
            if refined_sentence:
                kept.append(refined_sentence)
                print(f"✅ Refined: {refined_sentence}")
        else:
            print(f"❌ Discarded: {entry.get('original_sentence', 'Unknown sentence')}")
    return kept


def batch_process_sentences(sentences: List[str], batch_size: int = 10) -> List[str]:
    """Filter and refine sentences through the Deepseek chat API, one batch per request.

    The sentences are split into consecutive groups of ``batch_size``. For each
    group a prompt is built with ``create_batch_prompt`` and sent to the
    "deepseek-chat" model with a JSON-object response format. Entries the model
    flags as ``is_kg_worthy`` contribute their refined sentence (or the original
    sentence when no refined text is given) to the result.

    A batch whose request fails, or whose reply is empty or malformed, is
    reported on stdout and skipped; it is not retried.

    Args:
        sentences: Sentences to evaluate.
        batch_size: Number of sentences sent per API request; must be >= 1.

    Returns:
        The kept sentences, in the order the model returned them.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the ``DEEPSEEK_API_KEY`` environment variable is not set
            (only checked when there is at least one sentence to process).
    """
    if batch_size < 1:
        # A zero step would make range() raise and a negative one would
        # silently process nothing; fail loudly in both cases.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if not sentences:
        # Nothing to send: avoid requiring an API key or creating a client.
        return []

    processed_sentences = []

    # The OpenAI client is pointed at Deepseek's OpenAI-compatible endpoint.
    client = openai.OpenAI(
        api_key=os.environ["DEEPSEEK_API_KEY"],
        base_url="https://api.deepseek.com/v1",
    )

    total = len(sentences)
    for i in range(0, total, batch_size):
        batch = sentences[i:i + batch_size]
        print(f"📦 Processing batch {i//batch_size + 1} ({i+1} to {min(i+batch_size, total)})")

        batch_prompt = create_batch_prompt(batch)

        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "user", "content": batch_prompt}],
                temperature=0.2,
                max_tokens=4096,  # room for a JSON entry per sentence in the batch
                response_format={"type": "json_object"},
            )
            # content may be None; treat that the same as an empty reply
            # instead of letting .strip() raise.
            gpt_output = (response.choices[0].message.content or "").strip()
        except Exception as e:
            # Best-effort per batch: report, back off longer, move on.
            print(f"⚠️ Batch processing failed: {str(e)}")
            time.sleep(3)
            continue

        if gpt_output:
            processed_sentences.extend(_collect_kept_sentences(gpt_output))
        else:
            print("⚠️ API returned empty response, skipping this batch")

        # Short delay after every completed request to avoid API rate limits.
        time.sleep(1)

    return processed_sentences

def create_batch_prompt(sentences: List[str]) -> str:
    """Build the instruction prompt sent to the model for one batch.

    The sentences are embedded as a pretty-printed JSON array (non-ASCII
    characters kept as-is) inside a fixed instruction text that asks for a
    JSON object with a "results" array, one entry per input sentence.
    """
    # Plain template filled with str.format: doubled braces render as literal
    # braces in the JSON example, and only {sentences_json} is substituted.
    template = """
    You are an advanced AI assistant trained for academic research.
    Your task is to evaluate and refine multiple research sentences for use in a **Knowledge Graph (KG)** related to **Ultra-High Performance Concrete (UHPC)**.

    ---
    **📌 Task 1: Sentence Judgment**
    1️⃣ **KEEP** sentences that contain **scientific knowledge that directly contributes to KG construction**, such as:
       - Numerical data (e.g., "UHPC has a compressive strength of 150 MPa").
       - Clear relationships between UHPC components, properties, and effects.
       - Experimental findings, performance results, or material compositions.
       - Key processing techniques and their effects on UHPC.
    2️⃣ **REMOVE** sentences that:
       - Are too vague or generic (e.g., "Concrete is widely used in construction").
       - Require extra context to be useful.
       - Reference figures, tables, or sections without details.
       - Lack clear meaning for KG.

    ---
    **📌 Task 2: Sentence Optimization**
    - If a sentence is useful, **refine it to improve clarity, precision, and academic quality**.
    - **Do NOT change the scientific meaning** or introduce unverified data.
    - Ensure each sentence is **grammatically correct, concise, and formal**.

    ---
    **Input Sentences (JSON Array):**
    {sentences_json}

    ---
    **✅ Output Format (Strict JSON Only):**
    Return a JSON object with a "results" array containing an object for each input sentence:
    ```json
    {{
        "results": [
            {{
                "original_sentence": "First sentence",
                "is_kg_worthy": true or false,
                "refined_sentence": "Refined first sentence or null if not worthy"
            }},
            {{
                "original_sentence": "Second sentence",
                "is_kg_worthy": true or false,
                "refined_sentence": "Refined second sentence or null if not worthy"
            }},
            ... and so on for each input sentence
        ]
    }}
    ```
    """
    encoded_batch = json.dumps(sentences, ensure_ascii=False, indent=2)
    return template.format(sentences_json=encoded_batch)

def save_sentences_to_doc(sentences: List[str], doc_path: str) -> None:
    """Write the sentences as a numbered list into a new Word document.

    A fresh document is created with a level-1 heading, followed by one
    paragraph per sentence prefixed with its 1-based position, then saved to
    ``doc_path``. The destination is printed once the file is written.
    """
    output = Document()
    output.add_heading("Step3_Text_Judgment_Result_2", level=1)

    numbered_lines = (
        f"{number}. {text}" for number, text in enumerate(sentences, start=1)
    )
    for line in numbered_lines:
        output.add_paragraph(line)

    output.save(doc_path)
    print(f"✅ Saved file at: {doc_path}")

def main(input_doc_path: str, output_doc_path: str, batch_size: int = 10) -> None:
    """Run the pipeline end to end: load, batch-process, and save sentences.

    Reads sentences from ``input_doc_path``, passes them through
    ``batch_process_sentences`` in groups of ``batch_size``, and writes the
    sentences that come back to ``output_doc_path``. Progress is printed at
    every stage.
    """
    # Stage 1: load the input document.
    print("📂 Reading input file...")
    raw_sentences = load_sentences_from_docx(input_doc_path)
    original_count = len(raw_sentences)
    print(f"📜 Number of original sentences: {original_count}")

    # Stage 2: let the model judge and refine the sentences.
    print(f"🤖 Deepseek batch processing (batch size = {batch_size})...")
    kept_sentences = batch_process_sentences(raw_sentences, batch_size)
    kept_count = len(kept_sentences)
    print(f"✨ Number of valid sentences: {kept_count}")

    # Stage 3: persist whatever was kept.
    print("📑 Saving results...")
    save_sentences_to_doc(kept_sentences, output_doc_path)

if __name__ == "__main__":
    # Script entry point: fixed input/output locations and the number of
    # sentences sent per API request (adjust as needed).
    batch_size = 10
    input_doc = r"D:\SIT\knowledge\Result\step2_Text_Judgment_Result.docx"
    output_doc = r"D:\SIT\knowledge\Result\step3_Text_Judgment_Result_2.docx"
    main(input_doc_path=input_doc, output_doc_path=output_doc, batch_size=batch_size)
