# In [None]:
import os
import json
import openai
import docx
from docx import Document
import time
import re

# 1️⃣ Deepseek API key bootstrap.
# The key is requested interactively only when the environment does not
# already provide it; the typed value is trimmed and stored in os.environ so
# the rest of the script can read it from there.
if os.environ.get("DEEPSEEK_API_KEY") is None:
    os.environ["DEEPSEEK_API_KEY"] = input("Enter your Deepseek API Key: ").strip()

# 2️⃣ Read text generated in Step 2 (unchanged)
def load_sentences_from_docx(doc_path):
    """Read the non-empty paragraphs of a Word document as sentences.

    A leading list number such as ``"12. "`` is removed from a paragraph;
    everything else is kept verbatim. Only that prefix is stripped, so
    paragraphs that merely start with a digit ("28 days of curing ...",
    "1.5 mm fibers ...") and paragraphs containing several sentences are
    returned intact instead of being emptied or having their ". "
    separators removed.

    Args:
        doc_path: Path of the .docx file to read.

    Returns:
        A list of paragraph texts in document order, without blank entries.
    """
    # "<digits>." followed by whitespace (or nothing else) at the very start.
    numbering_prefix = re.compile(r"^\d+\.(?:\s+|$)")

    sentences = []
    for para in docx.Document(doc_path).paragraphs:
        text = numbering_prefix.sub("", para.text.strip()).strip()
        if text:
            sentences.append(text)
    return sentences

# 3️⃣ Deepseek generates Cypher code and entities - modified batch logic
def generate_cypher_and_entities(sentences, batch_size=10):
    """Generate Cypher code and an entity list for every sentence.

    Each sentence is sent to the Deepseek chat endpoint in its own request.
    ``batch_size`` does not group sentences into one request; it only
    controls progress reporting and the longer pause inserted after every
    ``batch_size`` sentences.

    Args:
        sentences: List of sentence strings to process.
        batch_size: Number of sentences per reporting/pause group.

    Returns:
        A list with exactly one dict per input sentence, in input order.
        Each dict has the keys ``"sentence"``, ``"cypher"`` and
        ``"entities"``. When the request or the parsing fails, ``"cypher"``
        is ``""`` and ``"entities"`` is ``[]`` so indexes stay aligned with
        ``sentences``.
    """
    results = []  # Store result objects containing cypher and entities
    if not sentences:
        # Nothing to do: avoid requiring an API key / client for empty input.
        return results

    total_batches = (len(sentences) + batch_size - 1) // batch_size  # Round up

    # Create Deepseek client
    client = openai.OpenAI(
        api_key=os.environ["DEEPSEEK_API_KEY"],
        base_url="https://api.deepseek.com/v1"
    )

    # Keep the original detailed prompt template.
    # NOTE: backslashes that must reach the model (\" and \n inside the JSON
    # example) are doubled here because this is a normal, non-raw string.
    prompt_template = """
    You are an expert in knowledge graph construction.
    Your task is twofold for the given natural language statement:
    1.  Convert the statement into Neo4j Cypher code, primarily using **MERGE** statements to represent the nodes and relationships. Focus on accurately capturing the information within this single sentence.
    2.  Extract and list all unique **entity names** that you used as node identifiers (e.g., the value in `{name: 'EntityName'}`) within the generated Cypher code.

    Follow these instructions strictly:

    ## Identify Entities (Nodes)
    - Extract key entities from the input sentence.
    - Each entity should have an appropriate label:
      - **Material** → (`Material`) e.g., `UHPC`, `Silica Fume`
      - **Product** → (`Product`) e.g., `UHPC-Based Bridge Deck`
      - **Process** → (`Process`) e.g., `Curing`, `Casting`
      - **Property** → (`Property`) e.g., `Compressive Strength`, `Durability`
      - **Effect/Benefit** → (`Benefit`) e.g., `Cost-Effective`, `Reduced Carbon Footprint`
      - Use entity names that closely reflect the terms found in the sentence for now. Precise global normalization and deduplication across sentences will be handled in a later step. Base the `name` property primarily on the text.

    ## Define Relationships
    - Use clear, meaningful relationships between entities.
    - Relationship examples:
     - **Material Usage** → `:USED_IN`
     - **Replacement** → `:REPLACES`
     - **Effects** → `:IMPROVES`, `:DECREASES`, `:CONTRIBUTES_TO`, `:EXHIBITS_PROPERTY`
     - **Processing Methods** → `:HAS_PROCESSING_METHOD`, `:REQUIRES_PROCESS`
     - **Testing & Evaluation** → `:EVALUATED_BY`
     - **Process Execution** → `:UNDERGOES`, `:OCCURS_AT`
     - (Keep other relevant relationship types you had)

    ## Output Format (Strict JSON)
    - You **MUST** return the output ONLY as a single, valid JSON object.
    - Do NOT include any text, explanations, or markdown formatting before or after the JSON object.
    - The JSON object must contain exactly two keys: `"cypher"` and `"entities"`.
    - The value for the `"cypher"` key must be a single string containing all the generated Neo4j Cypher code for the input sentence. Use `\\n` for newlines within the Cypher string. Use `MERGE` statements.
    - The value for the `"entities"` key must be a JSON array of strings. This array should list all unique entity names used as the `name` property value within the generated Cypher code (e.g., if you have `{name: "UHPC"}` and `{name: "Silica Fume"}`, the list should include `"UHPC"` and `"Silica Fume"`).

    ---
    ## 4️⃣ Avoid These Common Mistakes
    ### 1️⃣ Ambiguity in Relationships
    - Relationships must be clearly defined.
    - ✅ Example:  
      MERGE (glass:Material {name: "Glass"})-[:REDUCES]->(internalFriction:Property {name: "Internal Friction"})
    - ❌ Avoid unclear chains like:  
      MERGE (glass:Material {name: "Glass"})-[:REDUCES]->(internalFriction:Property {name: "Internal Friction"})-[:AFFECTS]->(flow:Property {name: "Flow"})
    - Specify whether the effect is increasing or decreasing.

    ---
    ### 2️⃣ Errors
    - Do not assign incorrect properties to entities.
    - ✅ Example:  
      MERGE (uhpc:Product {name: "UHPC"})-[:HAS_PROCESSING_METHOD]->(hotCuring:Process {name: "Hot Curing"})
      MERGE (hotCuring)-[:HAS]->(curingTemperature:Property {name: "Curing Temperature"})
    - ❌ Incorrect:  
      MERGE (uhpc:Product {name: "UHPC"})-[:HAS_PROPERTY]->(curingTemperature:Property {name: "Curing Temperature"})

    ---
    ### 3️⃣ Redundancy
    - Avoid multiple nodes for the same concept (e.g., `"Waste Glass"`, `"Glass Particles"`, `"Fine Glass"` → should all be `"Glass"`).
    - Use a **consistent naming convention** for entities.

    ---
    ### 4️⃣ Inconsistency
    - Use **consistent labels** and **relationship types** for the same entity across different sentences.
    - ✅ **Correct & Consistent**  
      "Glass sand can be efficiently used to produce UHPC."  
      MERGE (glassSand:Material {name: "Glass Sand"})-[:USED_IN]->(uhpc:Product {name: "UHPC"})
      "The glass sand can increase the workability of UHPC."  
      MERGE (glassSand:Material {name: "Glass Sand"})-[:IMPROVES]->(workability:Property {name: "Workability"})
    - ❌ **Incorrect & Inconsistent**  
      "Glass sand can be efficiently used to produce UHPC."  
      MERGE (uhpc:Product {name: "UHPC"})-[:USES]->(glassSand:Material {name: "Glass Sand"})
      "The glass sand can increase the workability of UHPC."  
      MERGE (glassSand:Material {name: "Glass Sand"})-[:CONTRIBUTES_TO]->(workability:Property {name: "Workability"})

    ---
    ## 5️⃣ Cypher Code Generation Example
    ### Example: Glass Sand in UHPC
    **Input:**  
    "Glass sand can be efficiently used to produce UHPC and eliminate the need for quartz sand, yielding a cost-effective and environmentally friendly solution."

    **Expected Output:**
    ```json
    {
      "cypher": "MERGE (glassSand:Material {name: \\"Glass Sand\\"})\\nMERGE (quartzSand:Material {name: \\"Quartz Sand\\"})\\nMERGE (uhpc:Product {name: \\"UHPC\\"})\\nMERGE (costEffectiveness:Benefit {name: \\"Cost-Effective\\"})\\nMERGE (environmentalBenefit:Benefit {name: \\"Environmentally Friendly\\"})\\n\\nMERGE (glassSand)-[:USED_IN]->(uhpc)\\nMERGE (glassSand)-[:REPLACES]->(quartzSand)\\nMERGE (glassSand)-[:YIELDS]->(costEffectiveness)\\nMERGE (glassSand)-[:YIELDS]->(environmentalBenefit)",
      "entities": ["Glass Sand", "Quartz Sand", "UHPC", "Cost-Effective", "Environmentally Friendly"]
    }
    ```

    ---
    Now, please process the following sentence and provide ONLY a valid JSON object with cypher code and entities list:

    "{sentence}"
    """

    last_index = len(sentences) - 1

    # Process each sentence individually; batches only drive reporting/pauses.
    for i, sentence in enumerate(sentences):
        batch_index = i // batch_size + 1
        batch_complete = (i + 1) % batch_size == 0
        print(f"🔄 Processing batch {batch_index}/{total_batches} (sentence {i+1}/{len(sentences)})...")

        # str.replace (not str.format) because the template is full of
        # literal braces from the Cypher examples.
        single_prompt = prompt_template.replace("{sentence}", sentence)

        try:
            response = _request_completion(client, single_prompt)
            response_content = response.choices[0].message.content.strip()
            result = parse_single_response(response_content, sentence)
        except Exception as e:
            # Best effort: one failing sentence must not abort the whole run.
            print(f"⚠️ Sentence {i+1} processing failed: {str(e)}")
            # Add empty result on failure to keep index aligned
            result = {"sentence": sentence, "cypher": "", "entities": []}
        else:
            if result:
                print(f"✅ Sentence {i+1} generated successfully!")
            else:
                print(f"⚠️ Sentence {i+1} parsing failed, using empty result")
                result = {"sentence": sentence, "cypher": "", "entities": []}

        results.append(result)

        # Batch success stats. Reported outside the try block so a failure on
        # the batch's final sentence no longer suppresses the summary.
        if batch_complete or i == last_index:
            start_idx = (batch_index - 1) * batch_size
            curr_batch = results[start_idx:]
            valid_count = sum(1 for r in curr_batch if r.get('cypher') and r.get('entities'))
            print(f"✅ Batch {batch_index} results: {valid_count}/{len(curr_batch)} sentences valid")

        # Pause after each sentence to avoid API throttling
        if i < last_index:
            print("⏱️ Pausing 2 seconds to avoid API throttling...")
            time.sleep(2)

        # Longer pause after each completed batch
        if batch_complete and i < last_index:
            print(f"⏱️ Finished batch {batch_index}/{total_batches}, pausing 5 seconds...")
            time.sleep(5)

    return results


def _request_completion(client, prompt, max_retries=3, retry_delay=5):
    """Send one prompt to the chat endpoint, retrying on any error.

    Returns the completion response; re-raises the last error once
    ``max_retries`` attempts have failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,  # Keep original temperature
                max_tokens=8000,  # Keep large token limit
                top_p=0.95,
                timeout=60,  # Per-request timeout in seconds
            )
        except Exception as e:
            if attempt == max_retries:
                raise  # Reached max retries; keep the original traceback
            print(f"⚠️ Attempt {attempt} failed: {str(e)}, retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

# New function: parse single sentence response
def parse_single_response(response_text, original_sentence):
    """Extract the ``cypher``/``entities`` payload from one model reply.

    The reply is expected to be a single JSON object, but it may be wrapped
    in markdown fences, surrounded by prose, or be slightly invalid JSON.
    These strategies are tried in order, returning on the first success:

    1. Parse the whole (fence-stripped) text as JSON.
    2. Decode a JSON object starting at each ``{`` found in the text.
    3. Repeat step 2 leniently: invalid backslash escapes are doubled and
       raw control characters (e.g. literal newlines) are accepted inside
       strings, so line breaks in the Cypher are preserved.
    4. Locate the ``"cypher"`` string and the ``"entities"`` array with
       regular expressions and JSON-decode the captured string literals.

    Args:
        response_text: Raw text content returned by the model.
        original_sentence: The sentence the reply was generated for; stored
            under the ``"sentence"`` key of the result for tracking.

    Returns:
        A dict containing at least ``"cypher"``, ``"entities"`` and
        ``"sentence"``, or ``None`` when nothing usable could be recovered.
    """
    if not isinstance(response_text, str):
        return None

    # Remove markdown code block markers
    cleaned_text = response_text
    if "```" in cleaned_text:
        cleaned_text = cleaned_text.replace("```json", "").replace("```", "").strip()

    # 1. Try direct JSON parse
    try:
        result = _as_result(json.loads(cleaned_text), original_sentence)
    except json.JSONDecodeError:
        result = None
    if result is not None:
        return result

    # 2. The object may be embedded in surrounding text
    result = _scan_for_result(cleaned_text, original_sentence, json.JSONDecoder())
    if result is not None:
        return result

    # 3. Repair common issues (bad escapes, raw control characters)
    result = _scan_for_result(
        _repair_escapes(cleaned_text),
        original_sentence,
        json.JSONDecoder(strict=False),
    )
    if result is not None:
        return result

    # 4. Final attempt: pick the two fields out of otherwise broken JSON
    cypher_match = re.search(
        r'"cypher"\s*:\s*' + _JSON_STRING_PATTERN, cleaned_text, re.DOTALL
    )
    if cypher_match is None:
        return None

    # DOTALL so an array spread over several lines is still matched.
    entities_match = re.search(r'"entities"\s*:\s*\[(.*?)\]', cleaned_text, re.DOTALL)
    raw_entities = []
    if entities_match:
        raw_entities = re.findall(_JSON_STRING_PATTERN, entities_match.group(1), re.DOTALL)

    return {
        "sentence": original_sentence,
        "cypher": _decode_json_string(cypher_match.group(1)),
        "entities": [_decode_json_string(raw) for raw in raw_entities],
    }


# Characters that may legally follow a backslash inside a JSON string.
_VALID_JSON_ESCAPES = frozenset('"\\/bfnrtu')

# A JSON string literal; group 1 is its still-escaped content. A backslash is
# always consumed together with the next character, so an escaped quote can
# never be mistaken for the end of the string.
_JSON_STRING_PATTERN = r'"((?:[^"\\]|\\.)*)"'


def _as_result(data, original_sentence):
    """Tag *data* with the sentence if it is a usable result dict, else None."""
    if isinstance(data, dict) and "cypher" in data and "entities" in data:
        # Add original sentence for tracking
        data["sentence"] = original_sentence
        return data
    return None


def _scan_for_result(text, original_sentence, decoder):
    """Return the first usable result decodable from any '{' position in *text*."""
    start = text.find("{")
    while start != -1:
        try:
            data, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            data = None
        result = _as_result(data, original_sentence)
        if result is not None:
            return result
        start = text.find("{", start + 1)
    return None


def _repair_escapes(text):
    """Double every backslash that does not start a valid JSON escape."""
    def fix(match):
        pair = match.group(0)
        return pair if pair[1] in _VALID_JSON_ESCAPES else "\\" + pair

    # Matching backslash + next char as a pair keeps "\\\\x" sequences intact.
    return re.sub(r"\\(.)", fix, text, flags=re.DOTALL)


def _decode_json_string(raw):
    """Decode the escaped content of a JSON string; fall back to the raw text."""
    for candidate in (raw, _repair_escapes(raw)):
        try:
            return json.loads('"' + candidate + '"', strict=False)
        except json.JSONDecodeError:
            continue
    return raw

# 4️⃣ Save Cypher code and entities to different files
def save_results_to_files(results, cypher_doc_path, entities_doc_path, total_sentences):
    """Write the generation results to two Word documents and a JSON file.

    Three files are produced:

    * ``cypher_doc_path``: summary statistics followed by each sentence and
      its Cypher code.
    * ``entities_doc_path``: summary statistics, the entities of each
      sentence, and a sorted list of the unique entity names.
    * ``<cypher_doc_path without extension>_complete.json``: ``results``
      dumped unchanged.

    Args:
        results: List of result dicts with the keys ``"sentence"``,
            ``"cypher"`` and ``"entities"``.
        cypher_doc_path: Output path of the Cypher .docx file.
        entities_doc_path: Output path of the entities .docx file.
        total_sentences: Number of input sentences, used for the statistics.
    """
    # Save Cypher code
    cypher_doc = Document()
    cypher_doc.add_heading("Step3_Cypher_Code_Generation", level=1)

    # Add summary info
    valid_results = [r for r in results if r.get('cypher')]
    # An empty input document gives total_sentences == 0; report 0% instead
    # of failing with ZeroDivisionError.
    generation_rate = len(valid_results) / total_sentences * 100 if total_sentences else 0.0
    cypher_doc.add_paragraph(f"Total sentences: {total_sentences}")
    cypher_doc.add_paragraph(f"Number of generated Cypher codes: {len(valid_results)}")
    cypher_doc.add_paragraph(f"Generation rate: {generation_rate:.2f}%")
    cypher_doc.add_paragraph("---")

    # Add each Cypher code
    for i, result in enumerate(results, 1):
        if 'sentence' in result:
            cypher_doc.add_paragraph(f"Sentence {i}: {result['sentence']}")

        if result.get('cypher'):
            # The model sometimes double-escapes newlines; turn the literal
            # backslash-n sequences into real line breaks. str() guards
            # against a non-string value coming back from the model.
            cypher_code = str(result['cypher']).replace('\\n', '\n')
            cypher_doc.add_paragraph(f"Cypher {i}: {cypher_code}")
        else:
            cypher_doc.add_paragraph(f"Cypher {i}: [No valid code generated]")

        cypher_doc.add_paragraph("---")

    cypher_doc.save(cypher_doc_path)
    print(f"✅ Cypher code successfully saved to {cypher_doc_path}")

    # Save entities
    entities_doc = Document()
    entities_doc.add_heading("Step3_Extracted_Entities", level=1)

    # Normalise once so joining/sorting below cannot fail on non-string items.
    entities_per_result = [_entity_names(result) for result in results]
    all_entities = [name for names in entities_per_result for name in names]
    unique_entities = sorted(set(all_entities))

    # Add summary info
    entities_doc.add_paragraph(f"Total sentences: {total_sentences}")
    entities_doc.add_paragraph(f"Extracted entities: {len(all_entities)}")
    entities_doc.add_paragraph(f"Unique entities: {len(unique_entities)}")
    entities_doc.add_paragraph("---")

    # Add entities by sentence
    entities_doc.add_heading("Entities by Sentence:", level=2)
    for i, (result, names) in enumerate(zip(results, entities_per_result), 1):
        if 'sentence' in result:
            entities_doc.add_paragraph(f"Sentence {i}: {result['sentence']}")

        if names:
            entity_list = ", ".join(names)
            entities_doc.add_paragraph(f"Entities {i}: {entity_list}")
        else:
            entities_doc.add_paragraph(f"Entities {i}: [No entities extracted]")

        entities_doc.add_paragraph("---")

    # Add unique entity list
    entities_doc.add_heading("Unique Entities:", level=2)
    for i, entity in enumerate(unique_entities, 1):
        entities_doc.add_paragraph(f"{i}. {entity}")

    entities_doc.save(entities_doc_path)
    print(f"✅ Entities successfully saved to {entities_doc_path}")

    # Save complete results as JSON
    json_path = os.path.splitext(cypher_doc_path)[0] + "_complete.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"✅ Complete results saved as JSON: {json_path}")


def _entity_names(result):
    """Return the entities of *result* as a list of strings (empty if none)."""
    entities = result.get('entities')
    if not entities:
        return []
    if isinstance(entities, (list, tuple, set)):
        return [str(name) for name in entities]
    # A lone value (e.g. a single string) is treated as one entity.
    return [str(entities)]

# 5️⃣ Main program (restructured to process sentences one by one)
def main(input_doc_path, cypher_output_path, entities_output_path, batch_size=10):
    print("📂 Reading input file...")
    sentences = load_sentences_from_docx(input_doc_path)
    print(f"📜 Number of original sentences: {len(sentences)}")

    print(f"🤖 Starting Cypher code generation and entity extraction...")
    results = generate_cypher_and_entities(sentences, batch_size)
    
    valid_cypher_count = sum(1 for r in results if r.get('cypher'))
    valid_entities_count = sum(1 for r in results if r.get('entities') and len(r['entities']) > 0)
    
    print(f"🔍 Valid Cypher code generation rate: {valid_cypher_count}/{len(sentences)} ({(valid_cypher_count/len(sentences)*100):.2f}%)")
    print(f"📊 Valid entity extraction rate: {valid
