# In [None]:
import os
import json
import openai
import docx
from docx import Document
import time
import re
from tqdm import tqdm

# Configuration
# NOTE: placeholder paths -- point these at the real files before running.
WORD_DOC_PATH = "entities.docx"  # input Word document, one raw entity per paragraph
OUTPUT_JSON_PATH = "canonical_entities.json"  # merged canonical names as a JSON array
OUTPUT_DOCX_PATH = "canonical_entities.docx"  # merged canonical names as a numbered Word list
CHUNKS_COUNT = 8  # Split the document into 8 chunks for processing

# Set up API key
# Prompt when the variable is missing OR set to an empty string, so the client
# is never created with a blank key.
if not os.environ.get("DEEPSEEK_API_KEY"):
    os.environ["DEEPSEEK_API_KEY"] = input("Enter your Deepseek API Key: ").strip()

# Create Deepseek client (the OpenAI client class pointed at the Deepseek base URL)
client = openai.OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com/v1"
)

def load_entities_from_docx(doc_path):
    """Read the raw entity list out of a Word document.

    Every non-blank paragraph yields one entity, except paragraphs that start
    with "#" or "Step", which are skipped. A leading "<digits>." numbering
    prefix is stripped from the paragraph text.

    Args:
        doc_path: Location of the .docx file to read.

    Returns:
        The list of entity strings, or an empty list if anything raises while
        the document is being opened or read (the error is printed).
    """
    try:
        collected = []

        for paragraph in Document(doc_path).paragraphs:
            line = paragraph.text.strip()

            # Skip blanks, "#" lines and "Step ..." lines
            if not line or line.startswith(("#", "Step")):
                continue

            # Drop list numbering, e.g. "1.Entity" -> "Entity"
            if re.match(r'^\d+\.', line):
                line = line.partition('.')[2].strip()

            collected.append(line)

        print(f"✅ Successfully loaded {len(collected)} entities from document")
        return collected
    except Exception as err:
        print(f"❌ Error loading document: {str(err)}")
        return []

def split_into_chunks(entities, num_chunks):
    """Split the entity list into at most ``num_chunks`` contiguous chunks.

    Chunk sizes differ by at most one; the earliest chunks receive the extra
    items when the list does not divide evenly. Chunks that would be empty
    (fewer entities than ``num_chunks``) are omitted so that callers never
    spend an API call on an empty chunk.

    Args:
        entities: Sequence of entities to split (order is preserved).
        num_chunks: Maximum number of chunks to produce; must be >= 1.

    Returns:
        A list of non-empty slices of ``entities``; empty when ``entities``
        is empty.

    Raises:
        ValueError: If ``num_chunks`` is less than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")

    chunk_size, remainder = divmod(len(entities), num_chunks)

    chunks = []
    start = 0

    for i in range(num_chunks):
        # The first `remainder` chunks carry one extra item
        end = start + chunk_size + (1 if i < remainder else 0)
        if end == start:
            # Sizes never increase, so every remaining chunk would be empty too
            break
        chunks.append(entities[start:end])
        start = end

    return chunks

def process_entity_chunk(entities_chunk, chunk_index):
    """Resolve one chunk of raw entity names to canonical names via the Deepseek API.

    The chunk is rendered as a numbered list and appended to the instruction
    prompt, the chat-completion endpoint is called (up to 3 attempts, 10 s
    apart), and the JSON array in the reply is extracted.

    Args:
        entities_chunk: List of raw entity strings to canonicalize.
        chunk_index: Zero-based position of this chunk; used only in progress
            messages.

    Returns:
        The list of canonical names extracted from the reply. An empty list is
        returned when the chunk is empty, when the API call fails on every
        attempt, or when no JSON array can be extracted from the reply.
    """
    # Nothing to resolve: avoid a pointless (and billable) API round trip
    if not entities_chunk:
        print(f"⚠️ Chunk {chunk_index+1} is empty, skipping API call")
        return []

    # Format the entity list for the prompt
    entities_text = "\n".join([f"{i+1}.{entity}" for i, entity in enumerate(entities_chunk)])

    # Create the prompt based on the system role and requirements.
    # The actual input list is appended after the worked example; without it
    # the model would only ever see the example terms.
    prompt = f"""
You are an Entity-Resolution Specialist in concrete material science. Your job is to take a raw list of terms and return—in strict JSON array form—one unique, standardized Canonical Name for every distinct concept.
Workflow (in order)
1.	Analyze each term
Recognize domain items: cementitious materials, concrete types, properties, reactions, processes and failure modes.
2.	Group equivalent terms
Treat as identical when they differ only by
a)	synonyms or jargon (“Micro silica” = “Silica Fume”)
b)	abbreviations (“SF” = “Silica Fume”)
c)	case, spelling, singular/plural, minor wording.
3.	Select one Canonical Name per group
a)	Choose the standard, unambiguous term used in current literature.
b)	Prefer full names over acronyms—except universally accepted ones (e.g., “UHPC”).
c)	Use the singular form.
d)	Render in Title Case (“Silica Fume”, “Compressive Strength”).
e)	Ungrouped terms: normalize with rules 3 & 4 and keep.
4.	Return output

Your output must be a valid JSON array of strings only, alphabetically sorted, with no additional explanation.

Example:
Input: 
1.Silica Fume  
2.Microsilica  
3.SF  
4.Compressive strength  
5.Compressive Strength  
6.compression strength  
7.UHPC  
8.Fly Ash  
9.fly ash  
10.Curing Process  
11.Aggregate  
12.Aggregates  
13.Silica Fume

Expected Output:
["Aggregate", "Compressive Strength", "Curing Process", "Fly Ash", "Silica Fume", "UHPC"]

Now process the following list.

Input:
{entities_text}

IMPORTANT: Respond ONLY with the JSON array, beginning with [ and ending with ], nothing else.
"""

    try:
        print(f"🔄 Processing chunk {chunk_index+1}/{CHUNKS_COUNT} ({len(entities_chunk)} entities)...")

        # Call Deepseek API with retry logic
        max_retries = 3
        response = None

        for attempt in range(max_retries):
            try:
                response = client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.1,  # Lower temperature for more deterministic results
                    max_tokens=4000,
                    top_p=0.95,
                    timeout=120  # Longer timeout for processing larger chunks
                )
                break  # Success, exit retry loop
            except Exception as e:
                if attempt < max_retries - 1:
                    print(f"⚠️ Attempt {attempt+1} failed: {str(e)}, retrying in 10 seconds...")
                    time.sleep(10)
                else:
                    # Out of attempts: re-raise with the original traceback so
                    # the outer handler reports it
                    raise

        if not response:
            print(f"⚠️ No response received for chunk {chunk_index+1}")
            return []

        # The message content can be None; treat that as an empty reply
        response_content = (response.choices[0].message.content or "").strip()

        # Extract JSON array from response
        canonical_entities = extract_json_array(response_content)

        if canonical_entities:
            print(f"✅ Successfully processed chunk {chunk_index+1}: Found {len(canonical_entities)} canonical entities")
            return canonical_entities

        print(f"⚠️ Failed to extract valid JSON from chunk {chunk_index+1}")
        return []

    except Exception as e:
        # Best-effort per chunk: report the failure and let the other chunks proceed
        print(f"❌ Error processing chunk {chunk_index+1}: {str(e)}")
        return []

def _unescape_json_string(raw):
    """Decode JSON escapes in the body of a quoted string; return it unchanged if invalid."""
    try:
        return json.loads(f'"{raw}"')
    except json.JSONDecodeError:
        return raw


def extract_json_array(text):
    """Extract a list of strings from an API reply, with progressive fallbacks.

    Strategies, tried in order:
      1. Parse the whole reply (after removing markdown code fences) as JSON.
      2. Parse the span from the first "[" to the last "]" as JSON, which
         handles replies that wrap the array in prose.
      3. Pull double-quoted strings out of the first bracketed span, or out of
         the whole reply if no such span yields any (e.g. a truncated array).

    Only string items are returned: numbers, nulls and nested values are
    dropped. JSON escape sequences are decoded on every path.

    Args:
        text: Raw reply text.

    Returns:
        A list of strings; empty if nothing usable is found.
    """
    text = text.strip()

    # Remove markdown code fences (``` or ```json) if present
    if "```" in text:
        text = re.sub(r'```(?:json)?\s*|\s*```', '', text).strip()

    # 1) Try direct parsing
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, list):
        return [item for item in data if isinstance(item, str)]

    # 2) Try parsing the outermost bracketed span
    start = text.find('[')
    end = text.rfind(']')
    if 0 <= start < end:
        try:
            data = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            data = None
        if isinstance(data, list):
            strings = [item for item in data if isinstance(item, str)]
            if strings:
                return strings

    # 3) Regex extraction of quoted strings (tolerates malformed/truncated JSON)
    strings_pattern = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
    array_match = re.search(r'\[(.*?)\]', text, re.DOTALL)
    raw_strings = re.findall(strings_pattern, array_match.group(1)) if array_match else []
    if not raw_strings:
        # Final attempt: any quoted strings anywhere in the reply
        raw_strings = re.findall(strings_pattern, text)

    return [_unescape_json_string(raw) for raw in raw_strings]

def merge_canonical_entities(all_canonical_chunks):
    """Merge and deduplicate canonical entities from all chunks.

    Names are compared case-insensitively with runs of whitespace collapsed,
    so "Fly Ash", "fly ash" and " Fly  Ash " count as one entity. The first
    spelling seen is kept unless a later one is strictly longer.

    Args:
        all_canonical_chunks: Iterable of per-chunk result lists. A chunk may
            be None or empty (a failed chunk); non-string and blank items are
            ignored.

    Returns:
        The deduplicated names, sorted alphabetically without regard to case.
    """
    best_by_key = {}

    for chunk in all_canonical_chunks:
        for entity in chunk or []:
            if not isinstance(entity, str):
                continue

            # Collapse internal whitespace and trim the ends
            cleaned = " ".join(entity.split())
            if not cleaned:
                continue

            # casefold() is the case-insensitive comparison key
            key = cleaned.casefold()
            current = best_by_key.get(key)

            # Keep the best version (prefer longer version if available)
            if current is None or len(cleaned) > len(current):
                best_by_key[key] = cleaned

    # Case-insensitive alphabetical order; the exact spelling breaks ties
    return sorted(best_by_key.values(), key=lambda name: (name.casefold(), name))

def save_results(canonical_entities, json_path, docx_path):
    """Write the canonical entity list to a JSON file and to a Word document.

    The two outputs are independent: a failure in one is printed and does not
    prevent the other from being attempted.

    Args:
        canonical_entities: List of canonical entity names to persist.
        json_path: Destination of the JSON array (UTF-8, indented).
        docx_path: Destination of the Word report (heading, summary lines,
            divider, then one numbered paragraph per entity).
    """
    # --- JSON output ---
    try:
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(canonical_entities, json_file, indent=2, ensure_ascii=False)
        print(f"✅ Saved {len(canonical_entities)} canonical entities to {json_path}")
    except Exception as err:
        print(f"❌ Error saving JSON: {str(err)}")

    # --- DOCX output ---
    try:
        report = Document()
        report.add_heading("Canonical Entity Names for Concrete Materials Science", level=1)

        # Summary lines followed by a divider
        preamble = (
            f"Total unique entities: {len(canonical_entities)}",
            f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}",
            "=" * 50,
        )
        for line in preamble:
            report.add_paragraph(line)

        # One numbered paragraph per entity, counting from 1
        for position, name in enumerate(canonical_entities, start=1):
            report.add_paragraph(f"{position}. {name}")

        report.save(docx_path)
        print(f"✅ Saved canonical entities to Word document: {docx_path}")
    except Exception as err:
        print(f"❌ Error saving DOCX: {str(err)}")

def run_entity_resolution():
    """Run the whole pipeline: load, chunk, resolve each chunk, merge, save.

    Reads the input and output locations and the chunk count from the
    module-level configuration. Returns early, without writing any output,
    when no entities could be loaded.
    """
    print("🚀 Starting Entity Resolution for Concrete Materials Science")

    # Step 1: Load entities from Word document
    raw_entities = load_entities_from_docx(WORD_DOC_PATH)
    if not raw_entities:
        print("❌ No entities found. Exiting.")
        return

    # Step 2: Split entities into chunks
    batches = split_into_chunks(raw_entities, CHUNKS_COUNT)
    print(f"📊 Split {len(raw_entities)} entities into {len(batches)} chunks")

    # Step 3: Process each chunk, pausing between calls to avoid API rate limits
    last_index = len(batches) - 1
    per_batch_results = []

    for index, batch in enumerate(batches):
        per_batch_results.append(process_entity_chunk(batch, index))

        # No pause is needed after the final chunk
        if index != last_index:
            print("⏱️ Waiting 5 seconds before processing next chunk...")
            time.sleep(5)

    # Step 4: Merge results from all chunks
    merged = merge_canonical_entities(per_batch_results)
    print(f"🎯 Final result: {len(merged)} unique canonical entities")

    # Step 5: Save results
    save_results(merged, OUTPUT_JSON_PATH, OUTPUT_DOCX_PATH)

    print("✅ Entity Resolution process completed successfully!")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_entity_resolution()