In [None]:
import json
from openai import OpenAI
from typing import Dict, List, Set
from pathlib import Path
from collections import defaultdict
import PyPDF2
import os
import re

In [1]:
class ScientificArticleExtractor:
    def __init__(self, api_key: str = None):
        """Create the API client and the vocabulary of accepted entity labels.

        Args:
            api_key: Key for the OpenAI-compatible NVIDIA endpoint. When it is
                omitted (or empty) the ``NVIDIA_API_KEY`` environment variable
                is tried next, so the key does not have to be pasted into the
                source; the original placeholder string is kept as the last
                fallback.
        """
        import os  # local import so this method does not depend on cell order

        resolved_key = (
            api_key
            or os.environ.get("NVIDIA_API_KEY")
            or "paste your api key here"
        )
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=resolved_key,
        )
        # Known entities keyed by id; replaced by load_existing_knowledge().
        self.existing_entities = {}
        # Entity "type" values accepted by process_article(); others are skipped.
        self.valid_types = {
            'theorem', 'equation', 'framework', 'concept',
            'method', 'policy_based', 'value_based', 'hybrid',
            'algorithm', 'variant', 'improvement', 'base_algorithm',
            'domain', 'benchmark', 'field'
        }
        # Layer names offered to the model in the extraction prompt.
        self.valid_layers = {
            'foundation', 'theoretical', 'algorithmic', 'implementation'
        }

    def load_existing_knowledge(self, knowledge_file: str):
        """Load existing knowledge graph and extract valid entity structures."""
        try:
            with open(knowledge_file, 'r', encoding='utf-8') as f:
                knowledge = json.load(f)
                self.existing_entities = knowledge['entities']
                print(f"Loaded {len(self.existing_entities)} existing entities")
        except Exception as e:
            print(f"Error loading existing knowledge: {e}")
            self.existing_entities = {}

    def extract_pdf_text(self, pdf_path: str) -> str:
        """Extract text from PDF file."""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text() + "\n"
                return text
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
            return ""

    def create_extract_prompt(self, text: str, article_reference: str) -> str:
        type_options = "|".join(self.valid_types)
        layer_options = "|".join(self.valid_layers)
        
        return f"""Extract key reinforcement learning entities from this scientific article.
Focus on identifying concepts, methods, or algorithms while maintaining consistency with existing knowledge organization.

Format as JSON:
{{
    "entities": [
        {{
            "id": "unique_snake_case_id",
            "name": "Full Name",
            "type": "{type_options}",
            "definition": "Clear, precise definition under 50 words",
            "domains": ["reinforcement_learning"],
            "properties": [
                {{
                    "name": "layer",
                    "value": "{layer_options}"
                }},
                {{
                    "name": "key_contribution",
                    "value": "Main insight or improvement"
                }},
                {{
                    "name": "scientific_paper",
                    "value": "{article_reference}"  
                }}
            ]
        }}
    ]
}}

Important guidelines:
1. Use ONLY the specified types and layers listed above
2. Skip mathematical formulations and equations
3. Focus on high-level concepts and their practical implications
4. Each entity must have a layer property
5. Keep definitions concise and clear
6. Ensure properties capture key insights without mathematical detail

Text to analyze:
{text[:8000]}
"""

    def update_entity(self, new_entity: Dict, article_reference: str) -> Dict:
        """Update existing entity or create new one with proper layer handling."""
        entity_id = new_entity['id']
        
        if entity_id in self.existing_entities:
            existing = self.existing_entities[entity_id]
            
            # Initialize properties if needed
            if 'properties' not in existing:
                existing['properties'] = []
            
            # Create a set of existing property names
            existing_props = {p['name'] for p in existing['properties']}
            
            # Add new properties while preserving layer
            for prop in new_entity.get('properties', []):
                if prop['name'] == 'layer':
                    continue  # Skip layer property as we keep the original
                if prop['name'] not in existing_props:
                    existing['properties'].append(prop)
            
            return existing
        else:
            # Ensure new entity has required structure
            if 'properties' not in new_entity:
                new_entity['properties'] = []
            
            # Ensure the entity has a layer property
            has_layer = any(p['name'] == 'layer' for p in new_entity['properties'])
            if not has_layer:
                new_entity['properties'].append({
                    'name': 'layer',
                    'value': 'algorithmic'  # Default layer if none specified
                })
            
            return new_entity



    def clean_json_response(self, response_text: str) -> dict:
        """Clean and parse JSON from API response."""
        try:
            # First attempt: direct JSON parsing
            try:
                return json.loads(response_text)
            except json.JSONDecodeError:
                pass

            # Second attempt: find JSON between curly braces
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    pass

            # Third attempt: clean up common issues and try again
            cleaned_text = response_text.strip()
            # Remove any markdown code block markers
            cleaned_text = re.sub(r'```json\s*', '', cleaned_text)
            cleaned_text = re.sub(r'```\s*', '', cleaned_text)
            # Try to find and clean the JSON object
            json_match = re.search(r'\{.*\}', cleaned_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                # Remove any invalid escape characters
                json_str = json_str.encode('utf-8').decode('unicode_escape')
                return json.loads(json_str)

            print(f"Could not parse JSON response. Raw response:\n{response_text[:500]}...")
            return {"entities": []}

        except Exception as e:
            print(f"Error cleaning JSON: {str(e)}")
            return {"entities": []}

    def process_article(self, pdf_path: str) -> Dict:
        """Process a single scientific article."""
        try:
            article_reference = Path(pdf_path).stem
            text = self.extract_pdf_text(pdf_path)
            if not text:
                return {}
            
            # Print some debugging info about the extracted text
            print(f"\nProcessing {article_reference}")
            print(f"Extracted {len(text)} characters of text")
            
            completion = self.client.chat.completions.create(
                model="nvidia/llama-3.1-nemotron-70b-instruct",
                messages=[{
                    "role": "user",
                    "content": self.create_extract_prompt(text, article_reference)
                }],
                temperature=0.3,
                max_tokens=2048
            )
            
            if not completion.choices:
                print(f"No response received for {article_reference}")
                return {}
                
            response = completion.choices[0].message.content
            print("Received response from API, attempting to parse...")
            
            extracted = self.clean_json_response(response)
            if not extracted:
                print(f"Failed to extract valid JSON from response for {article_reference}")
                return {}
                
            if 'entities' not in extracted:
                print(f"No entities field found in response for {article_reference}")
                return {}

            # Update entities
            updated_entities = {}
            for entity in extracted.get('entities', []):
                if entity.get('type') in self.valid_types:
                    updated = self.update_entity(entity, article_reference)
                    updated_entities[updated['id']] = updated
                else:
                    print(f"Skipping entity with invalid type: {entity.get('type')}")
            
            # Print processing summary
            print(f"Successfully processed {article_reference}:")
            print(f"- Found {len(extracted['entities'])} entities")
            print(f"- Added {len(updated_entities)} valid entities")
            
            return {
                'entities': updated_entities,
                'article_reference': article_reference
            }
            
        except Exception as e:
            print(f"Error processing article {pdf_path}:")
            print(f"Error details: {str(e)}")
            import traceback
            traceback.print_exc()
            return {}


    def process_articles_directory(self, articles_dir: str, output_file: str):
        """Process all PDF articles in directory and update knowledge graph."""
        articles_path = Path(articles_dir)
        all_new_entities = {}
        
        for pdf_file in articles_path.glob("*.pdf"):
            print(f"Processing article: {pdf_file.name}")
            article_data = self.process_article(str(pdf_file))
            
            if article_data and 'entities' in article_data:
                all_new_entities.update(article_data['entities'])
        
        # Merge with existing knowledge
        merged_entities = {**self.existing_entities, **all_new_entities}
        
        # Create updated knowledge graph
        updated_knowledge = {
            'entities': merged_entities,
            'metadata': {
                'total_entities': len(merged_entities),
                'new_entities_added': len(all_new_entities)
            }
        }
        
        # Save updated knowledge graph
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(updated_knowledge, f, indent=2)
        
        return updated_knowledge

def main():
    """Load the existing graph, process the article directory, print a summary."""
    # Input/output locations (placeholders to be edited before running)
    knowledge_file = r"path to entities.json"
    articles_dir = r"directory containing pdf articles"
    output_file = r"updated_knowledge_graph.json"

    extractor = ScientificArticleExtractor()
    extractor.load_existing_knowledge(knowledge_file)

    try:
        updated_knowledge = extractor.process_articles_directory(articles_dir, output_file)
    except Exception as e:
        print(f"Error during knowledge graph update: {e}")
        return

    try:
        print("\nKnowledge Graph Update Complete:")
        print(f"Total Entities: {updated_knowledge['metadata']['total_entities']}")
        print(f"New Entities Added: {updated_knowledge['metadata']['new_entities_added']}")
        print(f"Output saved to: {output_file}")
    except Exception as e:
        print(f"Error during knowledge graph update: {e}")


if __name__ == "__main__":
    main()

Loaded 64 existing entities
Processing article: 1702.02302.pdf

Processing 1702.02302
Extracted 28382 characters of text
Received response from API, attempting to parse...
Successfully processed 1702.02302:
- Found 5 entities
- Added 5 valid entities
Processing article: 1702.03118.pdf

Processing 1702.03118
Extracted 39504 characters of text
Received response from API, attempting to parse...
Successfully processed 1702.03118:
- Found 6 entities
- Added 6 valid entities
Processing article: 1702.06054.pdf

Processing 1702.06054
Extracted 64399 characters of text
Received response from API, attempting to parse...
Successfully processed 1702.06054:
- Found 6 entities
- Added 6 valid entities
Processing article: 1702.07490.pdf

Processing 1702.07490
Extracted 32639 characters of text
Received response from API, attempting to parse...
Successfully processed 1702.07490:
- Found 5 entities
- Added 5 valid entities
Processing article: 1702.08635.pdf

Processing 1702.08635
Extracted 44402 charac

Traceback (most recent call last):
  File "C:\Users\thinkpad\AppData\Local\Temp\ipykernel_33228\2173312714.py", line 185, in process_article
    completion = self.client.chat.completions.create(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_utils\_utils.py", line 275, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\resources\chat\completions.py", line 829, in create
    return self._post(
           ^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_base_client.py", line 1278, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_base_client.py", line 955, in requ

Received response from API, attempting to parse...
Successfully processed 1902.05772:
- Found 6 entities
- Added 6 valid entities
Processing article: 2001.03359.pdf

Processing 2001.03359
Extracted 48000 characters of text
Received response from API, attempting to parse...
Successfully processed 2001.03359:
- Found 6 entities
- Added 6 valid entities
Processing article: 2001.05012.pdf

Processing 2001.05012
Extracted 61421 characters of text
Received response from API, attempting to parse...
Successfully processed 2001.05012:
- Found 4 entities
- Added 4 valid entities
Processing article: 2001.06487.pdf

Processing 2001.06487
Extracted 70916 characters of text
Received response from API, attempting to parse...
Successfully processed 2001.06487:
- Found 6 entities
- Added 6 valid entities
Processing article: 2002.00444.pdf

Processing 2002.00444
Extracted 116124 characters of text
Received response from API, attempting to parse...
Successfully processed 2002.00444:
- Found 6 entities
- 

Traceback (most recent call last):
  File "C:\Users\thinkpad\AppData\Local\Temp\ipykernel_33228\2173312714.py", line 185, in process_article
    completion = self.client.chat.completions.create(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_utils\_utils.py", line 275, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\resources\chat\completions.py", line 829, in create
    return self._post(
           ^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_base_client.py", line 1278, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thinkpad\anaconda3\envs\langc\Lib\site-packages\openai\_base_client.py", line 955, in requ


Processing 2101.05970
Extracted 55409 characters of text
Received response from API, attempting to parse...
Successfully processed 2101.05970:
- Found 5 entities
- Added 5 valid entities
Processing article: 2101.08074.pdf

Processing 2101.08074
Extracted 35614 characters of text
Received response from API, attempting to parse...
Successfully processed 2101.08074:
- Found 5 entities
- Added 5 valid entities
Processing article: 2102.00735.pdf

Processing 2102.00735
Extracted 27371 characters of text
Received response from API, attempting to parse...
Successfully processed 2102.00735:
- Found 5 entities
- Added 5 valid entities
Processing article: 2102.03467.pdf

Processing 2102.03467
Extracted 30202 characters of text
Received response from API, attempting to parse...
Skipping entity with invalid type: neural_architecture
Successfully processed 2102.03467:
- Found 6 entities
- Added 5 valid entities
Processing article: 2102.05249.pdf

Processing 2102.05249
Extracted 24969 characters of 

In [1]:
class LayeredRelationshipExtractor:
    def __init__(self, api_key: str = None):
        """Create the API client and empty relationship/entity stores.

        Args:
            api_key: Key for the OpenAI-compatible NVIDIA endpoint. When it is
                omitted (or empty) the ``NVIDIA_API_KEY`` environment variable
                is tried next, so the key does not have to be pasted into the
                source; the original placeholder string is kept as the last
                fallback.
        """
        import os  # local import so this method does not depend on cell order

        resolved_key = (
            api_key
            or os.environ.get("NVIDIA_API_KEY")
            or "your api key here"
        )
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=resolved_key,
        )
        # Both are filled by load_existing_data().
        self.existing_relationships = []
        self.original_entities = {}

    def load_existing_data(self, original_kg_file: str, relationships_file: str):
        """Load existing knowledge graph and relationships."""
        try:
            # Load original knowledge graph
            with open(original_kg_file, 'r', encoding='utf-8') as f:
                original_data = json.load(f)
                self.original_entities = original_data['entities']
            print(f"Loaded {len(self.original_entities)} original entities")

            # Load existing relationships
            with open(relationships_file, 'r', encoding='utf-8') as f:
                rel_data = json.load(f)
                self.existing_relationships = rel_data.get('relationships', [])
            print(f"Loaded {len(self.existing_relationships)} existing relationships")
        except Exception as e:
            print(f"Error loading existing data: {e}")
            self.original_entities = {}
            self.existing_relationships = []

    def determine_layer(self, entity_data: Dict) -> str:
        """Determine which layer an entity belongs to based on its properties."""
        if 'type' in entity_data:
            entity_type = entity_data['type'].lower()
            
            if entity_type in ['theorem', 'equation', 'principle', 'proof', 'definition', 
                             'framework', 'concept', 'property', 'space', 'function']:
                return 'foundation_layer'
            elif entity_type in ['value_based', 'policy_based', 'model_based', 'hybrid', 'method']:
                return 'method_layer'
            elif entity_type in ['algorithm', 'base_algorithm', 'variant', 'improvement', 'extension']:
                return 'algorithm_layer'
            elif entity_type in ['field', 'benchmark', 'use_case', 'environment', 'task', 'domain']:
                return 'application_layer'
        
        return 'foundation_layer'

    def create_relationship_prompt(self, entity_id: str, entity: Dict, all_entities: Dict) -> str:
        source_layer = self.determine_layer(entity)
        
        entities_by_layer = {
            'foundation_layer': [],
            'method_layer': [],
            'algorithm_layer': [],
            'application_layer': []
        }
        
        for eid, e in all_entities.items():
            if eid != entity_id:
                layer = self.determine_layer(e)
                entities_by_layer[layer].append({
                    'id': eid,
                    'name': e['name'],
                    'type': e.get('type', '')
                })

        return f"""Identify relationships between this entity and others, focusing only on type and direction.

SOURCE ENTITY ({source_layer}):
ID: {entity_id}
Name: {entity['name']}
Type: {entity.get('type', '')}
Definition: {entity.get('definition', '')}

POTENTIAL TARGET ENTITIES:

Foundation Layer:
{json.dumps(entities_by_layer['foundation_layer'], indent=2)}

Method Layer:
{json.dumps(entities_by_layer['method_layer'], indent=2)}

Algorithm Layer:
{json.dumps(entities_by_layer['algorithm_layer'], indent=2)}

Application Layer:
{json.dumps(entities_by_layer['application_layer'], indent=2)}

Return ONLY JSON in this format:
{{
    "relationships": [
        {{
            "source": "{entity_id}",
            "source_layer": "{source_layer}",
            "target": "target_entity_id",
            "target_layer": "layer_name",
            "type": "relationship_type",
            "direction": "up|down|same|across"
        }}
    ]
}}

Direction:
- "up" for relationships to higher layers
- "down" for relationships to lower layers
- "same" for relationships within the same layer
- "across" for cross-layer relationships
"""

    def clean_json_response(self, response_text: str) -> dict:
        """Clean and parse JSON from API response."""
        try:
            import re
            json_match = re.search(r'(\{.*\})', response_text, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))
            return {"relationships": []}
        except Exception as e:
            print(f"Error cleaning JSON: {e}")
            return {"relationships": []}

    def extract_relationships(self, updated_kg_file: str, output_dir: str):
        """Extract relationships only for new entities."""
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        # Load updated knowledge graph
        with open(updated_kg_file, 'r', encoding='utf-8') as f:
            updated_data = json.load(f)
            updated_entities = updated_data['entities']

        # Identify new entities
        new_entities = {
            entity_id: entity 
            for entity_id, entity in updated_entities.items()
            if entity_id not in self.original_entities
        }
        print(f"Found {len(new_entities)} new entities to process")

        new_relationships = []
        layer_statistics = {
            'foundation_layer': {'total': 0, 'connected': 0},
            'method_layer': {'total': 0, 'connected': 0},
            'algorithm_layer': {'total': 0, 'connected': 0},
            'application_layer': {'total': 0, 'connected': 0}
        }

        # Process only new entities
        for i, (entity_id, entity) in enumerate(new_entities.items(), 1):
            print(f"Processing new entity {i}/{len(new_entities)}: {entity_id}")
            
            entity_layer = self.determine_layer(entity)
            layer_statistics[entity_layer]['total'] += 1
            
            try:
                completion = self.client.chat.completions.create(
                    model="nvidia/llama-3.1-nemotron-70b-instruct",
                    messages=[{
                        "role": "user", 
                        "content": self.create_relationship_prompt(entity_id, entity, updated_entities)
                    }],
                    temperature=0.3,
                    max_tokens=2048
                )
                
                if completion.choices:
                    extracted = self.clean_json_response(completion.choices[0].message.content)
                    
                    if 'relationships' in extracted and extracted['relationships']:
                        layer_statistics[entity_layer]['connected'] += 1
                        new_relationships.extend(extracted['relationships'])
            
            except Exception as e:
                print(f"Error processing entity {entity_id}: {e}")
                continue

        # Combine with existing relationships
        all_relationships = self.existing_relationships + new_relationships

        # Remove duplicates
        seen = set()
        unique_relationships = []
        for rel in all_relationships:
            rel_key = (rel['source'], rel['target'], rel['type'])
            if rel_key not in seen:
                seen.add(rel_key)
                unique_relationships.append(rel)

        output = {
            "relationships": unique_relationships,
            "metadata": {
                "total_relationships": len(unique_relationships),
                "new_relationships_added": len(new_relationships),
                "relationship_types": sorted(list(set(rel['type'] for rel in unique_relationships))),
                "layer_statistics": layer_statistics,
                "layer_connections": {
                    'up': sum(1 for rel in new_relationships if rel['direction'] == 'up'),
                    'down': sum(1 for rel in new_relationships if rel['direction'] == 'down'),
                    'same': sum(1 for rel in new_relationships if rel['direction'] == 'same'),
                    'across': sum(1 for rel in new_relationships if rel['direction'] == 'across')
                }
            }
        }

        output_file = Path(output_dir) / "relationships.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2)

        return output

def main():
    """Load existing data, extract relationships for new entities, print a summary."""
    # Input/output locations (placeholders to be edited before running)
    original_kg_file = r"path to entities.json"
    original_rel_file = r"path to relationships.json"
    updated_kg_file = r"path to \updated_knowledge_graph.json"
    output_dir = r"path to your output dir"

    extractor = LayeredRelationshipExtractor()
    extractor.load_existing_data(original_kg_file, original_rel_file)

    try:
        result = extractor.extract_relationships(updated_kg_file, output_dir)
    except Exception as e:
        print(f"Error during relationship extraction: {e}")
        return

    try:
        print(f"\nRelationship extraction complete!")
        print(f"Total relationships: {result['metadata']['total_relationships']}")
        print(f"New relationships added: {result['metadata']['new_relationships_added']}")
        print("\nNew Layer Connections:")
        for direction, count in result['metadata']['layer_connections'].items():
            print(f"- {direction}: {count} relationships")
        print(f"\nResults saved to: {output_dir}/relationships.json")
    except Exception as e:
        print(f"Error during relationship extraction: {e}")


if __name__ == "__main__":
    main()

Loaded 64 original entities
Loaded 252 existing relationships
Found 220 new entities to process
Processing new entity 1/220: deep_reinforcement_learning
Processing new entity 2/220: autonomous_braking_system
Processing new entity 3/220: reward_function_design
Processing new entity 4/220: sigmoid_weighted_linear_unit
Error cleaning JSON: Expecting property name enclosed in double quotes: line 23 column 46 (char 829)
Processing new entity 5/220: derivative_sigmoid_weighted_linear_unit
Processing new entity 6/220: td_lambda
Processing new entity 7/220: sarsa_lambda
Processing new entity 8/220: on_policy_learning_with_eligibility_traces
Processing new entity 9/220: dqn
Processing new entity 10/220: fine_grained_action_repetition
Processing new entity 11/220: asynchronous_advantage_actor_critic
Processing new entity 12/220: trust_region_policy_optimization
Processing new entity 13/220: deep_deterministic_policy_gradients
Processing new entity 14/220: temporal_abstractions
Processing new ent

In [5]:
class EntityConsolidator:
    def __init__(self, api_key: str = None):
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key or "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"
        )
        self.id_mapping = {}

    def clean_json_response(self, response_text: str) -> dict:
        """Clean and parse JSON from API response."""
        try:
            # Remove markdown code blocks
            text = re.sub(r'```json\s*', '', response_text)
            text = re.sub(r'```\s*', '', text)
            
            # Find the JSON object
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match:
                json_str = match.group(0)
                # Parse the JSON
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON: {e}")
                    print(f"Problematic JSON string: {json_str[:200]}...")
                    return {"duplicate_groups": []}
            else:
                print("No JSON object found in response")
                return {"duplicate_groups": []}
        except Exception as e:
            print(f"Error cleaning JSON response: {e}")
            return {"duplicate_groups": []}

    def create_duplicate_detection_prompt(self, entities: Dict) -> str:
        entities_list = []
        for entity_id, entity in entities.items():
            entities_list.append({
                "id": entity_id,
                "name": entity["name"],
                "type": entity["type"],
                "definition": entity.get("definition", "")
            })

        return f"""Analyze these entities and identify ONLY EXACT duplicate concepts.
Return the response as clean JSON without any markdown formatting.

STRICT GUIDELINES:
1. Only group entities that are literally the same thing (e.g., "DQN" and "Deep Q-Network")
2. DO NOT group variants or improvements 
3. DO NOT group general concepts with specific implementations
4. If unsure, DO NOT group them
5. Mark any uncertain groups with "REMOVED" in the reason

Entities:
{json.dumps(entities_list, indent=2)}

RETURN EXACTLY THIS FORMAT WITHOUT ANY MARKDOWN:
{{
    "duplicate_groups": [
        {{
            "main_id": "id_to_keep",
            "duplicate_ids": ["id1", "id2"],
            "reason": "Explanation of why these are EXACTLY the same"
        }}
    ]
}}"""

    def validate_merge_group(self, group: Dict) -> bool:
        """Validate if a merge group should be processed."""
        # Skip if marked as removed
        if "REMOVED" in group['reason'].upper():
            print(f"Skipping group {group['main_id']} - marked as REMOVED")
            return False
            
        # Skip if mentions specific concepts
        skip_keywords = ['variant', 'improvement', 'extension', 'specific', 
                        'implementation', 'enhancement', 'version', 'type', 
                        'framework', 'architecture', 'application']
        
        if any(keyword in group['reason'].lower() for keyword in skip_keywords):
            print(f"Skipping group {group['main_id']} - contains restrictive keywords")
            return False
            
        return True

    def merge_entities(self, entity_groups: List[Dict], entities: Dict) -> Dict:
        """Merge duplicate entities into consolidated entries."""
        merged_entities = entities.copy()
        self.id_mapping.clear()
        
        # Filter groups first
        valid_groups = [group for group in entity_groups if self.validate_merge_group(group)]
        print(f"\nProcessing {len(valid_groups)} valid merge groups (filtered from {len(entity_groups)} total groups)")
        
        for group in valid_groups:
            print(f"\nMerging group:")
            print(f"Main ID: {group['main_id']}")
            print(f"Duplicates: {', '.join(group['duplicate_ids'])}")
            print(f"Reason: {group['reason']}")
            
            main_id = group['main_id']
            
            # Skip if main_id doesn't exist
            if main_id not in merged_entities:
                print(f"Warning: Main ID {main_id} not found in entities")
                continue
            
            main_entity = merged_entities[main_id]
            merged_definitions = set([main_entity.get('definition', '')])
            merged_properties = main_entity.get('properties', [])
            merged_papers = set()
            
            for dup_id in group['duplicate_ids']:
                self.id_mapping[dup_id] = main_id
                
                if dup_id in merged_entities:
                    dup_entity = merged_entities[dup_id]
                    
                    # Merge definitions
                    if 'definition' in dup_entity and dup_entity['definition']:
                        merged_definitions.add(dup_entity['definition'])
                    
                    # Merge properties
                    for prop in dup_entity.get('properties', []):
                        if prop not in merged_properties:
                            merged_properties.append(prop)
                            if prop.get('name') == 'scientific_paper':
                                merged_papers.add(prop['value'])
                    
                    # Remove duplicate entity
                    del merged_entities[dup_id]

            # Update main entity
            main_entity['definition'] = ' || '.join(filter(None, merged_definitions))
            main_entity['properties'] = merged_properties

        return merged_entities

    def update_relationships(self, relationships_data: Dict) -> Dict:
        """Re-point relationships at surviving entity IDs and drop repeats.

        Each relationship's ``source`` and ``target`` is looked up in
        ``self.id_mapping``; an ID without a mapping is kept unchanged. After
        remapping, only the first relationship seen for every
        ``(source, target, type)`` triple is kept, in input order.

        Args:
            relationships_data: Mapping with an optional ``'relationships'``
                list of dicts (each must carry ``'source'``, ``'target'`` and
                ``'type'``) and an optional ``'metadata'`` entry.

        Returns:
            A dict holding the rewritten ``'relationships'`` list (shallow
            copies of the input records) and the input ``'metadata'``
            (``{}`` when absent).
        """
        kept = []
        fingerprints = set()

        for record in relationships_data.get('relationships', []):
            new_source = self.id_mapping.get(record['source'], record['source'])
            new_target = self.id_mapping.get(record['target'], record['target'])

            # Two records are the same edge once both endpoints are remapped.
            fingerprint = (new_source, new_target, record['type'])
            if fingerprint in fingerprints:
                continue
            fingerprints.add(fingerprint)

            # Copy so the caller's records are left untouched.
            rewritten = record.copy()
            rewritten.update(source=new_source, target=new_target)
            kept.append(rewritten)

        return dict(
            relationships=kept,
            metadata=relationships_data.get('metadata', {}),
        )

    def consolidate_knowledge_graph(self, entities_file: str, relationships_file: str, 
                                  output_entities_file: str, output_relationships_file: str):
        """Run the full duplicate-consolidation pass and write the results.

        Reads the entity and relationship JSON files, asks the chat model for
        duplicate groups, merges entities through ``self.merge_entities``,
        rewrites relationships through ``self.update_relationships`` and dumps
        both results as indented JSON.

        Any exception is caught here, reported on stdout together with its
        traceback, and not re-raised. When the API returns no choices the
        method prints a notice and returns without writing anything.

        Args:
            entities_file: JSON file whose top-level ``'entities'`` key holds
                the entity mapping.
            relationships_file: JSON file with a ``'relationships'`` list.
            output_entities_file: Destination for the merged entities plus
                merge metadata.
            output_relationships_file: Destination for the rewritten
                relationships.
        """
        try:
            # Read both inputs before contacting the API.
            with open(entities_file, 'r', encoding='utf-8') as entities_handle:
                original_entities = json.load(entities_handle)['entities']

            with open(relationships_file, 'r', encoding='utf-8') as relationships_handle:
                relationships_data = json.load(relationships_handle)

            print(f"Loaded {len(original_entities)} entities and {len(relationships_data['relationships'])} relationships")

            # Ask the model which entities are duplicates of each other.
            request_completion = self.client.chat.completions.create
            completion = request_completion(
                model="nvidia/llama-3.1-nemotron-70b-instruct",
                messages=[{
                    "role": "user",
                    "content": self.create_duplicate_detection_prompt(original_entities)
                }],
                temperature=0.3,
                max_tokens=2048
            )

            if not completion.choices:
                print("No response from API")
                return

            raw_reply = completion.choices[0].message.content
            duplicate_groups = self.clean_json_response(raw_reply)['duplicate_groups']

            # merge_entities must run first: update_relationships relies on
            # the ID mapping stored on the instance.
            merged_entities = self.merge_entities(duplicate_groups, original_entities)
            updated_relationships = self.update_relationships(relationships_data)

            original_count = len(original_entities)
            final_count = len(merged_entities)
            merged_away = original_count - final_count

            output_data = {
                'entities': merged_entities,
                'metadata': {
                    'original_count': original_count,
                    'final_count': final_count,
                    'duplicates_merged': merged_away,
                    'merge_groups': duplicate_groups
                }
            }

            # Entities are written first, then relationships.
            for destination, payload in (
                (output_entities_file, output_data),
                (output_relationships_file, updated_relationships),
            ):
                with open(destination, 'w', encoding='utf-8') as out_handle:
                    json.dump(payload, out_handle, indent=2)

            print("\nConsolidation complete:")
            print(f"Original entities: {original_count}")
            print(f"Final entities: {final_count}")
            print(f"Duplicates merged: {merged_away}")
            print(f"Original relationships: {len(relationships_data['relationships'])}")
            print(f"Final relationships: {len(updated_relationships['relationships'])}")

        except Exception as e:
            print(f"Error during consolidation: {e}")
            import traceback
            traceback.print_exc()

def main():
    """Consolidate the knowledge graph stored under ``enhancement/``.

    Reads ``updated_knowledge_graph.json`` and ``relationships.json`` and
    writes ``consolidated_knowledge_graph.json`` and
    ``consolidated_relationships.json`` into the same directory.
    """
    from pathlib import Path

    # Build the paths with pathlib instead of hard-coded backslashes: the
    # resulting strings are identical on Windows and also resolve on
    # POSIX systems, where "\" is not a path separator.
    data_dir = Path("enhancement")
    entities_file = str(data_dir / "updated_knowledge_graph.json")
    relationships_file = str(data_dir / "relationships.json")
    output_entities_file = str(data_dir / "consolidated_knowledge_graph.json")
    output_relationships_file = str(data_dir / "consolidated_relationships.json")

    consolidator = EntityConsolidator()
    consolidator.consolidate_knowledge_graph(
        entities_file, relationships_file,
        output_entities_file, output_relationships_file
    )

if __name__ == "__main__":
    main()

Loaded 284 entities and 1655 relationships
Skipping group proximal_policy_optimization - marked as REMOVED
Skipping group policy_gradient_methods - marked as REMOVED

Processing 3 valid merge groups (filtered from 5 total groups)

Merging group:
Main ID: deep_q_network
Duplicates: dqn, deep_q_networks
Reason: All three entities refer to the same Deep Q-Network (DQN) concept, with 'Deep Q-Networks' being a plural form and 'DQN' being an abbreviation.

Merging group:
Main ID: deep_reinforcement_learning
Duplicates: deep_reinforcement_learning_drl
Reason: Both entities refer to the same concept of Deep Reinforcement Learning, with 'DRL' being an abbreviation.

Merging group:
Main ID: markov_decision_process
Duplicates: markov_decision_process_mdp
Reason: Both entities refer to the same Markov Decision Process concept, with 'MDP' being an abbreviation.

Consolidation complete:
Original entities: 284
Final entities: 280
Duplicates merged: 4
Original relationships: 1655
Final relationships: 

In [1]:
from neo4j import GraphDatabase
import json

class ConsolidatedGraphBuilder:
    """Load a consolidated entity/relationship JSON export into Neo4j.

    Entities become ``Domain`` or ``Concept:<Type>`` nodes, every
    ``scientific_paper`` property becomes a ``Paper`` node linked with
    ``REFERENCED_IN``, and each relationship record becomes a typed edge.
    """

    # Characters that count as word separators in a raw relationship type.
    _REL_TYPE_SEPARATORS = str.maketrans(" -.:", "____")

    # SECURITY(review): the default password below is a credential committed
    # to source. The default is kept so existing no-argument callers keep
    # working, but it should be rotated and passed in by the caller (for
    # example from an environment variable) rather than stored here.
    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="buJX-vxk@v8tkF2"):
        """Open a driver for the Neo4j instance at ``uri``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    @classmethod
    def _clean_relationship_type(cls, raw_type):
        """Turn a free-text relationship type into an UPPER_SNAKE identifier.

        Spaces, ``-``, ``.`` and ``:`` become underscores, every other
        character that is neither alphanumeric nor ``_`` is dropped, and
        repeated/leading/trailing underscores are collapsed.
        """
        upper = raw_type.upper().translate(cls._REL_TYPE_SEPARATORS)
        kept = ''.join(ch for ch in upper if ch.isalnum() or ch == '_')
        return '_'.join(filter(None, kept.split('_')))

    def create_node(self, tx, entity_id, entity_data):
        """Create or overwrite the node for one entity and link its papers.

        Args:
            tx: Transaction-like object exposing ``run(query, **params)``.
            entity_id: Value stored as the node's ``id`` and used as MERGE key.
            entity_data: Dict with ``'name'``, ``'type'`` and ``'definition'``
                plus optional ``'domains'`` and ``'properties'`` (a list of
                dicts with ``'name'`` and ``'value'``).

        Raises:
            KeyError: If a required key is missing from ``entity_data`` or
                from one of its property dicts.
        """
        # A missing or null 'properties' entry is treated as "no properties".
        entity_props = entity_data.get('properties') or []

        # Neo4j properties cannot hold nested maps, so each property is
        # flattened to a "name: value" string.
        properties_list = [f"{prop['name']}: {prop['value']}" for prop in entity_props]
        scientific_papers = [
            prop['value'] for prop in entity_props
            if prop.get('name') == 'scientific_paper'
        ]

        node_props = {
            'id': entity_id,
            'name': entity_data['name'],
            'type': entity_data['type'],
            'definition': entity_data['definition'],
            'domains': entity_data.get('domains', []),
            'properties': properties_list,
            'scientific_papers': scientific_papers
        }

        # Labels cannot be passed as query parameters, so the label is built
        # from the entity type with everything non-alphanumeric removed
        # (e.g. "value_based" -> "ValueBased").
        type_label = ''.join(c for c in entity_data['type'].title() if c.isalnum())

        # Domains get their own label instead of Concept:<Type>.
        if entity_data['type'].lower() == 'domain':
            query = """
            MERGE (n:Domain {id: $id})
            SET n = $node_props
            """
        else:
            query = f"""
            MERGE (n:Concept:{type_label} {{id: $id}})
            SET n = $node_props
            """

        tx.run(query, id=entity_id, node_props=node_props)

        # One Paper node per referenced paper, linked from the entity node.
        paper_query = """
                MERGE (p:Paper {id: $paper_id})
                SET p.name = $paper_id
                WITH p
                MATCH (n)
                WHERE n.id = $entity_id
                MERGE (n)-[r:REFERENCED_IN]->(p)
                """
        for paper in scientific_papers:
            tx.run(paper_query, paper_id=paper, entity_id=entity_id)

    def create_relationships(self, tx, relationships_data):
        """Create one typed edge per record in ``relationships_data``.

        Args:
            tx: Transaction-like object exposing ``run(query, **params)``.
            relationships_data: Dict with an optional ``'relationships'`` list;
                each record needs ``'type'``, ``'source'``, ``'target'``,
                ``'source_layer'`` and ``'target_layer'`` and may carry
                ``'direction'`` (defaults to ``'none'``).

        Raises:
            Exception: Whatever the failing record raised, re-raised after the
                offending relationship type has been printed.
        """
        relationships = relationships_data.get('relationships', [])

        for rel in relationships:
            # Reset per record so the error report never shows a value left
            # over from a previous iteration (or an unbound name).
            rel_type = None
            try:
                rel_type = self._clean_relationship_type(rel['type'])

                # Relationship types cannot be parameterised in Cypher, hence
                # the sanitised value is interpolated into the query text.
                query = f"""
                MATCH (source)
                WHERE source.id = $source
                MATCH (target)
                WHERE target.id = $target
                MERGE (source)-[r:{rel_type}]->(target)
                SET r.source_layer = $source_layer
                SET r.target_layer = $target_layer
                SET r.direction = $direction
                SET r.original_type = $original_type
                """

                tx.run(query,
                    source=rel['source'],
                    target=rel['target'],
                    source_layer=rel['source_layer'],
                    target_layer=rel['target_layer'],
                    direction=rel.get('direction', 'none'),
                    original_type=rel['type']
                )
            except Exception as e:
                # Report with lookups that cannot themselves fail, so the
                # original exception is the one that propagates.
                raw_type = rel.get('type') if isinstance(rel, dict) else None
                print(f"Error processing relationship: {e}")
                print(f"Problematic relationship type: {raw_type}")
                print(f"Cleaned relationship type: {rel_type}")
                raise

    def build_graph(self, entities_file, relationships_file):
        """Rebuild the whole graph from the two JSON files.

        Deletes every existing node first, then creates entity nodes,
        relationships, indices and degree metadata, in that order.

        Args:
            entities_file: JSON file whose ``'entities'`` key maps entity IDs
                to entity dicts.
            relationships_file: JSON file with a ``'relationships'`` list.
        """
        # Load data
        with open(entities_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            entities = data['entities']

        with open(relationships_file, 'r', encoding='utf-8') as f:
            relationships_data = json.load(f)

        with self.driver.session() as session:
            # Clear existing data
            print("Clearing existing data...")
            session.run("MATCH (n) DETACH DELETE n")

            # Create nodes
            print("Creating nodes...")
            total_entities = len(entities)
            for i, (entity_id, entity_data) in enumerate(entities.items(), 1):
                print(f"Processing entity {i}/{total_entities}: {entity_id}")
                session.execute_write(self.create_node, entity_id, entity_data)

            # Create relationships. Each transaction receives only the current
            # record; handing over the full payload on every iteration would
            # re-run every relationship once per relationship.
            print("Creating relationships...")
            relationships = relationships_data['relationships']
            total_rels = len(relationships)
            for i, rel in enumerate(relationships, 1):
                print(f"Processing relationship {i}/{total_rels}")
                session.execute_write(self.create_relationships, {'relationships': [rel]})

            # Add indices
            print("Creating indices...")
            session.execute_write(self.create_indices)

            # Add metadata
            print("Adding metadata...")
            session.execute_write(self.add_metadata)

    def add_metadata(self, tx):
        """Store total, incoming and outgoing degree on Concept/Domain nodes.

        NOTE(review): the ``COUNT { ... }`` subquery form presumably needs a
        Neo4j 5.x server - confirm against the deployed version.
        """
        queries = [
            """
            MATCH (n)
            WHERE n:Concept OR n:Domain
            SET n.degree = COUNT {(n)--()}
            """,
            """
            MATCH (n)
            WHERE n:Concept OR n:Domain
            SET n.in_degree = COUNT {(n)<--()}
            """,
            """
            MATCH (n)
            WHERE n:Concept OR n:Domain
            SET n.out_degree = COUNT {(n)-->()}
            """
        ]
        for query in queries:
            tx.run(query)

    def create_indices(self, tx):
        """Create the lookup indices for Concept, Domain and Paper nodes.

        Every statement uses ``IF NOT EXISTS``, so indices that are already
        present are left alone.
        """
        queries = [
            "CREATE INDEX concept_type_idx IF NOT EXISTS FOR (n:Concept) ON (n.type)",
            "CREATE INDEX concept_name_idx IF NOT EXISTS FOR (n:Concept) ON (n.name)",
            "CREATE INDEX concept_id_idx IF NOT EXISTS FOR (n:Concept) ON (n.id)",
            "CREATE INDEX domain_id_idx IF NOT EXISTS FOR (n:Domain) ON (n.id)",
            "CREATE INDEX domain_name_idx IF NOT EXISTS FOR (n:Domain) ON (n.name)",
            "CREATE INDEX paper_id_idx IF NOT EXISTS FOR (n:Paper) ON (n.id)"
        ]
        for query in queries:
            tx.run(query)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

def main():
    """Build the Neo4j graph from the consolidated files under ``enhancement/``.

    Errors raised while building are printed with their traceback instead of
    propagating; the driver is closed in every case.
    """
    from pathlib import Path

    # File paths. Built with pathlib instead of hard-coded backslashes: the
    # resulting strings are identical on Windows and also resolve on POSIX
    # systems, where "\" is not a path separator.
    data_dir = Path("enhancement")
    ENTITIES_FILE = str(data_dir / "consolidated_knowledge_graph.json")
    RELATIONSHIPS_FILE = str(data_dir / "consolidated_relationships.json")

    # Initialize and build graph
    graph = ConsolidatedGraphBuilder()
    try:
        print("Building consolidated knowledge graph...")
        graph.build_graph(ENTITIES_FILE, RELATIONSHIPS_FILE)
        print("Knowledge graph built successfully!")
    except Exception as e:
        print(f"Error building graph: {e}")
        import traceback
        traceback.print_exc()
    finally:
        graph.close()

if __name__ == "__main__":
    main()

Building consolidated knowledge graph...
Clearing existing data...
Creating nodes...
Processing entity 1/280: reinforcement_learning
Processing entity 2/280: markov_decision_process
Processing entity 3/280: value_function
Processing entity 4/280: policy
Processing entity 5/280: temporal_difference_learning
Processing entity 6/280: k_armed_bandit_problem
Processing entity 7/280: exploration_exploitation_tradeoff
Processing entity 8/280: epsilon_greedy_method
Processing entity 9/280: upper_confidence_bound_action_selection
Processing entity 10/280: gradient_bandit_algorithm
Processing entity 11/280: associative_search
Processing entity 12/280: reward_hypothesis
Processing entity 13/280: bellman_equation
Processing entity 14/280: bellman_optimality_equation
Processing entity 15/280: monte_carlo_methods
Processing entity 16/280: generalized_policy_iteration
Processing entity 17/280: importance_sampling
Processing entity 18/280: td_prediction
Processing entity 19/280: td_0
Processing entity