In [None]:
import json
from openai import OpenAI
from typing import Dict, List, Set
from pathlib import Path
from collections import defaultdict

In [None]:
class RLEntityExtractor:
    """Extract RL entities from chapter JSON files with an LLM and merge them.

    Each chapter file is a JSON object mapping a section id to section text.
    Every section is sent to the chat-completions endpoint, the JSON found in
    the reply is parsed, and the per-chapter results are merged into a single
    knowledge graph that is written to ``rl_knowledge_graph.json``.

    LLM output is treated as untrusted: malformed entities are skipped rather
    than aborting the section, chapter, or final merge.
    """

    def __init__(self, api_key: str = None):
        """Create the API client and the empty cross-reference indexes.

        Args:
            api_key: Key for the endpoint; a placeholder string is used when
                it is omitted or empty.
        """
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key or "paste your api key here"
        )
        # Track entities across chapters for cross-referencing
        self.entity_appearances = defaultdict(set)   # entity id -> chapters
        self.domain_connections = defaultdict(set)   # domain -> entity ids

    @staticmethod
    def _entity_id(entity):
        """Return the entity's id if it is a non-empty string, else None."""
        if not isinstance(entity, dict):
            return None
        entity_id = entity.get('id')
        if isinstance(entity_id, str) and entity_id:
            return entity_id
        return None

    @staticmethod
    def _as_list(value) -> list:
        """Return ``value`` if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    def create_extract_prompt(self, section_text: str, chapter: str, section: str) -> str:
        """Build the extraction prompt for one section.

        Args:
            section_text: Raw text of the section, appended after the schema.
            chapter: Chapter identifier embedded in the ``source`` template.
            section: Section identifier embedded in the ``source`` template.

        Returns:
            The full prompt string (JSON schema, instructions, section text).
        """
        return f"""Extract key RL entities and their relationships from this text section. Focus on core concepts, domains, and clear relationships. Format as JSON:

{{
    "entities": [
        {{
            "id": "unique_snake_case_id",
            "name": "Full Concept Name",
            "type": "concept|algorithm|method|principle|domain",
            "definition": "Clear, precise definition under 50 words",
            "domains": ["psychology", "neuroscience", "mathematics", etc],
            "properties": [
                {{
                    "name": "property_name",
                    "value": "property_value",
                    "type": "characteristic|parameter|constraint|requirement"
                }}
            ],
            "source": {{
                "chapter": "{chapter}",
                "section": "{section}",
                "context": "Brief context where this appears"
            }}
        }}
    ],
    "relationships": [
        {{
            "source_id": "entity_id",
            "target_id": "entity_id",
            "type": "is_part_of|implements|uses|relates_to|influences",
            "description": "Brief description of relationship"
        }}
    ],
    "domains_discussed": ["list", "of", "domains"],
    "key_equations": [
        {{
            "id": "equation_id",
            "name": "Equation Name",
            "latex": "LaTeX representation",
            "description": "What this equation represents"
        }}
    ]
}}

Extract ONLY well-defined, clearly stated concepts. Skip ambiguous or unclear references.
Focus on reinforcement learning concepts and their connections to other domains.

Text to analyze:
{section_text}"""

    def clean_json_response(self, response_text: str) -> dict:
        """Parse the first JSON object found in an API response.

        Decoding starts at the first ``{`` and stops at the end of that
        object, so surrounding prose or code fences (even ones that contain
        braces) do not break parsing.

        Args:
            response_text: Raw message content returned by the model.

        Returns:
            The decoded object, or ``{}`` when the input is not text, holds
            no ``{``, or the object is not valid JSON.
        """
        if not isinstance(response_text, str):
            print("Error cleaning JSON: response is not text")
            return {}

        start = response_text.find('{')
        if start == -1:
            return {}

        try:
            # raw_decode ignores anything after the object it parsed.
            parsed, _ = json.JSONDecoder().raw_decode(response_text, start)
        except json.JSONDecodeError as e:
            print(f"Error cleaning JSON: {e}")
            return {}
        return parsed

    def update_cross_references(self, entities: List[Dict], chapter: str) -> None:
        """Record which chapter each entity appears in and its domains.

        Entities that are not dicts or lack a string ``id`` are skipped so a
        single malformed item does not discard the rest of the section.

        Args:
            entities: Entity dicts as returned by the model.
            chapter: Identifier of the chapter being processed.
        """
        for entity in entities:
            entity_id = self._entity_id(entity)
            if entity_id is None:
                continue
            self.entity_appearances[entity_id].add(chapter)

            # Track domain connections
            domains = entity.get('domains')
            if isinstance(domains, str):
                # A bare string would otherwise be iterated per character.
                domains = [domains]
            for domain in self._as_list(domains):
                if isinstance(domain, str):
                    self.domain_connections[domain].add(entity_id)

    def process_section(self, section_text: str, chapter: str, section: str) -> Dict:
        """Send one section to the model and return the parsed extraction.

        Args:
            section_text: Text of the section.
            chapter: Chapter identifier (used in the prompt and indexes).
            section: Section identifier (used in the prompt).

        Returns:
            The parsed response dict, or ``{}`` if the request failed, the
            reply had no choices, or no JSON could be parsed.
        """
        try:
            completion = self.client.chat.completions.create(
                model="nvidia/llama-3.1-nemotron-70b-instruct",
                messages=[{
                    "role": "user",
                    "content": self.create_extract_prompt(section_text, chapter, section)
                }],
                temperature=0.3,
                max_tokens=2048
            )

            if completion.choices:
                response_text = completion.choices[0].message.content
                extracted = self.clean_json_response(response_text)

                entities = extracted.get('entities')
                if isinstance(entities, list):
                    self.update_cross_references(entities, chapter)

                return extracted

            return {}

        except Exception as e:
            # Best effort: one failed request must not stop the whole run.
            print(f"Error processing section: {e}")
            return {}

    def process_chapter_file(self, file_path: Path) -> Dict:
        """Process every section of one chapter file.

        Args:
            file_path: Path to a JSON file mapping section id -> section text.

        Returns:
            A dict with ``chapter_id``, ``entities``, ``relationships``,
            ``domains`` (sorted list) and ``equations``; ``{}`` if the file
            could not be read or processed.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                sections = json.load(f)

            chapter_data = {
                'chapter_id': file_path.stem,
                'entities': [],
                'relationships': [],
                'domains': set(),
                'equations': []
            }

            for section_id, content in sections.items():
                print(f"Processing {file_path.stem} - {section_id}")
                section_data = self.process_section(
                    content,
                    chapter=file_path.stem,
                    section=section_id
                )

                if section_data:
                    # The model may return null or a scalar for any field.
                    chapter_data['entities'].extend(
                        self._as_list(section_data.get('entities')))
                    chapter_data['relationships'].extend(
                        self._as_list(section_data.get('relationships')))
                    chapter_data['domains'].update(
                        d for d in self._as_list(section_data.get('domains_discussed'))
                        if isinstance(d, str))
                    chapter_data['equations'].extend(
                        self._as_list(section_data.get('key_equations')))

            # Convert sets to lists for JSON serialization (sorted so the
            # output is stable between runs).
            chapter_data['domains'] = sorted(chapter_data['domains'])

            return chapter_data

        except Exception as e:
            print(f"Error processing chapter file {file_path}: {e}")
            return {}

    def generate_knowledge_graph(self, input_dir: str, output_dir: str):
        """Generate the complete knowledge graph from all chapter files.

        Args:
            input_dir: Directory searched for ``chapter_*.json`` files.
            output_dir: Directory (created if missing) that receives
                ``rl_knowledge_graph.json``.

        Returns:
            The knowledge-graph dict that was written to disk.
        """
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Process all chapter files
        all_chapters_data = []
        for file in sorted(input_path.glob("chapter_*.json")):
            chapter_data = self.process_chapter_file(file)
            if chapter_data:
                all_chapters_data.append(chapter_data)

        # Create final knowledge graph with cross-references.
        # The indexes hold sets, which json cannot serialize, so they are
        # converted to sorted lists here.
        knowledge_graph = {
            'entities': {},
            'relationships': [],
            'domain_hierarchy': {},
            'cross_references': {
                entity_id: sorted(chapters, key=str)
                for entity_id, chapters in self.entity_appearances.items()
            },
            'domain_connections': {
                domain: sorted(entity_ids, key=str)
                for domain, entity_ids in self.domain_connections.items()
            },
            'metadata': {
                'total_chapters': len(all_chapters_data),
                # Counts every extracted mention, so an entity found in
                # several sections is counted more than once.
                'total_entities': sum(len(c['entities']) for c in all_chapters_data),
                'total_relationships': sum(len(c['relationships']) for c in all_chapters_data)
            }
        }

        # Merge entities and relationships from all chapters
        merged = knowledge_graph['entities']
        for chapter in all_chapters_data:
            for entity in chapter['entities']:
                entity_id = self._entity_id(entity)
                if entity_id is None:
                    continue  # malformed entity: nothing to key it by

                existing = merged.get(entity_id)
                if existing is None:
                    merged[entity_id] = entity
                    continue

                # Repeat sighting: keep the first definition and collect
                # every source into a list.
                new_source = entity.get('source')
                if new_source is None:
                    continue
                current = existing.get('source')
                if current is None:
                    sources = []
                elif isinstance(current, list):
                    sources = current
                else:
                    sources = [current]
                if isinstance(new_source, list):
                    sources.extend(new_source)
                else:
                    sources.append(new_source)
                existing['source'] = sources

            knowledge_graph['relationships'].extend(chapter['relationships'])

        # Save the complete knowledge graph
        output_file = output_path / "rl_knowledge_graph.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(knowledge_graph, f, indent=2)

        return knowledge_graph

def main():
    """Build the knowledge graph from the chapter files and print a summary."""
    kg_builder = RLEntityExtractor()
    # Placeholders: point these at the chapter JSON folder and the output folder.
    chapters_dir = r"path to folder containing json files for chapters of the book we processed in the previous step "
    target_dir = r"path to output directory"

    try:
        graph = kg_builder.generate_knowledge_graph(chapters_dir, target_dir)
        print("\nKnowledge Graph Generation Complete:")
        summary = graph['metadata']
        print(f"Total Entities: {summary['total_entities']}")
        print(f"Total Relationships: {summary['total_relationships']}")
        print(f"Output saved to: {target_dir}/rl_knowledge_graph.json")
    except Exception as err:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"Error during knowledge graph generation: {err}")


if __name__ == "__main__":
    main()

In [5]:
import json
from typing import Dict, List
from openai import OpenAI

class LayeredRelationshipExtractor:
    """Ask an LLM for relationships between knowledge-graph entities.

    Entities are grouped into four layers (foundation, method, algorithm,
    application) by their ``type``. For each entity a prompt listing all
    other entities by layer is sent to the model; the returned relationships
    are validated, de-duplicated and saved together with summary statistics.

    LLM output is treated as untrusted: malformed relationships are dropped
    instead of aborting the run after all requests have been made.
    """

    # Entity types that map to each layer; anything else falls back to the
    # foundation layer.
    _LAYER_TYPES = (
        # Mathematical and theoretical concepts
        ('foundation_layer', ('theorem', 'equation', 'principle', 'proof', 'definition',
                              'framework', 'concept', 'property', 'space', 'function')),
        # Methods and approaches
        ('method_layer', ('value_based', 'policy_based', 'model_based', 'hybrid', 'method')),
        # Algorithms and implementations
        ('algorithm_layer', ('algorithm', 'base_algorithm', 'variant', 'improvement', 'extension')),
        # Applications and domains
        ('application_layer', ('field', 'benchmark', 'use_case', 'environment', 'task', 'domain')),
    )

    # Direction values the prompt asks the model to use.
    _DIRECTIONS = ('up', 'down', 'same', 'across')

    def __init__(self, api_key: str = None):
        """Create the API client.

        Args:
            api_key: Key for the endpoint; a placeholder string is used when
                it is omitted or empty.
        """
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key or "paste your api key here"
        )

    def determine_layer(self, entity_data: Dict) -> str:
        """Determine which layer an entity belongs to based on its ``type``.

        Args:
            entity_data: Entity dict; only its ``type`` value is inspected.

        Returns:
            One of ``foundation_layer``, ``method_layer``, ``algorithm_layer``
            or ``application_layer``. A missing, non-string or unrecognised
            type yields ``foundation_layer``.
        """
        entity_type = entity_data.get('type') if isinstance(entity_data, dict) else None
        if isinstance(entity_type, str):
            normalized = entity_type.strip().lower()
            for layer, type_names in self._LAYER_TYPES:
                if normalized in type_names:
                    return layer

        # Default to foundation layer if unclear
        return 'foundation_layer'

    def create_relationship_prompt(self, entity_id: str, entity: Dict, all_entities: Dict) -> str:
        """Build the relationship-extraction prompt for one source entity.

        Args:
            entity_id: Id of the source entity.
            entity: The source entity's data.
            all_entities: Mapping of entity id -> entity data; every entry
                except the source is listed as a potential target.

        Returns:
            The prompt string. An entity without a ``name`` is shown under
            its id so one incomplete entity does not break every prompt.
        """
        # Determine the layer of the source entity
        source_layer = self.determine_layer(entity)

        # Create layer-specific entity groupings
        entities_by_layer = {
            'foundation_layer': [],
            'method_layer': [],
            'algorithm_layer': [],
            'application_layer': []
        }

        for eid, e in all_entities.items():
            if eid == entity_id or not isinstance(e, dict):
                continue
            layer = self.determine_layer(e)
            entities_by_layer[layer].append({
                'id': eid,
                'name': e.get('name', eid),
                'type': e.get('type', '')
            })

        return f"""Analyze this entity and identify its relationships with other entities, considering their respective layers in the RL knowledge hierarchy.

SOURCE ENTITY (from {source_layer}):
ID: {entity_id}
Name: {entity.get('name', entity_id)}
Type: {entity.get('type', '')}
Definition: {entity.get('definition', '')}
Properties: {json.dumps(entity.get('properties', []), indent=2)}
Source: {json.dumps(entity.get('source', {}), indent=2)}

POTENTIAL TARGET ENTITIES BY LAYER:

Foundation Layer (Mathematical & Theoretical Concepts):
{json.dumps(entities_by_layer['foundation_layer'], indent=2)}

Method Layer (Approaches):
{json.dumps(entities_by_layer['method_layer'], indent=2)}

Algorithm Layer (Implementations):
{json.dumps(entities_by_layer['algorithm_layer'], indent=2)}

Application Layer (Domains & Use Cases):
{json.dumps(entities_by_layer['application_layer'], indent=2)}

Return ONLY valid JSON that follows this format exactly:
{{
    "relationships": [
        {{
            "source": "{entity_id}",
            "source_layer": "{source_layer}",
            "target": "target_entity_id",
            "target_layer": "layer_name",
            "type": "descriptive_relationship_type",
            "direction": "up|down|same|across",
            "evidence": {{
                "text": "exact text snippet that shows this relationship",
                "location": "definition|property|source"
            }}
        }}
    ]
}}

IMPORTANT:
1. Only extract relationships that are explicitly stated in the text
2. Use specific, descriptive relationship types based on the actual content
3. Always include the supporting evidence
4. Note the direction:
   - "up" for relationships to higher layers
   - "down" for relationships to lower layers
   - "same" for relationships within the same layer
   - "across" for relationships that cross layers non-hierarchically
5. Consider layer-appropriate relationships:
   - Foundation → Method: "enables", "provides basis for"
   - Method → Algorithm: "is implemented by", "guides"
   - Algorithm → Application: "is applied to", "solves"
   - Same layer: "relates to", "extends", "similar to"
   - Cross-layer: "inspired by", "analogous to"
"""

    def clean_json_response(self, response_text: str) -> dict:
        """Parse the JSON object contained in an API response.

        A ```json fenced block is preferred when present; otherwise the whole
        reply is searched. Decoding starts at the first ``{`` and stops at
        the end of that object, so trailing prose is ignored.

        Args:
            response_text: Raw message content returned by the model.

        Returns:
            The decoded dict, or ``{"relationships": []}`` when the input is
            not text, holds no JSON object, or the JSON is invalid.
        """
        import re

        empty = {"relationships": []}
        if not isinstance(response_text, str):
            print("Error cleaning JSON: response is not text")
            return empty

        # Try to find JSON between ```json and ``` markers
        json_block = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
        candidate = json_block.group(1) if json_block else response_text

        start = candidate.find('{')
        if start == -1:
            print("Error cleaning JSON: no JSON object found in response")
            print(f"Problematic text: {candidate[:200]}...")
            return empty

        try:
            # raw_decode ignores anything after the object it parsed.
            parsed, _ = json.JSONDecoder().raw_decode(candidate, start)
        except json.JSONDecodeError as e:
            print(f"Error cleaning JSON: {e}")
            print(f"Problematic text: {candidate[:200]}...")
            return empty
        return parsed

    @staticmethod
    def _is_valid_relationship(rel) -> bool:
        """Return True if ``rel`` is a dict with string source, target and type."""
        if not isinstance(rel, dict):
            return False
        return all(
            isinstance(rel.get(key), str) and rel.get(key)
            for key in ('source', 'target', 'type')
        )

    def extract_relationships(self, input_file: str, output_file: str = 'relationships.json'):
        """Extract relationships for all entities while preserving layer information.

        Args:
            input_file: Path to a knowledge-graph JSON file whose ``entities``
                key maps entity id -> entity data.
            output_file: Where to write the result; defaults to
                ``relationships.json`` in the current directory.

        Returns:
            A dict with ``relationships`` (validated, de-duplicated, in
            first-seen order) and ``metadata`` (counts, relationship types,
            per-layer statistics, per-direction counts).
        """
        # Read input file
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            entities = data['entities']

        all_relationships = []
        layer_statistics = {
            'foundation_layer': {'total': 0, 'connected': 0},
            'method_layer': {'total': 0, 'connected': 0},
            'algorithm_layer': {'total': 0, 'connected': 0},
            'application_layer': {'total': 0, 'connected': 0}
        }

        # Process each entity
        total_entities = len(entities)
        for i, (entity_id, entity) in enumerate(entities.items(), 1):
            print(f"Processing entity {i}/{total_entities}: {entity_id}")

            # Track layer statistics
            entity_layer = self.determine_layer(entity)
            layer_statistics[entity_layer]['total'] += 1

            try:
                # Generate prompt for this entity
                prompt = self.create_relationship_prompt(entity_id, entity, entities)

                # Get relationships from LLM
                completion = self.client.chat.completions.create(
                    model="nvidia/llama-3.1-nemotron-70b-instruct",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=2048
                )

                if completion.choices:
                    response_text = completion.choices[0].message.content
                    extracted = self.clean_json_response(response_text)

                    found = extracted.get('relationships')
                    # Keep only well-formed items so later steps cannot fail
                    # on a missing key after all requests have been made.
                    valid = [
                        rel for rel in (found if isinstance(found, list) else [])
                        if self._is_valid_relationship(rel)
                    ]
                    if valid:
                        layer_statistics[entity_layer]['connected'] += 1
                        all_relationships.extend(valid)

            except Exception as e:
                # Best effort: one failed entity must not stop the run.
                print(f"Error processing entity {entity_id}: {e}")
                continue

        # Remove duplicates while preserving order
        seen = set()
        unique_relationships = []
        for rel in all_relationships:
            rel_key = (rel['source'], rel['target'], rel['type'])
            if rel_key not in seen:
                seen.add(rel_key)
                unique_relationships.append(rel)

        # Analyze layer connections; directions outside the expected four
        # (or missing ones) are not counted.
        layer_connections = {direction: 0 for direction in self._DIRECTIONS}
        for rel in unique_relationships:
            direction = rel.get('direction')
            if isinstance(direction, str) and direction in layer_connections:
                layer_connections[direction] += 1

        # Save to file
        output = {
            "relationships": unique_relationships,
            "metadata": {
                "total_relationships": len(unique_relationships),
                "relationship_types": sorted({rel['type'] for rel in unique_relationships}),
                "total_entities_involved": len({
                    endpoint
                    for rel in unique_relationships
                    for endpoint in (rel['source'], rel['target'])
                }),
                "layer_statistics": layer_statistics,
                "layer_connections": layer_connections
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2)

        return output

def main():
    """Run relationship extraction on the saved knowledge graph and print a summary."""
    rel_extractor = LayeredRelationshipExtractor()
    graph_path = r"rl_kg\output\rl_knowledge_graph.json"

    try:
        outcome = rel_extractor.extract_relationships(graph_path)
        print("\nRelationship extraction complete!")
        summary = outcome['metadata']
        print(f"Total relationships found: {summary['total_relationships']}")

        print("\nLayer Statistics:")
        for layer_name, counts in summary['layer_statistics'].items():
            print(f"- {layer_name}: {counts['connected']}/{counts['total']} entities connected")

        print("\nLayer Connections:")
        for direction_name, total in summary['layer_connections'].items():
            print(f"- {direction_name}: {total} relationships")

        print("\nResults saved to relationships.json")
    except Exception as err:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"Error during relationship extraction: {err}")


if __name__ == "__main__":
    main()

Processing entity 1/64: reinforcement_learning
Processing entity 2/64: markov_decision_process
Error cleaning JSON: Expecting ',' delimiter: line 14 column 13 (char 520)
Problematic text: {
    "relationships": [
        {
            "source": "markov_decision_process",
            "source_layer": "foundation_layer",
            "target": "value_function",
            "target_layer": ...
Processing entity 3/64: value_function
Processing entity 4/64: policy
Processing entity 5/64: temporal_difference_learning
Processing entity 6/64: k_armed_bandit_problem
Processing entity 7/64: exploration_exploitation_tradeoff
Processing entity 8/64: epsilon_greedy_method
Error cleaning JSON: Expecting ',' delimiter: line 23 column 117 (char 912)
Problematic text: {
    "relationships": [
        {
            "source": "epsilon_greedy_method",
            "source_layer": "algorithm_layer",
            "target": "exploration_exploitation_tradeoff",
            ...
Processing entity 9/64: upper_confid