# KG Pipeline Test - Movies Dataset

Test complet du pipeline Knowledge Graph:
1. Parsing CSV
2. Extraction d'entités (Claude)
3. Extraction de relations (Claude)
4. Stockage Neo4j
5. Visualisation statistiques

In [None]:
import sys
sys.path.insert(0, '/app')

from pathlib import Path
import json
from loguru import logger

from src.kg.parsers.csv_parser import CSVParser
from src.kg.agents.entity_extractor_agent import EntityExtractorAgent
from src.kg.agents.relation_extractor_agent import RelationExtractorAgent
from src.kg.services.neo4j_service import get_neo4j_service
from src.kg.services.pipeline_orchestrator import get_orchestrator
from src.kg.models.document import Document, DocumentFormat

## 1. Parse CSV

In [None]:
# Parse the sample movies CSV. The parser returns a (df, metadata) pair that
# the following cells reuse, together with `csv_file` and `parser`.
csv_file = Path("/app/data/test_datasets/movies_sample.csv")

parser = CSVParser()
df, metadata = parser.parse(csv_file)

print("\n📊 CSV Metadata:")
# default=str: fall back to str() for any metadata value json cannot encode.
print(json.dumps(metadata, indent=2, default=str))

print("\n📋 DataFrame:")
# `display` is the rich-output helper injected by IPython/Jupyter.
display(df)

## 2. Extract Entities

In [None]:
# Convert to records
records = parser.to_records(df)

# Extract entities
entity_agent = EntityExtractorAgent()
entities = await entity_agent.extract_entities_batch(
    records=records,
    metadata=metadata,
    source_filename=csv_file.name
)

print(f"\n‚úÖ Extracted {len(entities)} entities\n")

# Group by type
entities_by_type = {}
for entity in entities:
    entity_type = entity.type.value
    if entity_type not in entities_by_type:
        entities_by_type[entity_type] = []
    entities_by_type[entity_type].append(entity.name)

for entity_type, names in entities_by_type.items():
    print(f"\n{entity_type} ({len(names)}):")
    for name in names[:10]:  # Show first 10
        print(f"  - {name}")

## 3. Extract Relations

In [None]:
# Extract relations
relation_agent = RelationExtractorAgent()
relations = await relation_agent.extract_relations_batch(
    records=records,
    entities=entities,
    metadata=metadata,
    source_filename=csv_file.name
)

print(f"\n‚úÖ Extracted {len(relations)} relations\n")

# Group by type
relations_by_type = {}
for relation in relations:
    rel_type = relation.type.value
    if rel_type not in relations_by_type:
        relations_by_type[rel_type] = []
    relations_by_type[rel_type].append(
        f"{relation.from_entity} -> {relation.to_entity}"
    )

for rel_type, rels in relations_by_type.items():
    print(f"\n{rel_type} ({len(rels)}):")
    for rel in rels[:10]:  # Show first 10
        print(f"  {rel}")

## 4. Store in Neo4j

In [None]:
# Obtain the Neo4j service; `neo4j` is reused by the later cells.
neo4j = get_neo4j_service()

# Write the extracted entities and report how many ids came back.
print("\n📦 Storing entities...")
entity_ids = neo4j.create_entities_batch(entities)
print(f"✅ Stored {len(entity_ids)} entities")

# Write the extracted relations and report how many ids came back.
print("\n🔗 Storing relations...")
relation_ids = neo4j.create_relations_batch(relations)
print(f"✅ Stored {len(relation_ids)} relations")

## 5. Graph Statistics

In [None]:
# Fetch summary statistics from the graph service and pretty-print them.
stats = neo4j.get_graph_stats()

print("\n📊 Graph Statistics:")
# default=str (as in the other json.dumps calls of this notebook) keeps the
# dump from raising TypeError if a value is not natively JSON-serializable.
print(json.dumps(stats, indent=2, default=str))

## 6. Test Complete Pipeline

In [None]:
# Clear graph first (optional)
# neo4j.clear_graph()

# Create document
document = Document(
    filename=csv_file.name,
    format=DocumentFormat.CSV,
    size_bytes=csv_file.stat().st_size
)

# Run complete pipeline
print("\nüöÄ Running complete pipeline...\n")

orchestrator = get_orchestrator()
result = await orchestrator.process_csv_file(csv_file, document)

print("\n‚úÖ Pipeline completed!")
print(f"\nüìä Results:")
print(json.dumps(result, indent=2, default=str))

## 7. Query Graph Data

In [None]:
# Fetch graph data (called with limit=50) and summarize it. The result is
# indexed by the 'nodes' and 'edges' keys below.
graph_data = neo4j.get_graph_data(limit=50)
nodes = graph_data['nodes']
edges = graph_data['edges']

print("\n📊 Graph Data:")
print(f"Nodes: {len(nodes)}")
print(f"Edges: {len(edges)}")

# Show at most five nodes; a node without a 'name' property prints 'N/A'.
print("\nSample nodes:")
for node in nodes[:5]:
    print(f"  - {node['label']}: {node['properties'].get('name', 'N/A')}")

# Show the type of at most five edges.
print("\nSample edges:")
for edge in edges[:5]:
    print(f"  - {edge['type']}")

## 8. Test Specific Entity Query

In [None]:
# Look up one entity by name and dump whatever the service returns for it.
entity_name = "Christopher Nolan"
entity_data = neo4j.get_entity_by_name(entity_name)

if entity_data:
    print(f"\n✅ Found entity: {entity_name}")
    # default=str (as in the other json.dumps calls of this notebook) keeps
    # the dump from raising TypeError on values json cannot encode natively.
    # NOTE(review): e.g. stored timestamps — confirm against the node schema.
    print(json.dumps(entity_data, indent=2, default=str))
else:
    print(f"\n❌ Entity not found: {entity_name}")

## 9. Cleanup (Optional)

In [None]:
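# Uncomment the two lines below to clear the graph (left disabled on purpose
# so that running every cell does not call clear_graph() by accident).
# neo4j.clear_graph()
# print("\n✅ Graph cleared")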