In [2]:
# Cài đặt thư viện
!pip install -q langchain langchain-community langchain-core neo4j transformers accelerate bitsandbytes torch pydantic

In [None]:
# Import libraries
import json
import os
import re
import time
from getpass import getpass
from typing import List

import torch
from langchain_community.graphs import Neo4jGraph
from pydantic import BaseModel, Field
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Set up the Neo4j connection.
# SECURITY: the password must never be stored in the notebook. It is read from
# the NEO4J_PASSWORD environment variable, with an interactive prompt as a
# fallback (a non-interactive "Run All" therefore needs the variable to be set).
# The password that was previously committed in this cell should be treated as
# compromised and rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://0c367113.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Shared graph handle used by the ingestion and reporting cells below.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

In [None]:
# Location of the pre-chunked chapter file (Kaggle dataset mount).
chunk_file = "/kaggle/input/vietnam-history-chunks/chapter10_chunk.json"

# Read the file as UTF-8 text and parse the whole JSON document.
with open(chunk_file, mode='r', encoding='utf-8') as chunk_fh:
    data = json.loads(chunk_fh.read())

# Only the "chunks" entry of the document is used by the rest of the notebook.
chunks = data['chunks']

In [None]:
# Load the Qwen2.5 instruct model from the Hugging Face Hub.
print("Đang load model Qwen2.5-7B-Instruct...")

model_name = "Qwen/Qwen2.5-7B-Instruct"

# The tokenizer also provides the chat template used to build prompts.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" lets the library place the weights on the available devices
# and float16 is requested as the weight dtype.
# `trust_remote_code` is deliberately NOT enabled: Qwen2.5 is supported natively
# by transformers, and enabling the flag would allow code shipped in the model
# repository to be executed in this kernel.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)


In [None]:
# Pydantic schemas used to validate the JSON produced by the LLM.
# NOTE(review): the class docstring and the Field descriptions are left verbatim
# because Pydantic surfaces them in the JSON schema that this cell prints;
# translating them would change that output.
class Entity(BaseModel):
    """Schema cho một entity"""
    # Entity name; add_to_graph uses it as the node's `name` property (MERGE key).
    name: str = Field(description="Tên thực thể")
    # Entity category; add_to_graph uses it as the node label. The description
    # lists the intended values, but any string passes validation.
    type: str = Field(description="Loại thực thể: PERSON, LOCATION, ORGANIZATION, EVENT, TIME")
    # Free-text description of the entity (required field).
    description: str = Field(description="Mô tả về thực thể")

# Directed relationship between two entities, referenced by entity name.
# NOTE(review): docstring and Field descriptions are kept verbatim because
# Pydantic surfaces them in the generated JSON schema.
class Relationship(BaseModel):
    """Schema cho một relationship"""
    # Name of the source entity (start of the relationship).
    source: str = Field(description="Tên entity nguồn")
    # Name of the target entity (end of the relationship).
    target: str = Field(description="Tên entity đích")
    # Relationship type; free-form string, no enumeration is enforced here.
    type: str = Field(description="Loại quan hệ")
    # Free-text description of the relationship (required field).
    description: str = Field(description="Mô tả quan hệ")

# Top-level container for one extraction result.
# NOTE(review): docstring and Field descriptions are kept verbatim because
# Pydantic surfaces them in the generated JSON schema.
class KnowledgeGraph(BaseModel):
    """Schema cho toàn bộ knowledge graph output"""
    # Both lists default to empty, so a JSON object missing either key still validates.
    entities: List[Entity] = Field(default_factory=list, description="Danh sách entities")
    relationships: List[Relationship] = Field(default_factory=list, description="Danh sách relationships")

# Confirm the schemas are defined and print the JSON schema generated from
# KnowledgeGraph so it can be inspected in the cell output.
print("Pydantic schemas đã sẵn sàng!")
print(f"\nSchema: {KnowledgeGraph.model_json_schema()}")

In [None]:
# Prompt template for extracting entities and relationships.
# It is filled in with str.format(text=...): `{text}` is the only placeholder,
# and the doubled braces `{{` / `}}` are escapes that render as literal braces
# in the JSON example shown to the model.
EXTRACTION_PROMPT = """Phân tích văn bản lịch sử Việt Nam sau và trích xuất entities và relationships.

QUY TẮC:
- Chỉ trích xuất thông tin CÓ TRONG văn bản
- Entity types: PERSON, LOCATION, ORGANIZATION, EVENT, TIME
- Relationship phải rõ ràng và có trong văn bản

VĂN BẢN:
{text}

Hãy trả về kết quả theo ĐÚNG định dạng JSON sau (ĐẢM BẢO cú pháp JSON hợp lệ):
{{
  "entities": [
    {{"name": "Hồ Chí Minh", "type": "PERSON", "description": "Chủ tịch nước"}},
    {{"name": "Việt Nam Dân chủ Cộng hòa", "type": "ORGANIZATION", "description": "Nhà nước"}}
  ],
  "relationships": [
    {{"source": "Hồ Chí Minh", "target": "Việt Nam Dân chủ Cộng hòa", "type": "LÃNH ĐẠO", "description": "Chủ tịch nước"}}
  ]
}}

CHỈ TRẢ VỀ JSON, KHÔNG GIẢI THÍCH THÊM:"""

def generate_structured(
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.1,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """Generate a completion for ``prompt`` with the module-level Qwen model.

    The prompt is wrapped in a chat conversation (fixed system message plus the
    user prompt), rendered with the tokenizer's chat template, and passed to
    ``model.generate``. Only the newly generated tokens are decoded.

    Args:
        prompt: User message sent to the model.
        max_new_tokens: Upper bound on generated tokens. Raise it if the JSON
            output gets cut off.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding (sampling is disabled and ``top_p`` is not passed).
        top_p: Nucleus-sampling threshold, used only when sampling.
        repetition_penalty: Repetition penalty passed to ``generate``.

    Returns:
        The decoded completion text with special tokens removed.
    """
    messages = [
        {"role": "system", "content": "Bạn là một chuyên gia phân tích văn bản lịch sử Việt Nam. Trả về kết quả dưới dạng JSON hợp lệ."},
        {"role": "user", "content": prompt}
    ]

    # Render the conversation as text, ending with the assistant turn marker.
    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily instead.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        output_ids = model.generate(**model_inputs, **generation_kwargs)

    # generate() returns prompt + completion; drop the prompt tokens of each row.
    completions = [
        generated[len(prompt_ids):]
        for prompt_ids, generated in zip(model_inputs.input_ids, output_ids)
    ]

    return tokenizer.batch_decode(completions, skip_special_tokens=True)[0]

In [None]:
def extract_knowledge(text: str, max_retries: int = 3) -> dict:
    """Extract entities and relationships from ``text``, retrying on bad LLM output.

    Each attempt asks the model for JSON, decodes the first JSON object found
    in the response, and validates it against ``KnowledgeGraph``.

    Args:
        text: Passage to analyse; substituted into ``EXTRACTION_PROMPT``.
        max_retries: Maximum number of generation attempts.

    Returns:
        ``{"entities": [...], "relationships": [...]}`` built from the validated
        models via ``model_dump()``. Both lists are empty when no attempt
        produced valid output (including ``max_retries <= 0``). Exceptions
        raised during an attempt are printed and lead to the next attempt.
    """
    # The prompt does not depend on the attempt number, so build it once.
    prompt = EXTRACTION_PROMPT.format(text=text)
    decoder = json.JSONDecoder()

    for attempt in range(max_retries):
        try:
            response = generate_structured(prompt)

            # Locate the span from the first '{' to the last '}' in the response.
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match is None:
                print(f"Không tìm thấy JSON (lần {attempt + 1}/{max_retries})")
                continue

            # raw_decode parses the first complete JSON object and ignores what
            # follows it. json.loads on the greedy match would fail whenever the
            # model appends text containing '}' (notes, a second object, ...).
            data, _ = decoder.raw_decode(json_match.group(0))

            # Validate the structure with Pydantic.
            kg = KnowledgeGraph(**data)

            result = {
                "entities": [e.model_dump() for e in kg.entities],
                "relationships": [r.model_dump() for r in kg.relationships]
            }

            if attempt > 0:
                print(f"Retry thành công sau {attempt + 1} lần thử")
            return result

        except json.JSONDecodeError as e:
            print(f"JSON decode error (lần {attempt + 1}/{max_retries}): {e}")

        except Exception as e:
            # Deliberate best-effort: validation or generation errors only cost a retry.
            print(f"Error (lần {attempt + 1}/{max_retries}): {e}")

    # All attempts failed (or none were allowed).
    return {"entities": [], "relationships": []}

In [None]:
# Helpers that write extracted entities and relationships into Neo4j
def _cypher_identifier(raw, default: str) -> str:
    """Return ``raw`` as a backtick-quoted Cypher label / relationship type.

    Labels and relationship types cannot be passed as query parameters, so they
    are interpolated into the query text. Quoting with backticks (and doubling
    any embedded backtick) keeps values containing spaces or symbols from
    breaking the query or injecting Cypher. Empty or None values fall back to
    ``default``.
    """
    cleaned = '' if raw is None else str(raw).strip()
    if not cleaned:
        cleaned = default
    return '`' + cleaned.replace('`', '``') + '`'


def add_to_graph(knowledge: dict, chunk_metadata: dict):
    """Write extracted knowledge into the Neo4j graph.

    Entities are MERGEd on ``name`` under a label taken from their ``type``;
    the description and first-seen chapter/page are set on creation, and the
    description is filled in on a match only if the node has none.
    Relationships are MERGEd between existing nodes matched by ``name`` (any
    label); if either endpoint does not exist, the query matches nothing and no
    relationship is created.

    Args:
        knowledge: Dict with optional ``entities`` and ``relationships`` lists
            of dicts (the shape returned by ``extract_knowledge``).
        chunk_metadata: Dict providing ``chapter`` and ``page_start`` for
            provenance properties; missing keys are stored as null.

    Errors raised by ``graph.query`` are printed and the item is skipped, so
    one bad item does not abort the rest of the chunk.
    """
    chunk_metadata = chunk_metadata or {}
    chapter = chunk_metadata.get('chapter')
    page = chunk_metadata.get('page_start')

    # 1. Add entities
    for entity in knowledge.get('entities', []):
        name = (entity.get('name') or '').strip()
        description = entity.get('description', '')

        # An entity without a name cannot be merged.
        if not name:
            continue

        label = _cypher_identifier(entity.get('type', 'UNKNOWN'), 'UNKNOWN')

        query = f"""
        MERGE (e:{label} {{name: $name}})
        ON CREATE SET 
            e.description = $description,
            e.first_seen_chapter = $chapter,
            e.first_seen_page = $page
        ON MATCH SET
            e.description = CASE WHEN e.description IS NULL OR e.description = '' 
                                THEN $description ELSE e.description END
        RETURN e
        """

        try:
            graph.query(query, {
                'name': name,
                'description': description,
                'chapter': chapter,
                'page': page
            })
        except Exception as e:
            print(f"Error adding entity {name}: {e}")

    # 2. Add relationships
    for rel in knowledge.get('relationships', []):
        source = (rel.get('source') or '').strip()
        target = (rel.get('target') or '').strip()
        description = rel.get('description', '')

        # Both endpoints are required.
        if not source or not target:
            continue

        # Normalise as before (spaces -> underscores, upper case), then quote.
        raw_type = rel.get('type') or 'RELATED_TO'
        rel_type = _cypher_identifier(
            str(raw_type).replace(' ', '_').upper(), 'RELATED_TO'
        )

        query = f"""
        MATCH (s {{name: $source}})
        MATCH (t {{name: $target}})
        MERGE (s)-[r:{rel_type}]->(t)
        ON CREATE SET 
            r.description = $description,
            r.chapter = $chapter,
            r.page = $page
        RETURN r
        """

        try:
            graph.query(query, {
                'source': source,
                'target': target,
                'description': description,
                'chapter': chapter,
                'page': page
            })
        except Exception as e:
            print(f"Error adding relationship {source}->{target}: {e}")

In [None]:
# Optionally wipe the existing graph before ingesting new data.
clear_old_data = False  # Set to True to delete every node and relationship

if not clear_old_data:
    print("Giữ nguyên dữ liệu cũ")
else:
    # DETACH DELETE removes each node together with its relationships.
    graph.query("MATCH (n) DETACH DELETE n")
    print("Đã xóa dữ liệu cũ")

In [None]:
# Process only the chunks in the half-open index range [start_chunk, end_chunk).
start_chunk = 0
end_chunk = 30
total_entities = 0
total_relationships = 0
errors = 0

# Clamp the upper bound to the chunks that actually exist, and never report a
# negative count when start_chunk lies beyond the end of the list.
stop_chunk = min(end_chunk, len(chunks))
num_chunks_to_process = max(0, stop_chunk - start_chunk)
print(f"Bắt đầu xử lý chunks từ {start_chunk} đến {stop_chunk-1} (tổng {num_chunks_to_process})...")
for i in tqdm(range(start_chunk, stop_chunk), desc="Processing chunks"):
    chunk = chunks[i]

    try:
        # Extract knowledge from the chunk text
        knowledge = extract_knowledge(chunk['content'])

        # Write it to Neo4j
        add_to_graph(knowledge, chunk['metadata'])

        # Running totals count what was extracted (not what Neo4j actually stored).
        entities_count = len(knowledge.get('entities', []))
        rels_count = len(knowledge.get('relationships', []))
        total_entities += entities_count
        total_relationships += rels_count

        # Report progress every 5 chunks
        if ((i - start_chunk + 1) % 5) == 0:
            print(f"\nProgress after {i-start_chunk+1} chunks:")
            print(f"   Entities: {total_entities}")
            print(f"   Relationships: {total_relationships}")
            print(f"   Errors: {errors}")

    except Exception as e:
        # extract_knowledge and add_to_graph print and swallow their own
        # failures, so this only counts exceptions that escape them
        # (e.g. a chunk missing the 'content' or 'metadata' key).
        errors += 1
        print(f"\nError at chunk {i}: {e}")

print("HOÀN TẤT!")
print("Thống kê:")
print(f"   - Chunks xử lý: {num_chunks_to_process}")
print(f"   - Tổng entities: {total_entities}")
print(f"   - Tổng relationships: {total_relationships}")
print(f"   - Errors: {errors}")
if num_chunks_to_process > 0:
    print(f"   - Success rate: {((num_chunks_to_process - errors) / num_chunks_to_process * 100):.1f}%")
else:
    # Avoid ZeroDivisionError when the selected range contains no chunks.
    print("   - Success rate: N/A (không có chunk nào được xử lý)")

In [None]:
# Inspect what is stored in Neo4j: node counts per (first) label, followed by
# the 20 most frequent relationship types.
print("=== Thống kê Entities ===")
count_query = """
MATCH (n)
RETURN labels(n)[0] as Type, count(n) as Count
ORDER BY Count DESC
"""
results = graph.query(count_query)
for row in results:
    node_label, node_count = row['Type'], row['Count']
    print(f"  {node_label}: {node_count}")

print("\n=== Thống kê Relationships ===")
rel_count_query = """
MATCH ()-[r]->()
RETURN type(r) as RelType, count(r) as Count
ORDER BY Count DESC
LIMIT 20
"""
rel_results = graph.query(rel_count_query)
for row in rel_results:
    rel_name, rel_count = row['RelType'], row['Count']
    print(f"  {rel_name}: {rel_count}")

In [None]:
# Rank entities by the number of distinct relationships attached to them
# (either direction) and list the 20 best-connected ones.
print("=== Top 20 Entities quan trọng nhất ===")
important_entities_query = """
MATCH (n)
OPTIONAL MATCH (n)-[r]-()
WITH n, count(DISTINCT r) as rel_count
WHERE rel_count > 0
RETURN n.name as Name, labels(n)[0] as Type, rel_count as Connections
ORDER BY rel_count DESC
LIMIT 20
"""
important = graph.query(important_entities_query)
for rank, row in enumerate(important, start=1):
    entity_name = row['Name']
    entity_label = row['Type']
    connection_count = row['Connections']
    print(f"{rank:2d}. {entity_name} ({entity_label}): {connection_count} connections")