In [None]:
# Import required libraries
import sys
import os
sys.path.append('..')

from scripts.inference import DisasterInformationExtractor
import json
from pathlib import Path

In [None]:
# Initialize the extractor with trained models.
# DisasterInformationExtractor (imported from scripts.inference) is constructed
# from the models directory path; the resulting `extractor` is reused by every
# later cell for single-article and batch extraction.
# NOTE(review): "models/" is resolved relative to the kernel's working
# directory, while the import cell adds '..' to sys.path. If this notebook is
# run from a subdirectory, the models may actually live in "../models/" - confirm.
models_dir = "models/"  # Path to your trained models
extractor = DisasterInformationExtractor(models_dir)

## Sample Disaster News Articles

Let's test the system with some sample Vietnamese disaster news articles.

In [None]:
# Sample disaster news articles (Vietnamese). Each entry is one short news
# snippet; the list is fed to the extractor one item at a time and as a batch.
sample_articles = [
    # Flood in Nghe An province: damage of hundreds of billions of dong,
    # government mobilised emergency rescue forces.
    "Lũ lụt nghiêm trọng xảy ra tại tỉnh Nghệ An hôm qua, gây thiệt hại hàng trăm tỷ đồng. Chính phủ đã huy động lực lượng cứu hộ khẩn cấp.",

    # Magnitude 6.5 earthquake in Sin Ho district, Lai Chau: no casualties,
    # many houses damaged.
    "Động đất mạnh 6.5 độ Richter xảy ra ở huyện Sìn Hồ, Lai Châu vào sáng nay. Không có thương vong nhưng nhiều nhà cửa bị hư hại.",

    # Storm No. 3 making landfall in the central provinces: level-12 winds,
    # flooded crops, estimated damage of 500 billion dong.
    "Bão số 3 đổ bộ vào các tỉnh miền Trung, gió mạnh lên đến cấp 12. Hàng ngàn hecta lúa và hoa màu bị ngập úng, thiệt hại ước tính 500 tỷ đồng.",

    # Fire at Long Hau industrial park, Dong Nai: initial cause identified as
    # an electrical short circuit, no casualties.
    "Hỏa hoạn xảy ra tại khu công nghiệp Long Hậu, Đồng Nai đêm qua. Nguyên nhân ban đầu được xác định do chập điện. Không có thương vong.",

    # Landslide in Muong Te district, Lai Chau after prolonged rain and
    # flooding: 5 people missing, local authorities searching.
    "Sạt lở đất nghiêm trọng tại huyện Mường Tè, Lai Châu do mưa lũ kéo dài. 5 người mất tích, chính quyền địa phương đang tích cực tìm kiếm."
]

## Extract Disaster Information

Now let's extract disaster information from each article using our fine-tuned models.

In [None]:
# Process each article individually and print a human-readable report for it.


def _format_confidence(value):
    """Render a confidence score consistently across the report.

    Numeric scores are shown with three decimals. Anything that cannot be
    formatted that way (e.g. the 'N/A' placeholder used when an entity has no
    confidence, or None) is shown as plain text instead of raising.
    """
    try:
        return f"{value:.3f}"
    except (TypeError, ValueError):
        return str(value)


# One extraction result per article, in the same order as sample_articles.
results = []

for i, article in enumerate(sample_articles, 1):
    print(f"\n=== Article {i} ===")
    print(f"Text: {article}")

    # Extract disaster information
    disaster_info = extractor.extract_disaster_info(article)
    results.append(disaster_info)

    # Display results
    event_confidence = _format_confidence(disaster_info['event_confidence'])
    print(f"\nEvent Type: {disaster_info['event_type']} (Confidence: {event_confidence})")

    print("\nExtracted Entities:")
    for entity in disaster_info['entities']:
        # Entities may lack a confidence; fall back to the 'N/A' placeholder.
        entity_confidence = _format_confidence(entity.get('confidence', 'N/A'))
        print(f"  - {entity['type']}: '{entity['text']}' (confidence: {entity_confidence})")

    print("\nStructured Information:")
    structured = disaster_info.get('structured_info', {})
    for key, value in structured.items():
        # Skip empty/None fields so the report only lists what was found.
        if value:
            print(f"  - {key}: {value}")

    print("\nRelations:")
    for relation in disaster_info['relations']:
        head = relation['head']['text']
        tail = relation['tail']['text']
        rel_type = relation['relation_type']
        confidence = _format_confidence(relation['confidence'])
        print(f"  - {head} --[{rel_type}]--> {tail} (confidence: {confidence})")

    print("=" * 50)

## Batch Processing

To process multiple articles efficiently, use the extractor's `batch_extract` method, which takes the whole list of articles in a single call.

In [None]:
# Run the whole sample set through the extractor in one batch call
print("Processing all articles in batch...")
batch_results = extractor.batch_extract(sample_articles)
print(f"Processed {len(batch_results)} articles successfully!")

# Persist the batch output as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps the Vietnamese text readable instead of \u-escaped.
output_file = "disaster_extraction_results.json"
serialized_results = json.dumps(batch_results, indent=2, ensure_ascii=False)
Path(output_file).write_text(serialized_results, encoding='utf-8')

print(f"Results saved to {output_file}")

## Analysis and Visualization

Let's analyze the extraction results and create some visualizations.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Tally labels: one event type per article, one entity type per extracted entity
event_types = [result['event_type'] for result in batch_results]
all_entities = [
    entity['type']
    for result in batch_results
    for entity in result['entities']
]

entity_counts = Counter(all_entities)
event_counts = Counter(event_types)

# Two side-by-side bar charts: entity types on the left, event types on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    (ax1, entity_counts, 'Distribution of Extracted Entity Types', 'Entity Type'),
    (ax2, event_counts, 'Distribution of Disaster Event Types', 'Event Type'),
)
for axis, counts, title, x_label in panels:
    axis.bar(counts.keys(), counts.values())
    axis.set_title(title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Count')
    # Rotate category labels so longer type names do not overlap
    axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('extraction_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Text version of the same two distributions
listings = (
    ("\nEntity Type Distribution:", entity_counts),
    ("\nEvent Type Distribution:", event_counts),
)
for heading, counts in listings:
    print(heading)
    for label, count in counts.items():
        print(f"  {label}: {count}")

## Model Confidence Analysis

Let's analyze the confidence scores of our model predictions.

In [None]:
# Analyze confidence scores of event classification and relation extraction.


def _plot_confidence_hist(ax, scores, title, empty_message):
    """Draw a 10-bin histogram of `scores` on `ax`.

    When `scores` is empty, a centred `empty_message` is shown instead so the
    panel is never a blank, unexplained axes.
    """
    ax.set_title(title)
    if not scores:
        ax.text(0.5, 0.5, empty_message,
                horizontalalignment='center', verticalalignment='center',
                transform=ax.transAxes, fontsize=12)
        return
    ax.hist(scores, bins=10, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Confidence Score')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)


def _confidence_summary(label, scores, empty_message):
    """Return a one-line mean/min/max summary, or `empty_message` if no scores.

    Guarding the empty case avoids ZeroDivisionError from the mean and
    ValueError from min()/max() on an empty list.
    """
    if not scores:
        return f"{label} - {empty_message}"
    mean_score = sum(scores) / len(scores)
    return f"{label} - Mean: {mean_score:.3f}, Min: {min(scores):.3f}, Max: {max(scores):.3f}"


# One confidence per article, and one per extracted relation across all articles.
event_confidences = [result['event_confidence'] for result in batch_results]
relation_confidences = [
    relation['confidence']
    for result in batch_results
    for relation in result['relations']
]

# Create confidence analysis plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

_plot_confidence_hist(ax1, event_confidences,
                      'Event Classification Confidence Distribution',
                      'No events classified')
_plot_confidence_hist(ax2, relation_confidences,
                      'Relation Extraction Confidence Distribution',
                      'No relations extracted')

plt.tight_layout()
plt.savefig('confidence_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nConfidence Statistics:")
print(_confidence_summary('Event Classification', event_confidences, 'No events classified'))
print(_confidence_summary('Relation Extraction', relation_confidences, 'No relations extracted'))

## Export Structured Data

Finally, let's export the extracted information in a structured format suitable for further analysis or integration with other systems.

In [None]:
# Create structured dataset for further analysis and save it as JSON.


def _structured_field(result, key, default=None):
    """Look up one structured field of an extraction result.

    The per-article report reads these fields from the nested
    'structured_info' dict, so that location is preferred here. A top-level
    key of the same name is still honoured as a fallback, and `default` is
    returned when the field is found in neither place.
    NOTE(review): assumes batch results use the same 'structured_info' layout
    as single-article results - confirm against the extractor.
    """
    nested = result.get('structured_info') or {}
    if key in nested:
        return nested[key]
    return result.get(key, default)


structured_dataset = []

for article_id, result in enumerate(batch_results, start=1):
    structured_dataset.append({
        'article_id': article_id,
        # Assumes each result echoes its input under 'text' - TODO confirm.
        'original_text': result['text'],
        'event_type': result['event_type'],
        'event_confidence': result['event_confidence'],
        'entities': result['entities'],
        'relations': result['relations'],
        'structured_info': {
            'disaster_type': _structured_field(result, 'disaster_type'),
            'location': _structured_field(result, 'location', []),
            'time': _structured_field(result, 'time'),
            'damage': _structured_field(result, 'damage', []),
            'response': _structured_field(result, 'response', []),
            'impact': _structured_field(result, 'impact', []),
            'forecast': _structured_field(result, 'forecast')
        }
    })

# Save structured dataset (UTF-8, non-ASCII characters kept readable)
structured_file = "structured_disaster_data.json"
with open(structured_file, 'w', encoding='utf-8') as f:
    json.dump(structured_dataset, f, indent=2, ensure_ascii=False)

print(f"Structured dataset saved to {structured_file}")
print(f"Total articles processed: {len(structured_dataset)}")

# Display summary
print("\n=== Processing Summary ===")
print(f"Total articles: {len(structured_dataset)}")
print(f"Unique event types: {len(set(item['event_type'] for item in structured_dataset))}")
print(f"Total entities extracted: {sum(len(item['entities']) for item in structured_dataset)}")
print(f"Total relations extracted: {sum(len(item['relations']) for item in structured_dataset)}")

# Show sample structured output (guarded: an empty batch has no first item)
print("\n=== Sample Structured Output ===")
if structured_dataset:
    sample = structured_dataset[0]
    print(json.dumps({
        'article_id': sample['article_id'],
        'event_type': sample['event_type'],
        'structured_info': sample['structured_info']
    }, indent=2, ensure_ascii=False))
else:
    print("No articles were processed.")

## Conclusion

This notebook demonstrated the complete disaster information extraction pipeline using fine-tuned models optimized for Vietnamese journalism. The system successfully:

1. **Entity Recognition**: Extracted disaster-related entities (location, damage, response, etc.)
2. **Event Classification**: Classified disaster event types and reported a confidence score for each prediction
3. **Relation Extraction**: Identified relationships between extracted entities
4. **Structured Output**: Provided clean, structured data for downstream applications

### Next Steps

- Integrate with the existing RAG system for enhanced disaster information processing
- Deploy models as REST API for real-time processing
- Fine-tune on larger, more diverse datasets
- Add support for additional entity types and relation types
- Implement model monitoring and retraining pipelines

### Files Generated

- `disaster_extraction_results.json`: Complete extraction results
- `structured_disaster_data.json`: Structured dataset for analysis
- `extraction_analysis.png`: Entity and event type distributions
- `confidence_analysis.png`: Model confidence distributions