In [0]:
dbutils.library.restartPython()

In [0]:
# Import libraries
import os
import requests
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import mlflow
import mlflow.sklearn

# LangChain imports
from langchain.agents import AgentType, initialize_agent, tool
from langchain.schema import SystemMessage
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.chat_models import ChatOpenAI
from langchain_community.llms import HuggingFaceHub

# Azure & Geospatial imports
try:    from azure.storage.blob import BlobServiceClient
except ModuleNotFoundError:
    print('Azure Storage Blob module not found. Please install azure-storage-blob package.')

from azure.identity import DefaultAzureCredential
from geopy.distance import geodesic

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [0]:
# Your specific Azure resource configuration
AZURE_CONFIG = {
    "resource_group": "irish-healthcare-agents-west-europe",
    "databricks_workspace": "irish-healthcare-db",
    "storage_account": "irishhealthdata",
    "location": "westeurope",
    "container_name": "supply-chain-data"
}

In [0]:
API_CONFIG = {
    "gnews_api_key": "",  # Replace with your actual key
    "gdacs_url": "https://www.gdacs.org/gdacsapi/api/events/geteventlist/SEARCH"
}

print("🔧 Azure Environment Configuration:")
for key, value in AZURE_CONFIG.items():
    print(f"   {key}: {value}")

🔧 Azure Environment Configuration:
   resource_group: irish-healthcare-agents-west-europe
   databricks_workspace: irish-healthcare-db
   storage_account: irishhealthdata
   location: westeurope
   container_name: supply-chain-data


In [0]:
def initialize_azure_storage():
    """Initialize connection to your Azure Storage Account"""
    try:
        # Using DefaultAzureCredential for authentication
        blob_service_client = BlobServiceClient(
            account_url=f"https://{AZURE_CONFIG['storage_account']}.blob.core.windows.net",
            credential=DefaultAzureCredential()
        )
        
        # Create container if it doesn't exist
        container_client = blob_service_client.get_container_client(AZURE_CONFIG['container_name'])
        try:
            container_client.create_container()
            print(f"✅ Created container: {AZURE_CONFIG['container_name']}")
        except:
            print(f"✅ Container already exists: {AZURE_CONFIG['container_name']}")
        
        print("✅ Azure Storage connection established successfully!")
        return blob_service_client
        
    except Exception as e:
        print(f"❌ Azure Storage connection failed: {e}")
        return None

# Initialize storage
storage_client = initialize_azure_storage()

✅ Container already exists: supply-chain-data
✅ Azure Storage connection established successfully!


In [0]:
# Create database
spark.sql("CREATE DATABASE IF NOT EXISTS supply_chain_analysis")
print("✅ Database 'supply_chain_analysis' created")

# COMMAND ----------

# Define schemas for supply chain data
gdacs_schema = StructType([
    StructField("event_id", StringType(), True),
    StructField("event_type", StringType(), True),
    StructField("event_name", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("severity", StringType(), True),
    StructField("country", StringType(), True),
    StructField("start_date", TimestampType(), True),
    StructField("end_date", TimestampType(), True),
    StructField("alert_level", StringType(), True),
    StructField("population_affected", IntegerType(), True),
    StructField("insert_timestamp", TimestampType(), True)
])

news_schema = StructType([
    StructField("article_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("content", StringType(), True),
    StructField("published_at", TimestampType(), True),
    StructField("source", StringType(), True),
    StructField("url", StringType(), True),
    StructField("keywords", ArrayType(StringType()), True),
    StructField("sentiment_score", DoubleType(), True),
    StructField("insert_timestamp", TimestampType(), True)
])


✅ Database 'supply_chain_analysis' created


In [0]:
# Create bronze layer tables
def create_bronze_tables():
    """Create bronze layer tables for raw data"""
    
    # GDACS Alerts Bronze Table
    spark.sql("""
        CREATE TABLE IF NOT EXISTS supply_chain_analysis.bronze_gdacs_alerts (
            event_id STRING,
            event_type STRING,
            event_name STRING,
            latitude DOUBLE,
            longitude DOUBLE,
            severity STRING,
            country STRING,
            start_date TIMESTAMP,
            end_date TIMESTAMP,
            alert_level STRING,
            population_affected INT,
            insert_timestamp TIMESTAMP,
            _file_source STRING,
            _load_timestamp TIMESTAMP
        )
        USING DELTA
        COMMENT 'Raw GDACS disaster alerts from API'
    """)
    
    # Supply Chain News Bronze Table
    spark.sql("""
        CREATE TABLE IF NOT EXISTS supply_chain_analysis.bronze_supply_chain_news (
            article_id STRING,
            title STRING,
            description STRING,
            content STRING,
            published_at TIMESTAMP,
            source STRING,
            url STRING,
            keywords ARRAY<STRING>,
            sentiment_score DOUBLE,
            insert_timestamp TIMESTAMP,
            _file_source STRING,
            _load_timestamp TIMESTAMP
        )
        USING DELTA
        COMMENT 'Raw supply chain news from GNews API'
    """)
    
    print("✅ Bronze layer tables created")

create_bronze_tables()

✅ Bronze layer tables created


In [0]:
# Create major ports dimension table
ports_data = [
    # Asia
    ("Shanghai", 31.2304, 121.4737, "China", "Major", "Very High", "Asia"),
    ("Singapore", 1.2644, 103.8220, "Singapore", "Major", "Very High", "Asia"),
    ("Shenzhen", 22.5431, 114.0579, "China", "Major", "High", "Asia"),
    ("Ningbo-Zhoushan", 29.8686, 121.5433, "China", "Major", "High", "Asia"),
    ("Hong Kong", 22.3193, 114.1694, "China", "Major", "High", "Asia"),
    ("Busan", 35.1796, 129.0756, "South Korea", "Major", "High", "Asia"),
    ("Qingdao", 36.0671, 120.3826, "China", "Major", "Medium", "Asia"),
    
    # Europe
    ("Rotterdam", 51.9225, 4.47917, "Netherlands", "Major", "High", "Europe"),
    ("Antwerp", 51.2291, 4.4053, "Belgium", "Major", "High", "Europe"),
    ("Hamburg", 53.5511, 9.9937, "Germany", "Major", "High", "Europe"),
    ("Felixstowe", 51.9617, 1.3513, "UK", "Major", "Medium", "Europe"),
    ("Valencia", 39.4699, -0.3763, "Spain", "Major", "Medium", "Europe"),
    
    # North America
    ("Los Angeles", 33.7175, -118.2675, "USA", "Major", "High", "North America"),
    ("Long Beach", 33.7623, -118.1954, "USA", "Major", "High", "North America"),
    ("New York", 40.6895, -74.1745, "USA", "Major", "High", "North America"),
    ("Savannah", 32.0814, -81.0914, "USA", "Major", "Medium", "North America"),
    ("Vancouver", 49.2827, -123.1207, "Canada", "Major", "Medium", "North America"),
    
    # Middle East
    ("Jebel Ali", 25.0263, 55.0564, "UAE", "Major", "High", "Middle East"),
    ("Salalah", 16.9344, 54.0239, "Oman", "Major", "Medium", "Middle East")
]

ports_df = spark.createDataFrame(ports_data, [
    "port_name", "latitude", "longitude", "country", 
    "port_size", "capacity_rating", "region"
])

ports_df.write.mode("overwrite").saveAsTable("supply_chain_analysis.dim_ports")
print("✅ Ports dimension table created with 17 major global ports")

# Display the ports
display(ports_df)

✅ Ports dimension table created with 17 major global ports


port_name,latitude,longitude,country,port_size,capacity_rating,region
Shanghai,31.2304,121.4737,China,Major,Very High,Asia
Singapore,1.2644,103.822,Singapore,Major,Very High,Asia
Shenzhen,22.5431,114.0579,China,Major,High,Asia
Ningbo-Zhoushan,29.8686,121.5433,China,Major,High,Asia
Hong Kong,22.3193,114.1694,China,Major,High,Asia
Busan,35.1796,129.0756,South Korea,Major,High,Asia
Qingdao,36.0671,120.3826,China,Major,Medium,Asia
Rotterdam,51.9225,4.47917,Netherlands,Major,High,Europe
Antwerp,51.2291,4.4053,Belgium,Major,High,Europe
Hamburg,53.5511,9.9937,Germany,Major,High,Europe


In [0]:
# Set up MLflow experiment
experiment_name = "/Users/{}/supply_chain_risk_prediction".format(
    dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
)

mlflow.set_experiment(experiment_name)
print(f"✅ MLflow experiment configured: {experiment_name}")


2025/10/04 18:09:18 INFO mlflow.tracking.fluent: Experiment with name '/Users/u1025325052@gmail.com/supply_chain_risk_prediction' does not exist. Creating a new experiment.


✅ MLflow experiment configured: /Users/u1025325052@gmail.com/supply_chain_risk_prediction


In [0]:
def validate_environment():
    """Validate that all components are working"""
    tests_passed = 0
    total_tests = 5
    
    print("🔍 Running Environment Validation...")
    
    # Test 1: Database accessible
    try:
        spark.sql("USE supply_chain_analysis")
        print("✅ Test 1: Database accessible")
        tests_passed += 1
    except:
        print("❌ Test 1: Database not accessible")
    
    # Test 2: Tables exist
    try:
        tables = spark.sql("SHOW TABLES IN supply_chain_analysis").collect()
        table_count = len(tables)
        if table_count >= 3:
            print(f"✅ Test 2: {table_count} tables exist")
            tests_passed += 1
        else:
            print(f"❌ Test 2: Only {table_count} tables found")
    except:
        print("❌ Test 2: Could not list tables")
    
    # Test 3: MLflow accessible
    try:
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment:
            print("✅ Test 3: MLflow experiment configured")
            tests_passed += 1
        else:
            print("❌ Test 3: MLflow experiment not found")
    except:
        print("❌ Test 3: MLflow test failed")
    
    # Test 4: Storage connection
    if storage_client:
        print("✅ Test 4: Azure Storage connected")
        tests_passed += 1
    else:
        print("❌ Test 4: Azure Storage connection failed")
    
    # Test 5: Spark operations
    try:
        test_df = spark.range(100)
        if test_df.count() == 100:
            print("✅ Test 5: Spark operations working")
            tests_passed += 1
        else:
            print("❌ Test 5: Spark operations failing")
    except:
        print("❌ Test 5: Spark test failed")
    
    # Summary
    print(f"\n🎯 Validation Results: {tests_passed}/{total_tests} tests passed")
    
    if tests_passed == total_tests:
        print("🚀 Environment is READY for data ingestion!")
        return True
    else:
        print("⚠️  Some components need attention")
        return False

# Run validation
environment_ready = validate_environment()

🔍 Running Environment Validation...
✅ Test 1: Database accessible
✅ Test 2: 3 tables exist
✅ Test 3: MLflow experiment configured
✅ Test 4: Azure Storage connected
✅ Test 5: Spark operations working

🎯 Validation Results: 5/5 tests passed
🚀 Environment is READY for data ingestion!


In [0]:
# Save configuration to DBFS for other notebooks
config_path = "/dbfs/FileStore/supply_chain/config.json"
os.makedirs(os.path.dirname(config_path), exist_ok=True)

# PROPER configuration structure with ACTUAL values
config_data = {
    "azure_config": AZURE_CONFIG,
    "api_config": API_CONFIG,  # This contains the actual values
    "environment_ready": environment_ready,
    "setup_timestamp": datetime.now().isoformat(),
    "ports_count": ports_df.count(),
    "mlflow_experiment": experiment_name,
    "databricks_runtime": "13.3 LTS"
}

print("💾 Saving configuration with ACTUAL values:")
print(f"   • GNews API Key in config: {'✅ PRESENT' if 'gnews_api_key' in config_data['api_config'] else '❌ MISSING'}")
print(f"   • GNews API Key value: {config_data['api_config']['gnews_api_key'][:10]}...")
print(f"   • GDACS URL: {config_data['api_config']['gdacs_url']}")

with open(config_path, "w") as f:
    json.dump(config_data, f, indent=2)

print(f"✅ Configuration saved to: {config_path}")

# Verify the saved configuration
print("\n🔍 VERIFYING SAVED CONFIGURATION:")
with open(config_path, "r") as f:
    saved_config = json.load(f)
    print(f"   • GNews API Key saved: {saved_config['api_config']['gnews_api_key'][:10]}...")
    print(f"   • GDACS URL saved: {saved_config['api_config']['gdacs_url']}")

✅ Configuration saved to: /dbfs/FileStore/supply_chain/config.json


In [0]:
if environment_ready:
    print("""
    🎉 ENVIRONMENT SETUP COMPLETE!
    
    Next steps:
    1. ✅ Replace GNews API key in the configuration
    2. ➡️ Proceed to Notebook 2: Data Ingestion Pipeline
    3. 🔧 Configure Azure OpenAI (optional for now)
    
    Your Azure Resources:
    • Resource Group: irish-healthcare-agents-west-europe
    • Databricks: irish-healthcare-db
    • Storage: irishhealthdata
    • Location: West Europe
    """)
else:
    print("""
    ⚠️  ENVIRONMENT NEEDS ATTENTION
    
    Please check:
    1. 🔑 Azure authentication permissions
    2. 📊 Database connectivity
    3. 🔧 Storage account access
    """)


    🎉 ENVIRONMENT SETUP COMPLETE!
    
    Next steps:
    1. ✅ Replace GNews API key in the configuration
    2. ➡️ Proceed to Notebook 2: Data Ingestion Pipeline
    3. 🔧 Configure Azure OpenAI (optional for now)
    
    Your Azure Resources:
    • Resource Group: irish-healthcare-agents-west-europe
    • Databricks: irish-healthcare-db
    • Storage: irishhealthdata
    • Location: West Europe
    


In [0]:
# Display current configuration
print("🔧 CURRENT CONFIGURATION STATUS:")
print(f"   Storage Account: {AZURE_CONFIG['storage_account']} ✅")
print(f"   Database: supply_chain_analysis ✅")  
print(f"   MLflow Experiment: {experiment_name} ✅")
print(f"   GNews API Key: {'❌ NEEDS CONFIGURATION' if API_CONFIG['gnews_api_key'] == 'YOUR_GNEWS_API_KEY_HERE' else '✅ CONFIGURED'}")
#print(f"   Azure OpenAI: {'⚡ OPTIONAL' if API_CONFIG['openai_api_key'] == 'YOUR_AZURE_OPENAI_KEY' else '✅ CONFIGURED'}")

🔧 CURRENT CONFIGURATION STATUS:
   Storage Account: irishhealthdata ✅
   Database: supply_chain_analysis ✅
   MLflow Experiment: /Users/u1025325052@gmail.com/supply_chain_risk_prediction ✅
   GNews API Key: ✅ CONFIGURED


[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyError[0m                                  Traceback (most recent call last)
File [0;32m<command-8878759409946456>, line 7[0m
[1;32m      5[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;124m   MLflow Experiment: [39m[38;5;132;01m{[39;00mexperiment_name[38;5;132;01m}[39;00m[38;5;124m ✅[39m[38;5;124m"[39m)
[1;32m      6[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;124m   GNews API Key: [39m[38;5;132;01m{[39;00m[38;5;124m'[39m[38;5;124m❌ NEEDS CONFIGURATION[39m[38;5;124m'[39m [38;5;28;01mif[39;00m API_CONFIG[[38;5;124m'[39m[38;5;124mgnews_api_key[39m[38;5;124m'[39m] [38;5;241m==[39m [38;5;124m'[39m[38;5;124mYOUR_GNEWS_API_KEY_HERE[39m[38;5;124m'[39m [38;5;28;01melse[39;00m [38;5;124m'[39m[38;5;124m✅ CONFIGURED[39m[38;5;124m'[39m[38;5;132;01m}[39;00m[38;5;124m"[39m)
[0;32m----> 7[0m [38;5;28mprint[39m