## 1. Setup & Import Libraries

In [1]:
# Install required packages
# !pip install flask flask-cors pandas numpy scikit-learn python-dotenv -q

In [2]:
from flask import Flask, jsonify, request
from flask_cors import CORS
import pandas as pd
import numpy as np
from pathlib import Path
import json
from datetime import datetime
from sklearn.linear_model import LinearRegression

## 2. Load Data from Modeling Results

In [3]:
# Folders holding the modeling outputs and the cleaned source tables
DATA_DIR = Path('data/result')
CLEANED_DIR = Path('data/cleaned')

# Clustering artefacts written by the modeling step
clustering_results = pd.read_csv(DATA_DIR / 'clustering_results.csv')
cluster_profiles = pd.read_csv(DATA_DIR / 'cluster_profiles.csv')
cluster_centroids = pd.read_csv(DATA_DIR / 'cluster_centroids.csv')

# Cleaned wide table, loaded for additional features
data_integrated = pd.read_csv(CLEANED_DIR / 'data_integrated_wide.csv')

banner = "=" * 60
print(banner)
print("DATA LOADED SUCCESSFULLY")
print(banner)

# Report the shape of every table that was just read
for title, frame in (
    ("Clustering Results", clustering_results),
    ("Cluster Profiles", cluster_profiles),
    ("Cluster Centroids", cluster_centroids),
    ("Integrated Data", data_integrated),
):
    print(f"{title}: {frame.shape}")

print("\nClustering Results columns:")
print(list(clustering_results.columns))
print("\nSample data:")
clustering_results.head()

DATA LOADED SUCCESSFULLY
Clustering Results: (514, 6)
Cluster Profiles: (3, 11)
Cluster Centroids: (3, 3)
Integrated Data: (1028, 47)

Clustering Results columns:
['Kabupaten_Kota', 'Tahun', 'Region', 'Pengeluaran_Buah', 'Pengeluaran_Sayur', 'Cluster']

Sample data:


Unnamed: 0,Kabupaten_Kota,Tahun,Region,Pengeluaran_Buah,Pengeluaran_Sayur,Cluster
0,Aceh Barat,2024,Aceh,11160.0,15821.0,0
1,Aceh Barat Daya,2024,Aceh,7231.0,13790.0,1
2,Aceh Besar,2024,Aceh,6689.0,14052.0,1
3,Aceh Jaya,2024,Aceh,8789.0,14197.0,1
4,Aceh Selatan,2024,Aceh,5682.0,14771.0,1


## 3. Data Processing & Preparation

Prepare data untuk API responses dengan enrichment dan transformations.

In [4]:
def prepare_cluster_data():
    """
    Build the enriched clustering table from the global ``clustering_results``.

    Two columns are appended to a copy of the input (the global frame itself
    is left untouched):
    - ``Cluster_Label``: descriptive name looked up from the numeric cluster id
      (ids without an entry in the lookup become NaN)
    - ``Cluster_Category``: currently a plain copy of ``Cluster_Label``

    Returns:
        pandas.DataFrame: the clustering results plus the two new columns.
    """
    # Descriptive names for the numeric cluster ids
    label_by_cluster = {
        0: "Low Expenditure",
        1: "Balanced Expenditure",
        2: "High Expenditure",
    }

    # assign() returns a new frame; the second column can reference the first
    # because keyword arguments are applied in order.
    enriched = clustering_results.assign(
        Cluster_Label=lambda frame: frame['Cluster'].map(label_by_cluster),
        Cluster_Category=lambda frame: frame['Cluster_Label'],
    )
    return enriched

# Build the enriched table once; later cells and the API endpoints read it
enriched_data = prepare_cluster_data()

# The check mark was stored mis-encoded (UTF-8 bytes decoded as cp1252); use the real glyph
print("✓ Data preparation complete!")
print(f"\nEnriched data shape: {enriched_data.shape}")
print("\nNew columns added:")
print([col for col in enriched_data.columns if col not in clustering_results.columns])
print("\nSample enriched data:")
enriched_data.head()

✓ Data preparation complete!

Enriched data shape: (514, 8)

New columns added:
['Cluster_Label', 'Cluster_Category']

Sample enriched data:


Unnamed: 0,Kabupaten_Kota,Tahun,Region,Pengeluaran_Buah,Pengeluaran_Sayur,Cluster,Cluster_Label,Cluster_Category
0,Aceh Barat,2024,Aceh,11160.0,15821.0,0,Low Expenditure,Low Expenditure
1,Aceh Barat Daya,2024,Aceh,7231.0,13790.0,1,Balanced Expenditure,Balanced Expenditure
2,Aceh Besar,2024,Aceh,6689.0,14052.0,1,Balanced Expenditure,Balanced Expenditure
3,Aceh Jaya,2024,Aceh,8789.0,14197.0,1,Balanced Expenditure,Balanced Expenditure
4,Aceh Selatan,2024,Aceh,5682.0,14771.0,1,Balanced Expenditure,Balanced Expenditure


## 4. Generate Predictions for 2025

Menggunakan Linear Regression untuk prediksi tren pengeluaran tahun 2025.

In [5]:
# Column order of the predictions table. Passing it to the DataFrame constructor
# gives an empty result the same schema as a populated one, so downstream code
# can filter on e.g. 'Cluster' without a KeyError.
_PREDICTION_COLUMNS = [
    'Kabupaten_Kota', 'Region', 'Cluster', 'Cluster_Label',
    'Predicted_Buah_2025', 'Predicted_Sayur_2025', 'Predicted_Total_2025',
    'Current_Buah_2024', 'Current_Sayur_2024',
    'Growth_Rate_Buah', 'Growth_Rate_Sayur',
]


def _predict_2025(kab_data, years, total_col, fallback_col):
    """Predict the 2025 value of one expenditure series for a single kabupaten/kota."""
    if total_col in kab_data.columns:
        model = LinearRegression()
        model.fit(years, kab_data[total_col].values)
        return model.predict([[2025]])[0]
    # NOTE(review): without the Total_* column this applies a flat 5% growth
    # assumption to the latest value, whereas the generated api.py fits the
    # regression on Pengeluaran_* directly - confirm which one is intended.
    return kab_data[fallback_col].iloc[-1] * 1.05


def _growth_rate_pct(predicted, current):
    """Percentage change from current to predicted; 0.0 when current is not positive."""
    # Guard against division by zero (and NaN), mirroring the check in api.py
    if current > 0:
        return float((predicted / current - 1) * 100)
    return 0.0


def generate_predictions_2025(data=None):
    """
    Generate 2025 expenditure predictions per kabupaten/kota.

    Only kabupaten/kota with at least two rows (years) are predicted; the
    others are skipped. With single-year data the result is therefore empty,
    but it still carries the full set of columns.

    Args:
        data: Optional DataFrame with the same columns as ``enriched_data``
            ('Kabupaten_Kota', 'Tahun', 'Cluster', 'Cluster_Label', the
            expenditure columns and optionally 'Region'). Defaults to the
            global ``enriched_data``.

    Returns:
        pandas.DataFrame: one row per predicted kabupaten/kota, with the
        columns listed in ``_PREDICTION_COLUMNS``.
    """
    source = enriched_data if data is None else data
    predictions = []

    # A single groupby pass replaces re-filtering the whole frame per kabupaten;
    # sort=False keeps the first-appearance order of the original loop.
    for kabupaten, group in source.groupby('Kabupaten_Kota', sort=False):
        kab_data = group.sort_values('Tahun')

        if len(kab_data) < 2:  # Need at least 2 years for prediction
            continue

        years = kab_data['Tahun'].values.reshape(-1, 1)
        pred_buah_2025 = _predict_2025(kab_data, years, 'Total_Buah', 'Pengeluaran_Buah')
        pred_sayur_2025 = _predict_2025(kab_data, years, 'Total_Sayur', 'Pengeluaran_Sayur')

        # Latest year supplies the cluster info and the "current" values
        latest = kab_data.iloc[-1]
        current_buah = latest.get('Total_Buah', latest.get('Pengeluaran_Buah', 0))
        current_sayur = latest.get('Total_Sayur', latest.get('Pengeluaran_Sayur', 0))

        predictions.append({
            'Kabupaten_Kota': kabupaten,
            'Region': latest.get('Region', 'Unknown'),
            'Cluster': int(latest['Cluster']),
            'Cluster_Label': latest['Cluster_Label'],
            'Predicted_Buah_2025': float(pred_buah_2025),
            'Predicted_Sayur_2025': float(pred_sayur_2025),
            'Predicted_Total_2025': float(pred_buah_2025 + pred_sayur_2025),
            'Current_Buah_2024': float(current_buah),
            'Current_Sayur_2024': float(current_sayur),
            'Growth_Rate_Buah': _growth_rate_pct(pred_buah_2025, current_buah),
            'Growth_Rate_Sayur': _growth_rate_pct(pred_sayur_2025, current_sayur),
        })

    return pd.DataFrame(predictions, columns=_PREDICTION_COLUMNS)

# Generate predictions (empty when the data covers a single year only)
predictions_2025 = generate_predictions_2025()

# The check mark was stored mis-encoded (UTF-8 bytes decoded as cp1252); use the real glyph
print("✓ Predictions generated for 2025!")
print(f"\nTotal predictions: {len(predictions_2025)}")
print("\nSample predictions:")
predictions_2025.head(10)

✓ Predictions generated for 2025!

Total predictions: 0

Sample predictions:


## 5. Create Summary Statistics

In [6]:
def generate_summary_statistics():
    """
    Assemble the summary statistics dictionary used by the dashboard.

    Reads the globals ``enriched_data``, ``cluster_profiles`` and
    ``cluster_centroids``.

    Returns:
        dict: with the keys 'overview', 'cluster_distribution',
        'cluster_labels', 'regional_distribution', 'expenditure_summary',
        'cluster_profiles' and 'centroids'.
    """
    placeholder = pd.Series([0])

    def pick_series(preferred, fallback):
        # Prefer the Total_* column, then Pengeluaran_*, then a single-zero series
        return enriched_data.get(preferred, enriched_data.get(fallback, placeholder))

    buah = pick_series('Total_Buah', 'Pengeluaran_Buah')
    sayur = pick_series('Total_Sayur', 'Pengeluaran_Sayur')

    overview = {
        'total_kabupaten': int(enriched_data['Kabupaten_Kota'].nunique()),
        'total_clusters': int(enriched_data['Cluster'].nunique()),
        'years_covered': sorted(enriched_data['Tahun'].unique().tolist()),
        'total_data_points': int(len(enriched_data)),
    }

    per_cluster = enriched_data.groupby('Cluster')

    if 'Region' in enriched_data.columns:
        regional_distribution = enriched_data['Region'].value_counts().to_dict()
    else:
        regional_distribution = {}

    expenditure_summary = {
        'avg_buah': float(buah.mean()),
        'avg_sayur': float(sayur.mean()),
        'max_buah': float(buah.max()),
        'max_sayur': float(sayur.max()),
        'min_buah': float(buah.min()),
        'min_sayur': float(sayur.min()),
    }

    return {
        'overview': overview,
        'cluster_distribution': per_cluster['Kabupaten_Kota'].count().to_dict(),
        'cluster_labels': per_cluster['Cluster_Label'].first().to_dict(),
        'regional_distribution': regional_distribution,
        'expenditure_summary': expenditure_summary,
        'cluster_profiles': cluster_profiles.to_dict('records'),
        'centroids': cluster_centroids.to_dict('records'),
    }

# Generate statistics once; the export cell and the /api/statistics endpoint reuse them
summary_stats = generate_summary_statistics()

# The check mark was stored mis-encoded (UTF-8 bytes decoded as cp1252); use the real glyph
print("✓ Summary statistics generated!")
print("\nOverview:")
print(json.dumps(summary_stats['overview'], indent=2))
print("\nCluster Distribution:")
print(json.dumps(summary_stats['cluster_distribution'], indent=2))

✓ Summary statistics generated!

Overview:
{
  "total_kabupaten": 514,
  "total_clusters": 3,
  "years_covered": [
    2024
  ],
  "total_data_points": 514
}

Cluster Distribution:
{
  "0": 209,
  "1": 298,
  "2": 7
}


## 6. Export Data for Frontend

Export processed data ke format yang mudah dikonsumsi oleh frontend.

In [7]:
# Create the output directory read by the frontend
frontend_data_dir = Path('frontend/public/data')
frontend_data_dir.mkdir(parents=True, exist_ok=True)

# Modeling artefacts live in their own subdirectory
modeling_dir_output = frontend_data_dir / 'modeling'
modeling_dir_output.mkdir(parents=True, exist_ok=True)

# NOTE: the status glyphs below were stored mis-encoded (UTF-8 bytes decoded
# as cp1252); they are written here as the intended check/warning characters.

# Export predictions
predictions_2025.to_csv(modeling_dir_output / 'predictions_2025.csv', index=False)
print(f"✓ Exported: {modeling_dir_output / 'predictions_2025.csv'}")

# Export summary statistics as JSON (explicit encoding so the file does not
# depend on the platform default)
with open(modeling_dir_output / 'summary_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2)
print(f"✓ Exported: {modeling_dir_output / 'summary_statistics.json'}")

# Create visualization data for charts
viz_data = {
    'cluster_sizes': enriched_data.groupby(['Cluster', 'Cluster_Label']).size().reset_index(name='count').to_dict('records'),
    'expenditure_by_cluster': enriched_data.groupby('Cluster').agg({
        'Pengeluaran_Buah': 'mean',
        'Pengeluaran_Sayur': 'mean'
    }).reset_index().to_dict('records')
}

# Only add predictions summary if predictions exist
if len(predictions_2025) > 0 and 'Cluster' in predictions_2025.columns:
    viz_data['predictions_summary'] = predictions_2025.groupby('Cluster').agg({
        'Predicted_Buah_2025': 'mean',
        'Predicted_Sayur_2025': 'mean',
        'Growth_Rate_Buah': 'mean',
        'Growth_Rate_Sayur': 'mean'
    }).reset_index().to_dict('records')
else:
    viz_data['predictions_summary'] = []
    print("⚠ Warning: No predictions data available (need multi-year data for predictions)")

with open(modeling_dir_output / 'visualization_data.json', 'w', encoding='utf-8') as f:
    json.dump(viz_data, f, indent=2)
print(f"✓ Exported: {modeling_dir_output / 'visualization_data.json'}")

print("\n" + "="*60)
print("DATA EXPORT COMPLETE")
print("="*60)
print(f"Files exported to: {frontend_data_dir.absolute()}")

✓ Exported: frontend\public\data\modeling\predictions_2025.csv
✓ Exported: frontend\public\data\modeling\summary_statistics.json
✓ Exported: frontend\public\data\modeling\visualization_data.json

DATA EXPORT COMPLETE
Files exported to: d:\Perkuliahan\2025-2026\Analisa Big Data\ProjectABD\frontend\public\data


## 7. Flask API Application

Create REST API endpoints untuk frontend consumption.

In [8]:
# Initialize the Flask app; the route handlers defined below are registered on it
# and serve the in-memory tables prepared in the earlier cells.
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# ============================================
# API ENDPOINTS
# ============================================

@app.route('/api/health', methods=['GET'])
def health_check():
    """Report that the service is up, with the current timestamp and API version."""
    payload = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'version': '1.0.0',
    }
    return jsonify(payload)

@app.route('/api/clusters', methods=['GET'])
def get_all_clusters():
    """
    Return the enriched clustering rows, optionally filtered.

    Query params:
    - year: keep rows whose 'Tahun' equals this integer (e.g. 2023, 2024)
    - cluster: keep rows whose 'Cluster' equals this integer id
    - region: keep rows whose 'Region' equals this value exactly

    Any exception (including a non-integer year/cluster) is reported as a
    JSON error with HTTP status 500.
    """
    try:
        params = request.args
        selected = enriched_data.copy()

        # Narrow the frame step by step, one filter per supplied parameter
        if 'year' in params:
            selected = selected[selected['Tahun'] == int(params.get('year'))]

        if 'cluster' in params:
            selected = selected[selected['Cluster'] == int(params.get('cluster'))]

        if 'region' in params and 'Region' in selected.columns:
            selected = selected[selected['Region'] == params.get('region')]

        # List of plain dicts, one per row, for JSON serialisation
        records = selected.to_dict('records')
        return jsonify({'success': True, 'count': len(records), 'data': records})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500

@app.route('/api/clusters/<int:cluster_id>', methods=['GET'])
def get_cluster_by_id(cluster_id):
    """
    Return the member rows, profile and centroid of a single cluster.

    'profile' and 'centroid' are the first matching row of the respective
    table, or an empty object when the cluster id is not present there.
    Any exception is reported as a JSON error with HTTP status 500.
    """
    def first_match(table):
        # First row of `table` belonging to this cluster, as a dict ({} if none)
        rows = table[table['Cluster'] == cluster_id].to_dict('records')
        return rows[0] if rows else {}

    try:
        members = enriched_data[enriched_data['Cluster'] == cluster_id]
        payload = {
            'success': True,
            'cluster_id': cluster_id,
            'profile': first_match(cluster_profiles),
            'centroid': first_match(cluster_centroids),
            'data': members.to_dict('records'),
            'count': len(members),
        }
        return jsonify(payload)
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500

@app.route('/api/statistics', methods=['GET'])
def get_statistics():
    """Return the precomputed ``summary_stats`` dictionary."""
    try:
        payload = {'success': True, 'data': summary_stats}
        return jsonify(payload)
    except Exception as exc:
        # e.g. a value in summary_stats that cannot be serialised to JSON
        failure = {'success': False, 'error': str(exc)}
        return jsonify(failure), 500

@app.route('/api/predictions', methods=['GET'])
def get_predictions():
    """
    Get predictions for 2025.

    Query params:
    - cluster: Filter by cluster ID
    - kabupaten: Case-insensitive substring search on the kabupaten name

    When no predictions exist (``predictions_2025`` is empty and may have no
    columns at all) the filters are skipped and an empty list is returned
    instead of failing with a KeyError.
    """
    try:
        df = predictions_2025.copy()

        # Column checks guard against an empty, column-less predictions frame
        if 'cluster' in request.args and 'Cluster' in df.columns:
            cluster = int(request.args.get('cluster'))
            df = df[df['Cluster'] == cluster]

        if 'kabupaten' in request.args and 'Kabupaten_Kota' in df.columns:
            kabupaten = request.args.get('kabupaten').lower()
            # regex=False: the user text is a literal substring, not a pattern
            # (an input such as "(" would otherwise raise); na=False keeps the
            # mask boolean when a name is missing.
            df = df[df['Kabupaten_Kota'].str.lower().str.contains(kabupaten, regex=False, na=False)]

        return jsonify({
            'success': True,
            'count': len(df),
            'data': df.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/regions', methods=['GET'])
def get_regions():
    """
    Get unique regions and their statistics.

    For every region the response holds the number of rows ('Count') and the
    mean fruit/vegetable expenditure ('Avg_Buah', 'Avg_Sayur'). Returns 404
    when the data has no 'Region' column.
    """
    try:
        if 'Region' not in enriched_data.columns:
            return jsonify({
                'success': False,
                'error': 'Region data not available'
            }), 404

        # Aggregate whichever expenditure columns exist. The previous version
        # always named Total_Buah/Total_Sayur in the agg spec, which raises a
        # KeyError when only the Pengeluaran_* columns are present.
        buah_col = 'Total_Buah' if 'Total_Buah' in enriched_data.columns else 'Pengeluaran_Buah'
        sayur_col = 'Total_Sayur' if 'Total_Sayur' in enriched_data.columns else 'Pengeluaran_Sayur'

        regions = enriched_data.groupby('Region').agg({
            'Kabupaten_Kota': 'count',
            buah_col: 'mean',
            sayur_col: 'mean'
        }).reset_index()

        regions.columns = ['Region', 'Count', 'Avg_Buah', 'Avg_Sayur']

        return jsonify({
            'success': True,
            'data': regions.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/search', methods=['GET'])
def search_kabupaten():
    """
    Search kabupaten/kota by name (case-insensitive substring match).

    Query params:
    - q: Search query (required; a missing or empty value returns 400)
    """
    try:
        query = request.args.get('q', '').lower()

        if not query:
            return jsonify({
                'success': False,
                'error': 'Query parameter q is required'
            }), 400

        # regex=False: treat the user text literally, so inputs such as "(" or
        # "a.b" neither raise nor match unintended names; na=False keeps the
        # mask boolean when a name is missing.
        results = enriched_data[
            enriched_data['Kabupaten_Kota'].str.lower().str.contains(query, regex=False, na=False)
        ]

        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'data': results.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/visualization', methods=['GET'])
def get_visualization_data():
    """Return the precomputed ``viz_data`` dictionary used for charts."""
    try:
        payload = {'success': True, 'data': viz_data}
        return jsonify(payload)
    except Exception as exc:
        failure = {'success': False, 'error': str(exc)}
        return jsonify(failure), 500

# The check mark was stored mis-encoded (UTF-8 bytes decoded as cp1252); use the real glyph
print("✓ Flask API initialized successfully!")
print("\nAvailable endpoints:")
# One line per route registered above
for endpoint in (
    "/api/health",
    "/api/clusters",
    "/api/clusters/<id>",
    "/api/statistics",
    "/api/predictions",
    "/api/regions",
    "/api/search",
    "/api/visualization",
):
    print(f"  GET  {endpoint}")

✓ Flask API initialized successfully!

Available endpoints:
  GET  /api/health
  GET  /api/clusters
  GET  /api/clusters/<id>
  GET  /api/statistics
  GET  /api/predictions
  GET  /api/regions
  GET  /api/search
  GET  /api/visualization


## 8. Save Flask App to Python File

**Note**: Flask server tidak bisa dijalankan langsung di Jupyter notebook.
Kita akan save Flask app ke file `api.py` yang bisa dijalankan di terminal.

**Cara menjalankan**:

**Option 1 - Jalankan di Jupyter Notebook/VS Code**:
- Jalankan semua cell di notebook ini (cell 1-13)
- File `api.py` akan otomatis dibuat
- Lalu jalankan `api.py` di terminal

**Option 2 - Langsung di terminal**:
```powershell
# Di terminal
python api.py
```

**Option 3 - Production dengan Gunicorn** (hanya untuk Linux/macOS/WSL; Gunicorn tidak berjalan di Windows secara native):
```bash
pip install gunicorn
gunicorn -w 4 -b 0.0.0.0:5000 api:app
```

In [None]:
# Source text of the standalone API module; it is written to api.py further below.
# Fixes compared with the previous text:
# - the literal used to open with four quotes, which put a stray quote on the
#   first line of api.py and made the generated file a syntax error;
# - name searches treat user input as a literal substring (regex=False);
# - the debugger is no longer enabled unconditionally on a server bound to 0.0.0.0;
# - the six copy-pasted JSON loaders share one helper.
flask_code = '''
"""Standalone Flask API serving the clustering results.

Generated by the API notebook. Start it with: python api.py
"""
from flask import Flask, jsonify, request
from flask_cors import CORS
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os
from datetime import datetime
import warnings

from sklearn.linear_model import LinearRegression

warnings.filterwarnings("ignore")

# Load data
DATA_DIR = Path("data/result")
CLEANED_DIR = Path("data/cleaned")
JSON_EXPORT_DIR = Path("data/api_exports")

# Load clustering results
clustering_results = pd.read_csv(DATA_DIR / "clustering_results.csv")
cluster_profiles = pd.read_csv(DATA_DIR / "cluster_profiles.csv")
cluster_centroids = pd.read_csv(DATA_DIR / "cluster_centroids.csv")

# Prepare enriched data with cluster labels
cluster_labels_map = {
    0: "Low Expenditure",
    1: "Balanced Expenditure",
    2: "High Expenditure",
}
enriched_data = clustering_results.copy()
enriched_data["Cluster_Label"] = enriched_data["Cluster"].map(cluster_labels_map)


def _load_json_export(filename, default):
    """Read one optional JSON export; return default when the file is missing."""
    try:
        with open(JSON_EXPORT_DIR / filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return default


# Load JSON exports
all_clusters_json = _load_json_export(
    "all_clusters.json", {"metadata": {}, "data": []}
)
cluster_details_json = _load_json_export(
    "cluster_details.json", {"metadata": {}, "clusters": []}
)
predictions_json = _load_json_export(
    "predictions_full.json", {"metadata": {}, "predictions": []}
)
regional_json = _load_json_export(
    "regional_analysis.json", {"metadata": {}, "regions": []}
)
trends_json = _load_json_export(
    "expenditure_trends.json", {"metadata": {}, "trends": []}
)
api_metadata = _load_json_export(
    "api_metadata.json", {"api_version": "1.0.0", "data_summary": {}}
)

# Generate summary statistics
summary_stats = {
    "overview": {
        "total_kabupaten": int(enriched_data["Kabupaten_Kota"].nunique()),
        "total_clusters": int(enriched_data["Cluster"].nunique()),
        "years_covered": sorted(enriched_data["Tahun"].unique().tolist()),
        "total_data_points": int(len(enriched_data)),
    },
    "cluster_distribution": enriched_data.groupby("Cluster")["Kabupaten_Kota"]
    .count()
    .to_dict(),
    "cluster_labels": enriched_data.groupby("Cluster")["Cluster_Label"]
    .first()
    .to_dict(),
    "regional_distribution": (
        enriched_data["Region"].value_counts().to_dict()
        if "Region" in enriched_data.columns
        else {}
    ),
    "expenditure_summary": {
        "avg_buah": float(enriched_data["Pengeluaran_Buah"].mean()),
        "avg_sayur": float(enriched_data["Pengeluaran_Sayur"].mean()),
        "max_buah": float(enriched_data["Pengeluaran_Buah"].max()),
        "max_sayur": float(enriched_data["Pengeluaran_Sayur"].max()),
        "min_buah": float(enriched_data["Pengeluaran_Buah"].min()),
        "min_sayur": float(enriched_data["Pengeluaran_Sayur"].min()),
    },
    "cluster_profiles": cluster_profiles.to_dict("records"),
    "centroids": cluster_centroids.to_dict("records"),
}

# Generate predictions for 2025 using Linear Regression
predictions_list = []
for kabupaten in enriched_data["Kabupaten_Kota"].unique():
    kab_data = enriched_data[enriched_data["Kabupaten_Kota"] == kabupaten].sort_values(
        "Tahun"
    )

    # At least two years are needed to fit a trend
    if len(kab_data) >= 2:
        years = kab_data["Tahun"].values.reshape(-1, 1)

        # Predict Buah
        buah_values = kab_data["Pengeluaran_Buah"].values
        model_buah = LinearRegression()
        model_buah.fit(years, buah_values)
        pred_buah_2025 = model_buah.predict([[2025]])[0]

        # Predict Sayur
        sayur_values = kab_data["Pengeluaran_Sayur"].values
        model_sayur = LinearRegression()
        model_sayur.fit(years, sayur_values)
        pred_sayur_2025 = model_sayur.predict([[2025]])[0]

        latest = kab_data.iloc[-1]

        predictions_list.append(
            {
                "Kabupaten_Kota": kabupaten,
                "Region": latest.get("Region", "Unknown"),
                "Cluster": int(latest["Cluster"]),
                "Cluster_Label": latest["Cluster_Label"],
                "Predicted_Buah_2025": float(pred_buah_2025),
                "Predicted_Sayur_2025": float(pred_sayur_2025),
                "Predicted_Total_2025": float(pred_buah_2025 + pred_sayur_2025),
                "Current_Buah_2024": float(latest["Pengeluaran_Buah"]),
                "Current_Sayur_2024": float(latest["Pengeluaran_Sayur"]),
                "Growth_Rate_Buah": (
                    float((pred_buah_2025 / latest["Pengeluaran_Buah"] - 1) * 100)
                    if latest["Pengeluaran_Buah"] > 0
                    else 0
                ),
                "Growth_Rate_Sayur": (
                    float((pred_sayur_2025 / latest["Pengeluaran_Sayur"] - 1) * 100)
                    if latest["Pengeluaran_Sayur"] > 0
                    else 0
                ),
            }
        )

predictions_2025 = pd.DataFrame(predictions_list)

# Create visualization data
viz_data = {
    "cluster_sizes": enriched_data.groupby(["Cluster", "Cluster_Label"])
    .size()
    .reset_index(name="count")
    .to_dict("records"),
    "expenditure_by_cluster": enriched_data.groupby("Cluster")
    .agg({"Pengeluaran_Buah": "mean", "Pengeluaran_Sayur": "mean"})
    .reset_index()
    .to_dict("records"),
}

if len(predictions_2025) > 0:
    viz_data["predictions_summary"] = (
        predictions_2025.groupby("Cluster")
        .agg(
            {
                "Predicted_Buah_2025": "mean",
                "Predicted_Sayur_2025": "mean",
                "Growth_Rate_Buah": "mean",
                "Growth_Rate_Sayur": "mean",
            }
        )
        .reset_index()
        .to_dict("records")
    )
else:
    viz_data["predictions_summary"] = []

# Initialize Flask app
app = Flask(__name__)
CORS(app)


@app.route("/api/health", methods=["GET"])
def health_check():
    """Report that the service is up."""
    return jsonify(
        {
            "status": "healthy",
            "timestamp": datetime.now().isoformat(),
            "version": "1.0.0",
        }
    )


@app.route("/api/clusters", methods=["GET"])
def get_all_clusters():
    """Return clustering rows, optionally filtered by year, cluster and region."""
    try:
        df = enriched_data.copy()

        if "year" in request.args:
            year = int(request.args.get("year"))
            df = df[df["Tahun"] == year]

        if "cluster" in request.args:
            cluster = int(request.args.get("cluster"))
            df = df[df["Cluster"] == cluster]

        if "region" in request.args:
            region = request.args.get("region")
            if "Region" in df.columns:
                df = df[df["Region"] == region]

        result = df.to_dict("records")

        return jsonify({"success": True, "count": len(result), "data": result})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/clusters/<int:cluster_id>", methods=["GET"])
def get_cluster_by_id(cluster_id):
    """Return the rows, profile and centroid of one cluster."""
    try:
        cluster_data = enriched_data[enriched_data["Cluster"] == cluster_id]
        profile = cluster_profiles[cluster_profiles["Cluster"] == cluster_id].to_dict(
            "records"
        )
        centroid = cluster_centroids[
            cluster_centroids["Cluster"] == cluster_id
        ].to_dict("records")

        return jsonify(
            {
                "success": True,
                "cluster_id": cluster_id,
                "profile": profile[0] if profile else {},
                "centroid": centroid[0] if centroid else {},
                "data": cluster_data.to_dict("records"),
                "count": len(cluster_data),
            }
        )
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/statistics", methods=["GET"])
def get_statistics():
    """Return the precomputed summary statistics."""
    try:
        return jsonify({"success": True, "data": summary_stats})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/predictions", methods=["GET"])
def get_predictions():
    """Return the 2025 predictions, optionally filtered by cluster and name."""
    try:
        df = predictions_2025.copy()

        # len(df) > 0 guards against an empty frame that has no columns
        if "cluster" in request.args and len(df) > 0:
            cluster = int(request.args.get("cluster"))
            df = df[df["Cluster"] == cluster]

        if "kabupaten" in request.args and len(df) > 0:
            kabupaten = request.args.get("kabupaten").lower()
            # Literal substring match: user input must not be parsed as a regex
            df = df[
                df["Kabupaten_Kota"]
                .str.lower()
                .str.contains(kabupaten, regex=False, na=False)
            ]

        return jsonify(
            {"success": True, "count": len(df), "data": df.to_dict("records")}
        )
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/regions", methods=["GET"])
def get_regions():
    """Return row count and mean expenditure per region."""
    try:
        if "Region" not in enriched_data.columns:
            return (
                jsonify({"success": False, "error": "Region data not available"}),
                404,
            )

        # Use Pengeluaran_Buah and Pengeluaran_Sayur instead of Total_Buah and Total_Sayur
        regions = (
            enriched_data.groupby("Region")
            .agg(
                {
                    "Kabupaten_Kota": "count",
                    "Pengeluaran_Buah": "mean",
                    "Pengeluaran_Sayur": "mean",
                }
            )
            .reset_index()
        )

        regions.columns = ["Region", "Count", "Avg_Buah", "Avg_Sayur"]

        return jsonify({"success": True, "data": regions.to_dict("records")})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/regions/list", methods=["GET"])
def get_regions_list():
    """Get unique list of regions/provinces"""
    try:
        if "Region" not in enriched_data.columns:
            return (
                jsonify({"success": False, "error": "Region data not available"}),
                404,
            )

        # Get unique regions sorted alphabetically
        regions_list = sorted(enriched_data["Region"].dropna().unique().tolist())

        return jsonify(
            {"success": True, "count": len(regions_list), "data": regions_list}
        )
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/search", methods=["GET"])
def search_kabupaten():
    """Search kabupaten/kota by name (query parameter q, required)."""
    try:
        query = request.args.get("q", "").lower()

        if not query:
            return (
                jsonify({"success": False, "error": "Query parameter q is required"}),
                400,
            )

        # Literal substring match: user input must not be parsed as a regex
        results = enriched_data[
            enriched_data["Kabupaten_Kota"]
            .str.lower()
            .str.contains(query, regex=False, na=False)
        ]

        return jsonify(
            {
                "success": True,
                "query": query,
                "count": len(results),
                "data": results.to_dict("records"),
            }
        )
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


@app.route("/api/visualization", methods=["GET"])
def get_visualization_data():
    """Return the precomputed chart data."""
    try:
        return jsonify({"success": True, "data": viz_data})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500


if __name__ == "__main__":
    # The interactive debugger can execute arbitrary code, and this server
    # binds to all interfaces, so debug mode is opt-in via FLASK_DEBUG=1.
    debug_mode = os.environ.get("FLASK_DEBUG", "0").lower() in ("1", "true", "yes")

    print("=" * 60)
    print("STARTING FLASK API SERVER")
    print("=" * 60)
    print("Server Configuration:")
    print("  Host: 0.0.0.0")
    print("  Port: 5000")
    print(f"  Debug: {debug_mode}")
    print("  CORS: Enabled")
    print("API Base URL: http://localhost:5000/api")
    print("Press CTRL+C to stop the server")
    print("=" * 60)

    app.run(host="0.0.0.0", port=5000, debug=debug_mode)

'''
# Write the generated module next to the notebook
with open('api.py', 'w', encoding='utf-8') as f:
    f.write(flask_code)

# The check mark was stored mis-encoded (UTF-8 bytes decoded as cp1252); use the real glyph
print("✓ Flask API saved to api.py")
print("\n" + "="*60)
print("TO RUN THE API SERVER:")
print("="*60)
print("\nOption 1 - Development mode:")
print("  python api.py")
print("\nOption 2 - Production mode (install gunicorn first):")
print("  pip install gunicorn")
print("  gunicorn -w 4 -b 0.0.0.0:5000 api:app")
print("\nOption 3 - Run in background (Windows PowerShell):")
print("  Start-Process python -ArgumentList 'api.py' -WindowStyle Hidden")
print("\n" + "="*60)
print("\nAPI will be available at: http://localhost:5000/api")
print("Test with: http://localhost:5000/api/health")
print("="*60)

✓ Flask API saved to api.py

TO RUN THE API SERVER:

Option 1 - Development mode:
  python api.py

Option 2 - Production mode (install gunicorn first):
  pip install gunicorn
  gunicorn -w 4 -b 0.0.0.0:5000 api:app

Option 3 - Run in background (Windows PowerShell):
  Start-Process python -ArgumentList 'api.py' -WindowStyle Hidden


API will be available at: http://localhost:5000/api
Test with: http://localhost:5000/api/health


## 9. API Testing & Examples

Test API endpoints menggunakan requests library.

In [15]:
import requests

BASE_URL = 'http://localhost:5000/api'
# Seconds to wait for each response. Without a timeout, requests.get() can
# block indefinitely if the server accepts the connection but never replies.
REQUEST_TIMEOUT = 10


def run_endpoint_test(number, label, path, report):
    """Call one GET endpoint and print its status plus a short summary.

    Args:
        number: Ordinal shown in the printed heading.
        label: Endpoint text shown in the heading (e.g. ``/api/health``).
        path: Path and query string appended to ``BASE_URL``.
        report: Callable that receives the decoded JSON body and prints a
            summary of it.

    Any exception (connection failure, timeout, non-JSON body, missing key)
    is caught and printed so the remaining endpoints are still exercised.
    """
    print(f"\n{number}. Testing {label}...")
    try:
        response = requests.get(f'{BASE_URL}{path}', timeout=REQUEST_TIMEOUT)
        print(f"   Status: {response.status_code}")
        report(response.json())
    except Exception as e:
        print(f"   Error: {e}")


def _when_successful(report):
    """Wrap ``report`` so it runs only when the body's ``success`` flag is truthy.

    A falsy flag is reported explicitly instead of being skipped in silence.
    """
    def guarded(body):
        if body['success']:
            report(body)
        else:
            print(f"   Request failed: {body.get('error', 'unknown error')}")
    return guarded


def _report_health(body):
    """Print the complete health-check body."""
    print(f"   Response: {body}")


def _report_statistics(body):
    """Print the kabupaten and cluster totals from the statistics overview."""
    overview = body['data']['overview']
    print(f"   Total Kabupaten: {overview['total_kabupaten']}")
    print(f"   Total Clusters: {overview['total_clusters']}")


def _report_clusters(body):
    """Print the record count and the first record, if any."""
    print(f"   Data count: {body['count']}")
    print(f"   Sample: {body['data'][0] if body['data'] else 'No data'}")


def _report_cluster_detail(body):
    """Print the ID, size and profile of a single cluster."""
    print(f"   Cluster ID: {body['cluster_id']}")
    print(f"   Data count: {body['count']}")
    print(f"   Profile: {body['profile']}")


def _report_predictions(body):
    """Print the prediction count and the first prediction, if any."""
    print(f"   Predictions count: {body['count']}")
    if body['data']:
        sample = body['data'][0]
        print(f"   Sample prediction for {sample['Kabupaten_Kota']}:")
        print(f"     Predicted Buah 2025: Rp {sample['Predicted_Buah_2025']:,.0f}")
        print(f"     Growth Rate: {sample['Growth_Rate_Buah']:.2f}%")


def _report_search(body):
    """Print the number of matches and the first matching kabupaten, if any."""
    print(f"   Results found: {body['count']}")
    if body['data']:
        print(f"   First result: {body['data'][0]['Kabupaten_Kota']}")


# (heading label, path appended to BASE_URL, reporter for the decoded body)
ENDPOINT_TESTS = [
    ('/api/health', '/health', _report_health),
    ('/api/statistics', '/statistics', _when_successful(_report_statistics)),
    ('/api/clusters', '/clusters?year=2024', _when_successful(_report_clusters)),
    ('/api/clusters/0', '/clusters/0', _when_successful(_report_cluster_detail)),
    ('/api/predictions', '/predictions?cluster=0', _when_successful(_report_predictions)),
    ('/api/search?q=jakarta', '/search?q=jakarta', _when_successful(_report_search)),
]

print("="*60)
print("API ENDPOINT TESTING")
print("="*60)

for number, (label, path, report) in enumerate(ENDPOINT_TESTS, start=1):
    run_endpoint_test(number, label, path, report)

print("\n" + "="*60)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✓ API Testing Complete!")
print("="*60)

API ENDPOINT TESTING

1. Testing /api/health...
   Status: 200
   Response: {'status': 'healthy', 'timestamp': '2025-11-29T12:39:05.967392', 'version': '1.0.0'}

2. Testing /api/statistics...
   Status: 200
   Total Kabupaten: 514
   Total Clusters: 3

3. Testing /api/clusters...
   Status: 200
   Data count: 514
   Sample: {'Cluster': 0, 'Cluster_Label': 'Low Expenditure', 'Kabupaten_Kota': 'Aceh Barat', 'Pengeluaran_Buah': 11160.0, 'Pengeluaran_Sayur': 15821.0, 'Region': 'Aceh', 'Tahun': 2024}

4. Testing /api/clusters/0...
   Status: 200
   Cluster ID: 0
   Data count: 209
   Profile: {'Centroid_Buah': 12265.885167464114, 'Centroid_Sayur': 15813.181818181818, 'Cluster': 0, 'Mean_Buah': 12265.885167464116, 'Mean_Sayur': 15813.181818181818, 'Median_Buah': 11793.0, 'Median_Sayur': 15504.0, 'Silhouette_Score': 0.3697304324345739, 'Size': 209, 'Std_Buah': 2167.8291596886147, 'Std_Sayur': 4077.841028923005}

5. Testing /api/predictions...
   Status: 200
   Predictions count: 0

6. Testing

## 10. Integration Guide for Frontend

### React/TypeScript Integration Example

```typescript
// services/apiService.ts
const API_BASE_URL = 'http://localhost:5000/api';

// Builds a query string from the filters that were actually provided.
// Values are compared against `undefined` (not truthiness) so that 0 - a valid
// cluster ID - is kept, and omitted fields are not sent as the text "undefined".
const toQueryString = (filters: Record<string, number | undefined> = {}): string => {
  const params = new URLSearchParams();
  for (const [key, value] of Object.entries(filters)) {
    if (value !== undefined) {
      params.set(key, String(value));
    }
  }
  const query = params.toString();
  return query ? `?${query}` : '';
};

export const apiService = {
  // Get all clusters, optionally filtered by year and/or cluster ID
  getClusters: async (filters?: { year?: number; cluster?: number }) => {
    const response = await fetch(`${API_BASE_URL}/clusters${toQueryString(filters)}`);
    return response.json();
  },

  // Get specific cluster
  getClusterById: async (id: number) => {
    const response = await fetch(`${API_BASE_URL}/clusters/${id}`);
    return response.json();
  },

  // Get statistics
  getStatistics: async () => {
    const response = await fetch(`${API_BASE_URL}/statistics`);
    return response.json();
  },

  // Get predictions, optionally restricted to one cluster (cluster 0 included)
  getPredictions: async (cluster?: number) => {
    const response = await fetch(`${API_BASE_URL}/predictions${toQueryString({ cluster })}`);
    return response.json();
  },

  // Search kabupaten
  searchKabupaten: async (query: string) => {
    const response = await fetch(`${API_BASE_URL}/search?q=${encodeURIComponent(query)}`);
    return response.json();
  }
};
```

### Usage in React Components

```typescript
// components/ClusterDashboard.tsx
import { useEffect, useState } from 'react';
import { apiService } from '../services/apiService';

// The part of the /api/statistics payload that this component reads.
interface StatisticsData {
  overview: {
    total_kabupaten: number;
    total_clusters: number;
  };
}

function ClusterDashboard() {
  // Explicit type argument: a bare useState(null) infers the state type as
  // `null`, which makes every later property access a compile error.
  const [statistics, setStatistics] = useState<StatisticsData | null>(null);
  const [loading, setLoading] = useState(true);

  useEffect(() => {
    const fetchData = async () => {
      try {
        const stats = await apiService.getStatistics();
        if (stats.success) {
          setStatistics(stats.data);
        }
      } catch (error) {
        console.error('Error fetching statistics:', error);
      } finally {
        setLoading(false);
      }
    };

    fetchData();
  }, []);

  if (loading) return <div>Loading...</div>;

  // Covers both a thrown request and a `success: false` response, so the
  // dashboard never renders with blank numbers.
  if (!statistics) return <div>Failed to load statistics.</div>;

  return (
    <div>
      <h1>Cluster Analysis Dashboard</h1>
      <p>Total Kabupaten: {statistics.overview.total_kabupaten}</p>
      <p>Total Clusters: {statistics.overview.total_clusters}</p>
    </div>
  );
}
```

## 11. Batch Export All JSON Files

Program untuk mengekspor semua data ke format JSON yang dapat dikonsumsi oleh API atau aplikasi lain.

**File yang akan dibuat**:
1. `all_clusters.json` - Semua data clustering
2. `cluster_details.json` - Detail per cluster dengan profil dan centroid
3. `predictions_full.json` - Prediksi lengkap untuk 2025
4. `regional_analysis.json` - Analisis per region
5. `expenditure_trends.json` - Tren pengeluaran dari 2023-2024
6. `api_metadata.json` - Metadata untuk dokumentasi API

In [12]:
import json
from pathlib import Path
from datetime import datetime

# Destination folder for every JSON file written by this cell. It is created
# together with any missing parents, and an existing folder is reused.
json_export_dir = Path('data') / 'api_exports'
json_export_dir.mkdir(parents=True, exist_ok=True)

print("\n".join(["=" * 60, "BATCH JSON EXPORT - STARTED", "=" * 60]))
print("\nExport directory: " + str(json_export_dir.absolute()) + "\n")

# ============================================
# 1. ALL CLUSTERS DATA
# ============================================
# Every row of enriched_data, preceded by a metadata header describing its
# extent (row count, distinct kabupaten, years and cluster IDs present).
print("1. Exporting all_clusters.json...")
all_clusters_data = {
    'metadata': {
        'total_records': len(enriched_data),
        'total_kabupaten': enriched_data['Kabupaten_Kota'].nunique(),
        'years': sorted(enriched_data['Tahun'].unique().tolist()),
        'clusters': sorted(enriched_data['Cluster'].unique().tolist()),
        'generated_at': datetime.now().isoformat()
    },
    'data': enriched_data.to_dict('records')
}

# ensure_ascii=False keeps non-ASCII names readable instead of \u-escaping them.
with open(json_export_dir / 'all_clusters.json', 'w', encoding='utf-8') as f:
    json.dump(all_clusters_data, f, indent=2, ensure_ascii=False)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print(f"   ✓ Exported: {len(all_clusters_data['data'])} records")

# ============================================
# 2. CLUSTER DETAILS (per cluster with profile & centroid)
# ============================================
print("\n2. Exporting cluster_details.json...")


def _finite_float(value):
    """Return ``value`` as a float, or None when it is NaN or infinite.

    pandas returns NaN for e.g. the standard deviation of a single-row
    cluster. ``json.dump`` would write that as the bare token ``NaN``, which
    is not valid JSON and makes ``JSON.parse`` in a browser fail; None is
    written as ``null`` instead.
    """
    number = float(value)
    return number if np.isfinite(number) else None


cluster_details = []
total_rows = len(enriched_data)

for cluster_id in sorted(enriched_data['Cluster'].unique()):
    # Rows belonging to this cluster
    cluster_data = enriched_data[enriched_data['Cluster'] == cluster_id]
    buah = cluster_data['Pengeluaran_Buah']
    sayur = cluster_data['Pengeluaran_Sayur']

    # Matching row of the profile table (empty dict if the cluster is absent)
    profile = cluster_profiles[cluster_profiles['Cluster'] == cluster_id].to_dict('records')
    profile_data = profile[0] if profile else {}

    # Matching row of the centroid table (empty dict if the cluster is absent)
    centroid = cluster_centroids[cluster_centroids['Cluster'] == cluster_id].to_dict('records')
    centroid_data = centroid[0] if centroid else {}

    # Number of rows per region inside this cluster
    regional_dist = {}
    if 'Region' in cluster_data.columns:
        regional_dist = cluster_data['Region'].value_counts().to_dict()

    cluster_details.append({
        'cluster_id': int(cluster_id),
        'cluster_label': cluster_data['Cluster_Label'].iloc[0] if 'Cluster_Label' in cluster_data.columns else f"Cluster {cluster_id}",
        'size': len(cluster_data),
        'percentage': round((len(cluster_data) / total_rows) * 100, 2),
        'profile': profile_data,
        'centroid': centroid_data,
        'regional_distribution': regional_dist,
        'sample_kabupaten': cluster_data['Kabupaten_Kota'].head(10).tolist(),
        'statistics': {
            'avg_buah': _finite_float(buah.mean()),
            'avg_sayur': _finite_float(sayur.mean()),
            'median_buah': _finite_float(buah.median()),
            'median_sayur': _finite_float(sayur.median()),
            'std_buah': _finite_float(buah.std()),
            'std_sayur': _finite_float(sayur.std())
        }
    })

cluster_details_output = {
    'metadata': {
        'total_clusters': len(cluster_details),
        'generated_at': datetime.now().isoformat()
    },
    'clusters': cluster_details
}

with open(json_export_dir / 'cluster_details.json', 'w', encoding='utf-8') as f:
    json.dump(cluster_details_output, f, indent=2, ensure_ascii=False)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print(f"   ✓ Exported: {len(cluster_details)} cluster profiles")

# ============================================
# 3. PREDICTIONS FULL
# ============================================
print("\n3. Exporting predictions_full.json...")
has_predictions = len(predictions_2025) > 0

if has_predictions:
    predictions_output = {
        'metadata': {
            'total_predictions': len(predictions_2025),
            'prediction_year': 2025,
            'base_years': [2023, 2024],
            'generated_at': datetime.now().isoformat()
        },
        'predictions': predictions_2025.to_dict('records'),
        # Mean predicted value and mean growth rate per cluster
        'summary_by_cluster': predictions_2025.groupby('Cluster').agg({
            'Predicted_Buah_2025': 'mean',
            'Predicted_Sayur_2025': 'mean',
            'Growth_Rate_Buah': 'mean',
            'Growth_Rate_Sayur': 'mean'
        }).reset_index().to_dict('records')
    }
else:
    # The warning and check glyphs were stored as mojibake (UTF-8 read as cp1252).
    print("   ⚠ No predictions available (multi-year data required)")
    # Same top-level keys as the populated case, so a consumer can read the
    # file without special-casing the empty result.
    predictions_output = {
        'metadata': {
            'total_predictions': 0,
            'prediction_year': 2025,
            'generated_at': datetime.now().isoformat(),
            'note': 'No predictions available - multi-year data required'
        },
        'predictions': [],
        'summary_by_cluster': []
    }

# Single write path for both cases (previously duplicated in each branch).
with open(json_export_dir / 'predictions_full.json', 'w', encoding='utf-8') as f:
    json.dump(predictions_output, f, indent=2, ensure_ascii=False)

if has_predictions:
    print(f"   ✓ Exported: {len(predictions_2025)} predictions")

# ============================================
# 4. REGIONAL ANALYSIS
# ============================================
print("\n4. Exporting regional_analysis.json...")
regional_analysis = []

if 'Region' in enriched_data.columns:
    # 'total_kabupaten' means distinct kabupaten everywhere else in this cell
    # (nunique), so count distinct names here as well; counting rows would
    # inflate the figure as soon as the frame holds more than one year.
    total_unique_kabupaten = enriched_data['Kabupaten_Kota'].nunique()

    for region in sorted(enriched_data['Region'].unique()):
        region_data = enriched_data[enriched_data['Region'] == region]
        region_kabupaten = region_data['Kabupaten_Kota'].nunique()
        buah = region_data['Pengeluaran_Buah']
        sayur = region_data['Pengeluaran_Sayur']

        # Number of rows per cluster inside this region
        cluster_dist = region_data['Cluster'].value_counts().to_dict()

        regional_analysis.append({
            'region': region,
            'total_kabupaten': int(region_kabupaten),
            'percentage': round((region_kabupaten / total_unique_kabupaten) * 100, 2),
            'cluster_distribution': cluster_dist,
            'expenditure': {
                'avg_buah': float(buah.mean()),
                'avg_sayur': float(sayur.mean()),
                'total_avg': float(buah.mean() + sayur.mean()),
                'max_buah': float(buah.max()),
                'max_sayur': float(sayur.max()),
                'min_buah': float(buah.min()),
                'min_sayur': float(sayur.min())
            },
            # Five rows with the highest fruit expenditure in this region
            'top_kabupaten': region_data.nlargest(5, 'Pengeluaran_Buah')['Kabupaten_Kota'].tolist()
        })

    regional_output = {
        'metadata': {
            'total_regions': len(regional_analysis),
            'generated_at': datetime.now().isoformat()
        },
        'regions': regional_analysis
    }

    with open(json_export_dir / 'regional_analysis.json', 'w', encoding='utf-8') as f:
        json.dump(regional_output, f, indent=2, ensure_ascii=False)
    # The check and warning glyphs were stored as mojibake (UTF-8 read as cp1252).
    print(f"   ✓ Exported: {len(regional_analysis)} regional profiles")
else:
    print("   ⚠ Region data not available")

# ============================================
# 5. EXPENDITURE TRENDS (2023-2024)
# ============================================
print("\n5. Exporting expenditure_trends.json...")
# NOTE(review): the recorded run of this cell exported 0 trend records, which
# suggests enriched_data held a single year at that point - confirm the input
# really contains both 2023 and 2024 rows.


def _percent_growth(current, base):
    """Percentage change from ``base`` to ``current``; 0 when ``base`` is not positive."""
    return ((current - base) / base) * 100 if base > 0 else 0


trends_data = []

# groupby hands each kabupaten its own rows in one pass, instead of filtering
# the whole frame once per kabupaten; sort=False keeps first-appearance order.
for kabupaten, kab_data in enriched_data.groupby('Kabupaten_Kota', sort=False):
    rows_2023 = kab_data[kab_data['Tahun'] == 2023]
    rows_2024 = kab_data[kab_data['Tahun'] == 2024]

    # A trend needs an observation in both years
    if rows_2023.empty or rows_2024.empty:
        continue

    data_2023 = rows_2023.iloc[0]
    data_2024 = rows_2024.iloc[0]

    growth_buah = _percent_growth(data_2024['Pengeluaran_Buah'], data_2023['Pengeluaran_Buah'])
    growth_sayur = _percent_growth(data_2024['Pengeluaran_Sayur'], data_2023['Pengeluaran_Sayur'])

    trends_data.append({
        'kabupaten': kabupaten,
        'region': data_2024.get('Region', 'Unknown'),
        'cluster': int(data_2024['Cluster']),
        'year_2023': {
            'buah': float(data_2023['Pengeluaran_Buah']),
            'sayur': float(data_2023['Pengeluaran_Sayur']),
            'total': float(data_2023['Pengeluaran_Buah'] + data_2023['Pengeluaran_Sayur'])
        },
        'year_2024': {
            'buah': float(data_2024['Pengeluaran_Buah']),
            'sayur': float(data_2024['Pengeluaran_Sayur']),
            'total': float(data_2024['Pengeluaran_Buah'] + data_2024['Pengeluaran_Sayur'])
        },
        'growth': {
            'buah_percent': round(float(growth_buah), 2),
            'sayur_percent': round(float(growth_sayur), 2),
            'buah_absolute': float(data_2024['Pengeluaran_Buah'] - data_2023['Pengeluaran_Buah']),
            'sayur_absolute': float(data_2024['Pengeluaran_Sayur'] - data_2023['Pengeluaran_Sayur'])
        }
    })

trends_output = {
    'metadata': {
        'total_kabupaten': len(trends_data),
        'years': [2023, 2024],
        'generated_at': datetime.now().isoformat()
    },
    'trends': trends_data
}

with open(json_export_dir / 'expenditure_trends.json', 'w', encoding='utf-8') as f:
    json.dump(trends_output, f, indent=2, ensure_ascii=False)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print(f"   ✓ Exported: {len(trends_data)} trend records")

# ============================================
# 6. API METADATA
# ============================================
print("\n6. Exporting api_metadata.json...")


def _size_kb(filename):
    """Size in KB (2 decimals) of a file in the export directory, or 0 if it is absent."""
    path = json_export_dir / filename
    return round(path.stat().st_size / 1024, 2) if path.exists() else 0


api_metadata = {
    'api_version': '1.0.0',
    'generated_at': datetime.now().isoformat(),
    'data_summary': {
        'total_kabupaten': int(enriched_data['Kabupaten_Kota'].nunique()),
        'total_clusters': int(enriched_data['Cluster'].nunique()),
        'total_regions': int(enriched_data['Region'].nunique()) if 'Region' in enriched_data.columns else 0,
        'years_available': sorted(enriched_data['Tahun'].unique().tolist()),
        'total_records': len(enriched_data)
    },
    # One entry per file written by the sections above
    'files': {
        'all_clusters.json': {
            'description': 'Complete clustering dataset with all kabupaten/kota',
            'records': len(enriched_data),
            'size_kb': _size_kb('all_clusters.json')
        },
        'cluster_details.json': {
            'description': 'Detailed profile for each cluster including statistics and samples',
            'clusters': len(cluster_details),
            'size_kb': _size_kb('cluster_details.json')
        },
        'predictions_full.json': {
            'description': 'Predictions for 2025 based on 2023-2024 trends',
            'predictions': len(predictions_2025),
            'size_kb': _size_kb('predictions_full.json')
        },
        'regional_analysis.json': {
            'description': 'Analysis grouped by region/province',
            # regional_analysis stays an empty list when there is no Region column
            'regions': len(regional_analysis),
            'size_kb': _size_kb('regional_analysis.json')
        },
        'expenditure_trends.json': {
            'description': 'Year-over-year expenditure trends (2023-2024)',
            'records': len(trends_data),
            'size_kb': _size_kb('expenditure_trends.json')
        }
    },
    'endpoints': [
        {'path': '/api/health', 'method': 'GET', 'description': 'Health check'},
        {'path': '/api/clusters', 'method': 'GET', 'description': 'Get all clusters with filters'},
        {'path': '/api/clusters/<id>', 'method': 'GET', 'description': 'Get specific cluster details'},
        {'path': '/api/statistics', 'method': 'GET', 'description': 'Get summary statistics'},
        {'path': '/api/predictions', 'method': 'GET', 'description': 'Get 2025 predictions'},
        {'path': '/api/regions', 'method': 'GET', 'description': 'Get regional statistics'},
        {'path': '/api/search', 'method': 'GET', 'description': 'Search kabupaten by name'},
        {'path': '/api/visualization', 'method': 'GET', 'description': 'Get visualization data'}
    ]
}

with open(json_export_dir / 'api_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(api_metadata, f, indent=2, ensure_ascii=False)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("   ✓ Exported: API metadata and documentation")

# ============================================
# SUMMARY
# ============================================
print("\n" + "="*60)
print("BATCH JSON EXPORT - COMPLETED")
print("="*60)
print(f"\nExport Directory: {json_export_dir.absolute()}")
print("\nFiles created:")
print(f"  1. all_clusters.json         - {len(enriched_data)} records")
print(f"  2. cluster_details.json      - {len(cluster_details)} clusters")
print(f"  3. predictions_full.json     - {len(predictions_2025)} predictions")
print(f"  4. regional_analysis.json    - {len(regional_analysis) if 'Region' in enriched_data.columns else 0} regions")
print(f"  5. expenditure_trends.json   - {len(trends_data)} trends")
print("  6. api_metadata.json         - API documentation")

# Combined size of every .json file currently in the export directory
total_size = sum(path.stat().st_size for path in json_export_dir.glob('*.json'))
print(f"\nTotal size: {total_size / 1024:.2f} KB ({total_size / (1024*1024):.2f} MB)")
print("="*60)
# The check marks were stored as mojibake (UTF-8 bytes decoded as cp1252).
print("\n✓ All JSON files ready for API consumption!")
print("✓ Files can be served statically or loaded by backend API")
print("="*60)

BATCH JSON EXPORT - STARTED

Export directory: d:\Perkuliahan\2025-2026\Analisa Big Data\ProjectABD\data\api_exports

1. Exporting all_clusters.json...
   âœ“ Exported: 514 records

2. Exporting cluster_details.json...
   âœ“ Exported: 3 cluster profiles

3. Exporting predictions_full.json...
   âš  No predictions available (multi-year data required)

4. Exporting regional_analysis.json...
   âœ“ Exported: 35 regional profiles

5. Exporting expenditure_trends.json...
   âœ“ Exported: 0 trend records

6. Exporting api_metadata.json...
   âœ“ Exported: API metadata and documentation

BATCH JSON EXPORT - COMPLETED

Export Directory: d:\Perkuliahan\2025-2026\Analisa Big Data\ProjectABD\data\api_exports

Files created:
  1. all_clusters.json         - 514 records
  2. cluster_details.json      - 3 clusters
  3. predictions_full.json     - 0 predictions
  4. regional_analysis.json    - 35 regions
  5. expenditure_trends.json   - 0 trends
  6. api_metadata.json         - API documentation

To

## 12. Verifikasi File JSON

Membaca dan memverifikasi semua file JSON yang telah dibuat untuk memastikan struktur data benar.

In [13]:
# Verify all JSON files
json_files = list(json_export_dir.glob('*.json'))

# metadata key -> label printed when that key is present
METADATA_COUNT_LABELS = [
    ('total_records', 'Records'),
    ('total_clusters', 'Clusters'),
    ('total_predictions', 'Predictions'),
    ('total_regions', 'Regions'),
]

# top-level list key -> noun for the "Sample ... keys" line; the first
# non-empty list in this order is the one that gets sampled
SAMPLE_SECTIONS = [
    ('data', 'record'),
    ('clusters', 'cluster'),
    ('regions', 'region'),
    ('predictions', 'prediction'),
    ('trends', 'trend'),
]


def _reject_non_standard_constant(token):
    """Make json.load fail on NaN / Infinity / -Infinity.

    Python's parser accepts these tokens by default, but they are not part of
    the JSON standard and strict parsers (e.g. JSON.parse in a browser)
    reject them, so a file containing one must not be reported as valid.
    """
    raise ValueError(f"non-standard constant {token} is not valid in strict JSON")


print("="*60)
print("JSON FILES VERIFICATION")
print("="*60)

# The glyphs below (page, rule, check, cross) were stored as mojibake.
for json_file in sorted(json_files):
    print(f"\n📄 {json_file.name}")
    print("─" * 60)

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f, parse_constant=_reject_non_standard_constant)

        # Get file size
        size_kb = json_file.stat().st_size / 1024

        print(f"   Size: {size_kb:.2f} KB")
        print("   Valid JSON: ✓")

        # Show whichever record counts the metadata header carries
        if 'metadata' in data:
            print("   Metadata: ✓")
            for key, label in METADATA_COUNT_LABELS:
                if key in data['metadata']:
                    print(f"   {label}: {data['metadata'][key]}")

        # Count top-level keys
        print(f"   Top-level keys: {list(data.keys())}")

        # Sample the first record of the first non-empty list section
        for key, noun in SAMPLE_SECTIONS:
            if key in data and len(data[key]) > 0:
                print(f"   Sample {noun} keys: {list(data[key][0].keys())[:5]}...")
                break

    except ValueError as e:
        # json.JSONDecodeError subclasses ValueError, so this covers syntax
        # errors as well as the non-standard constants rejected above.
        print(f"   ✗ Invalid JSON: {e}")
    except Exception as e:
        print(f"   ✗ Error: {e}")

print("\n" + "="*60)
print("✓ Verification complete!")
print("="*60)

JSON FILES VERIFICATION

ðŸ“„ all_clusters.json
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
   Size: 149.11 KB
   Valid JSON: âœ“
   Metadata: âœ“
   Records: 514
   Top-level keys: ['metadata', 'data']
   Sample record keys: ['Kabupaten_Kota', 'Tahun', 'Region', 'Pengeluaran_Buah', 'Pengeluaran_Sayur']...

ðŸ“„ api_metadata.json
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
   Size: 2.01 KB
   Valid JSON: âœ“
   Top-level keys: ['api_version', 'generated_at', 'data_summary', 'files', 'endpoints']

ðŸ“„ cluster_details.json
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
   Size: 5.72 KB
