## 1. Import Libraries

## 1. Setup & Import Libraries

In [11]:
# Install required packages
# !pip install flask flask-cors pandas numpy scikit-learn python-dotenv -q

In [12]:
from flask import Flask, jsonify, request
from flask_cors import CORS
import pandas as pd
import numpy as np
from pathlib import Path
import json
from datetime import datetime
from sklearn.linear_model import LinearRegression

## 2. Load Data from Modeling Results

In [13]:
# Define paths
DATA_DIR = Path('data/result')
CLEANED_DIR = Path('data/cleaned')

# Load clustering results
clustering_results = pd.read_csv(DATA_DIR / 'clustering_results.csv')
cluster_profiles = pd.read_csv(DATA_DIR / 'cluster_profiles.csv')
cluster_centroids = pd.read_csv(DATA_DIR / 'cluster_centroids.csv')

# Load cleaned data for additional features
data_integrated = pd.read_csv(CLEANED_DIR / 'data_integrated_wide.csv')

print("="*60)
print("DATA LOADED SUCCESSFULLY")
print("="*60)
print(f"Clustering Results: {clustering_results.shape}")
print(f"Cluster Profiles: {cluster_profiles.shape}")
print(f"Cluster Centroids: {cluster_centroids.shape}")
print(f"Integrated Data: {data_integrated.shape}")
print("\nClustering Results columns:")
print(clustering_results.columns.tolist())
print("\nSample data:")
clustering_results.head()

DATA LOADED SUCCESSFULLY
Clustering Results: (514, 6)
Cluster Profiles: (3, 11)
Cluster Centroids: (3, 3)
Integrated Data: (1028, 47)

Clustering Results columns:
['Kabupaten_Kota', 'Tahun', 'Region', 'Pengeluaran_Buah', 'Pengeluaran_Sayur', 'Cluster']

Sample data:


Unnamed: 0,Kabupaten_Kota,Tahun,Region,Pengeluaran_Buah,Pengeluaran_Sayur,Cluster
0,Aceh Barat,2024,Aceh,11160.0,15821.0,0
1,Aceh Barat Daya,2024,Aceh,7231.0,13790.0,1
2,Aceh Besar,2024,Aceh,6689.0,14052.0,1
3,Aceh Jaya,2024,Aceh,8789.0,14197.0,1
4,Aceh Selatan,2024,Aceh,5682.0,14771.0,1


## 3. Data Processing & Preparation

Prepare data untuk API responses dengan enrichment dan transformations.

In [14]:
def prepare_cluster_data():
    """
    Prepare enriched cluster data dengan tambahan informasi:
    - Region mapping
    - Cluster labels yang lebih deskriptif
    - Category classification
    """

    # Add cluster labels berdasarkan profil
    cluster_labels = {
        0: "Low Expenditure",
        1: "Balanced Expenditure",
        2: "High Expenditure",
    }

    # Enrich clustering results
    df = clustering_results.copy()
    df['Cluster_Label'] = df['Cluster'].map(cluster_labels)

    # Add category based on expenditure levels
    df['Cluster_Category'] = df['Cluster_Label']

    return df

# Process data
enriched_data = prepare_cluster_data()

print("✓ Data preparation complete!")
print(f"\nEnriched data shape: {enriched_data.shape}")
print("\nNew columns added:")
print([col for col in enriched_data.columns if col not in clustering_results.columns])
print("\nSample enriched data:")
enriched_data.head()

✓ Data preparation complete!

Enriched data shape: (514, 8)

New columns added:
['Cluster_Label', 'Cluster_Category']

Sample enriched data:


Unnamed: 0,Kabupaten_Kota,Tahun,Region,Pengeluaran_Buah,Pengeluaran_Sayur,Cluster,Cluster_Label,Cluster_Category
0,Aceh Barat,2024,Aceh,11160.0,15821.0,0,Low Expenditure,Low Expenditure
1,Aceh Barat Daya,2024,Aceh,7231.0,13790.0,1,Balanced Expenditure,Balanced Expenditure
2,Aceh Besar,2024,Aceh,6689.0,14052.0,1,Balanced Expenditure,Balanced Expenditure
3,Aceh Jaya,2024,Aceh,8789.0,14197.0,1,Balanced Expenditure,Balanced Expenditure
4,Aceh Selatan,2024,Aceh,5682.0,14771.0,1,Balanced Expenditure,Balanced Expenditure


## 4. Generate Predictions for 2025

Menggunakan Linear Regression untuk prediksi tren pengeluaran tahun 2025.

In [15]:
def generate_predictions_2025():
    """
    Generate predictions untuk pengeluaran 2025 berdasarkan tren 2023-2024
    """
    predictions = []
    
    # Group by Kabupaten_Kota untuk time series
    for kabupaten in enriched_data['Kabupaten_Kota'].unique():
        kab_data = enriched_data[enriched_data['Kabupaten_Kota'] == kabupaten].sort_values('Tahun')
        
        if len(kab_data) >= 2:  # Need at least 2 years for prediction
            years = kab_data['Tahun'].values.reshape(-1, 1)
            
            # Predict Buah
            if 'Total_Buah' in kab_data.columns:
                buah_values = kab_data['Total_Buah'].values
                model_buah = LinearRegression()
                model_buah.fit(years, buah_values)
                pred_buah_2025 = model_buah.predict([[2025]])[0]
            else:
                pred_buah_2025 = kab_data['Pengeluaran_Buah'].iloc[-1] * 1.05  # 5% growth assumption
            
            # Predict Sayur
            if 'Total_Sayur' in kab_data.columns:
                sayur_values = kab_data['Total_Sayur'].values
                model_sayur = LinearRegression()
                model_sayur.fit(years, sayur_values)
                pred_sayur_2025 = model_sayur.predict([[2025]])[0]
            else:
                pred_sayur_2025 = kab_data['Pengeluaran_Sayur'].iloc[-1] * 1.05
            
            # Get latest cluster info
            latest = kab_data.iloc[-1]
            
            predictions.append({
                'Kabupaten_Kota': kabupaten,
                'Region': latest.get('Region', 'Unknown'),
                'Cluster': int(latest['Cluster']),
                'Cluster_Label': latest['Cluster_Label'],
                'Predicted_Buah_2025': float(pred_buah_2025),
                'Predicted_Sayur_2025': float(pred_sayur_2025),
                'Predicted_Total_2025': float(pred_buah_2025 + pred_sayur_2025),
                'Current_Buah_2024': float(latest.get('Total_Buah', latest.get('Pengeluaran_Buah', 0))),
                'Current_Sayur_2024': float(latest.get('Total_Sayur', latest.get('Pengeluaran_Sayur', 0))),
                'Growth_Rate_Buah': float((pred_buah_2025 / latest.get('Total_Buah', latest.get('Pengeluaran_Buah', 1)) - 1) * 100),
                'Growth_Rate_Sayur': float((pred_sayur_2025 / latest.get('Total_Sayur', latest.get('Pengeluaran_Sayur', 1)) - 1) * 100)
            })
    
    predictions_df = pd.DataFrame(predictions)
    return predictions_df

# Generate predictions
predictions_2025 = generate_predictions_2025()

print("✓ Predictions generated for 2025!")
print(f"\nTotal predictions: {len(predictions_2025)}")
print("\nSample predictions:")
predictions_2025.head(10)

✓ Predictions generated for 2025!

Total predictions: 0

Sample predictions:


## 5. Create Summary Statistics

In [16]:
def generate_summary_statistics():
    """
    Generate comprehensive summary statistics untuk dashboard
    """
    
    stats = {
        'overview': {
            'total_kabupaten': int(enriched_data['Kabupaten_Kota'].nunique()),
            'total_clusters': int(enriched_data['Cluster'].nunique()),
            'years_covered': sorted(enriched_data['Tahun'].unique().tolist()),
            'total_data_points': int(len(enriched_data))
        },
        'cluster_distribution': enriched_data.groupby('Cluster')['Kabupaten_Kota'].count().to_dict(),
        'cluster_labels': enriched_data.groupby('Cluster')['Cluster_Label'].first().to_dict(),
        'regional_distribution': enriched_data['Region'].value_counts().to_dict() if 'Region' in enriched_data.columns else {},
        'expenditure_summary': {
            'avg_buah': float(enriched_data.get('Total_Buah', enriched_data.get('Pengeluaran_Buah', pd.Series([0]))).mean()),
            'avg_sayur': float(enriched_data.get('Total_Sayur', enriched_data.get('Pengeluaran_Sayur', pd.Series([0]))).mean()),
            'max_buah': float(enriched_data.get('Total_Buah', enriched_data.get('Pengeluaran_Buah', pd.Series([0]))).max()),
            'max_sayur': float(enriched_data.get('Total_Sayur', enriched_data.get('Pengeluaran_Sayur', pd.Series([0]))).max()),
            'min_buah': float(enriched_data.get('Total_Buah', enriched_data.get('Pengeluaran_Buah', pd.Series([0]))).min()),
            'min_sayur': float(enriched_data.get('Total_Sayur', enriched_data.get('Pengeluaran_Sayur', pd.Series([0]))).min())
        },
        'cluster_profiles': cluster_profiles.to_dict('records'),
        'centroids': cluster_centroids.to_dict('records')
    }
    
    return stats

# Generate statistics
summary_stats = generate_summary_statistics()

print("✓ Summary statistics generated!")
print("\nOverview:")
print(json.dumps(summary_stats['overview'], indent=2))
print("\nCluster Distribution:")
print(json.dumps(summary_stats['cluster_distribution'], indent=2))

✓ Summary statistics generated!

Overview:
{
  "total_kabupaten": 514,
  "total_clusters": 3,
  "years_covered": [
    2024
  ],
  "total_data_points": 514
}

Cluster Distribution:
{
  "0": 209,
  "1": 298,
  "2": 7
}


## 6. Export Data for Frontend

Export processed data ke format yang mudah dikonsumsi oleh frontend.

In [18]:
# Create output directory
frontend_data_dir = Path('frontend/public/data')
frontend_data_dir.mkdir(parents=True, exist_ok=True)

# Export predictions
predictions_2025.to_csv(frontend_data_dir / 'modeling/predictions_2025.csv', index=False)
print(f"✓ Exported: {frontend_data_dir / 'modeling/predictions_2025.csv'}")

# Export summary statistics as JSON
with open(frontend_data_dir / 'modeling/summary_statistics.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)
print(f"✓ Exported: {frontend_data_dir / 'modeling/summary_statistics.json'}")

# Create visualization data for charts
viz_data = {
    'cluster_sizes': enriched_data.groupby(['Cluster', 'Cluster_Label']).size().reset_index(name='count').to_dict('records'),
    'expenditure_by_cluster': clustering_results.groupby('Cluster').agg({
        'Pengeluaran_Buah': 'mean',
        'Pengeluaran_Sayur': 'mean'
    }).reset_index().to_dict('records'),
}

with open(frontend_data_dir / 'modeling/visualization_data.json', 'w') as f:
    json.dump(viz_data, f, indent=2)
print(f"✓ Exported: {frontend_data_dir / 'modeling/visualization_data.json'}")

print("\n" + "="*60)
print("DATA EXPORT COMPLETE")
print("="*60)
print(f"Files exported to: {frontend_data_dir.absolute()}")

✓ Exported: frontend\public\data\modeling\predictions_2025.csv
✓ Exported: frontend\public\data\modeling\summary_statistics.json
✓ Exported: frontend\public\data\modeling\visualization_data.json

DATA EXPORT COMPLETE
Files exported to: d:\Perkuliahan\2025-2026\Analisa Big Data\ProjectABD\frontend\public\data


## 7. Flask API Application

Create REST API endpoints untuk frontend consumption.

In [None]:
# Initialize Flask app
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# ============================================
# API ENDPOINTS
# ============================================

@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'version': '1.0.0'
    })

@app.route('/api/clusters', methods=['GET'])
def get_all_clusters():
    """
    Get all clustering data
    Query params:
    - year: Filter by year (2023, 2024)
    - cluster: Filter by cluster ID
    - region: Filter by region
    """
    try:
        df = enriched_data.copy()
        
        # Apply filters
        if 'year' in request.args:
            year = int(request.args.get('year'))
            df = df[df['Tahun'] == year]
        
        if 'cluster' in request.args:
            cluster = int(request.args.get('cluster'))
            df = df[df['Cluster'] == cluster]
        
        if 'region' in request.args:
            region = request.args.get('region')
            if 'Region' in df.columns:
                df = df[df['Region'] == region]
        
        # Convert to JSON-friendly format
        result = df.to_dict('records')
        
        return jsonify({
            'success': True,
            'count': len(result),
            'data': result
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/clusters/<int:cluster_id>', methods=['GET'])
def get_cluster_by_id(cluster_id):
    """
    Get specific cluster data and profile
    """
    try:
        # Get cluster data
        cluster_data = enriched_data[enriched_data['Cluster'] == cluster_id]
        
        # Get cluster profile
        profile = cluster_profiles[cluster_profiles['Cluster'] == cluster_id].to_dict('records')
        
        # Get centroid
        centroid = cluster_centroids[cluster_centroids['Cluster'] == cluster_id].to_dict('records')
        
        return jsonify({
            'success': True,
            'cluster_id': cluster_id,
            'profile': profile[0] if profile else {},
            'centroid': centroid[0] if centroid else {},
            'data': cluster_data.to_dict('records'),
            'count': len(cluster_data)
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/statistics', methods=['GET'])
def get_statistics():
    """
    Get summary statistics
    """
    try:
        return jsonify({
            'success': True,
            'data': summary_stats
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/predictions', methods=['GET'])
def get_predictions():
    """
    Get predictions for 2025
    Query params:
    - cluster: Filter by cluster ID
    - kabupaten: Search by kabupaten name
    """
    try:
        df = predictions_2025.copy()
        
        if 'cluster' in request.args:
            cluster = int(request.args.get('cluster'))
            df = df[df['Cluster'] == cluster]
        
        if 'kabupaten' in request.args:
            kabupaten = request.args.get('kabupaten').lower()
            df = df[df['Kabupaten_Kota'].str.lower().str.contains(kabupaten)]
        
        return jsonify({
            'success': True,
            'count': len(df),
            'data': df.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/regions', methods=['GET'])
def get_regions():
    """
    Get unique regions and their statistics
    """
    try:
        if 'Region' not in enriched_data.columns:
            return jsonify({
                'success': False,
                'error': 'Region data not available'
            }), 404
        
        regions = enriched_data.groupby('Region').agg({
            'Kabupaten_Kota': 'count',
            'Total_Buah': 'mean' if 'Total_Buah' in enriched_data.columns else lambda x: 0,
            'Total_Sayur': 'mean' if 'Total_Sayur' in enriched_data.columns else lambda x: 0
        }).reset_index()
        
        regions.columns = ['Region', 'Count', 'Avg_Buah', 'Avg_Sayur']
        
        return jsonify({
            'success': True,
            'data': regions.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/search', methods=['GET'])
def search_kabupaten():
    """
    Search kabupaten/kota by name
    Query params:
    - q: Search query
    """
    try:
        query = request.args.get('q', '').lower()
        
        if not query:
            return jsonify({
                'success': False,
                'error': 'Query parameter q is required'
            }), 400
        
        results = enriched_data[
            enriched_data['Kabupaten_Kota'].str.lower().str.contains(query)
        ]
        
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'data': results.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/visualization', methods=['GET'])
def get_visualization_data():
    """
    Get pre-processed data for visualizations
    """
    try:
        return jsonify({
            'success': True,
            'data': viz_data
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

print("✓ Flask API initialized successfully!")
print("\nAvailable endpoints:")
print("  GET  /api/health")
print("  GET  /api/clusters")
print("  GET  /api/clusters/<id>")
print("  GET  /api/statistics")
print("  GET  /api/predictions")
print("  GET  /api/regions")
print("  GET  /api/search")
print("  GET  /api/visualization")

✓ Flask API initialized successfully!

Available endpoints:
  GET  /api/health
  GET  /api/clusters
  GET  /api/clusters/<id>
  GET  /api/statistics
  GET  /api/predictions
  GET  /api/regions
  GET  /api/search
  GET  /api/visualization


: 

## 8. Save Flask App to Python File

**Note**: Flask server tidak bisa dijalankan langsung di Jupyter notebook.
Kita akan save Flask app ke file `api.py` yang bisa dijalankan di terminal.

**Cara menjalankan**:

**Option 1 - Jalankan di Jupyter Notebook/VS Code**:
- Jalankan semua cell di notebook ini (cell 1-13)
- File `api.py` akan otomatis dibuat
- Lalu jalankan `api.py` di terminal

**Option 2 - Langsung di terminal**:
```powershell
# Di terminal
python api.py
```

**Option 3 - Production dengan Gunicorn**:
```powershell
pip install gunicorn
gunicorn -w 4 -b 0.0.0.0:5000 api:app
```

In [None]:
# Save Flask app to api.py file
flask_code = '''from flask import Flask, jsonify, request
from flask_cors import CORS
import pandas as pd
import numpy as np
from pathlib import Path
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load data
DATA_DIR = Path('data/modeling')
CLEANED_DIR = Path('data/cleaned')

clustering_results = pd.read_csv(DATA_DIR / 'clustering_results.csv')
cluster_profiles = pd.read_csv(DATA_DIR / 'cluster_profiles.csv')
cluster_centroids = pd.read_csv(DATA_DIR / 'cluster_centroids.csv')
data_integrated = pd.read_csv(CLEANED_DIR / 'data_integrated_wide.csv')

# Prepare enriched data
enriched_data = clustering_results.copy()

# Load predictions
predictions_2025 = pd.read_csv(Path('frontend/public/data/modeling/predictions_2025.csv'))

# Load summary stats
with open(Path('frontend/public/data/modeling/summary_statistics.json'), 'r') as f:
    summary_stats = json.load(f)

# Load viz data
with open(Path('frontend/public/data/modeling/visualization_data.json'), 'r') as f:
    viz_data = json.load(f)

# Initialize Flask app
app = Flask(__name__)
CORS(app)

@app.route('/api/health', methods=['GET'])
def health_check():
    return jsonify({
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'version': '1.0.0'
    })

@app.route('/api/clusters', methods=['GET'])
def get_all_clusters():
    try:
        df = enriched_data.copy()
        
        if 'year' in request.args:
            year = int(request.args.get('year'))
            df = df[df['Tahun'] == year]
        
        if 'cluster' in request.args:
            cluster = int(request.args.get('cluster'))
            df = df[df['Cluster'] == cluster]
        
        if 'region' in request.args:
            region = request.args.get('region')
            if 'Region' in df.columns:
                df = df[df['Region'] == region]
        
        result = df.to_dict('records')
        
        return jsonify({
            'success': True,
            'count': len(result),
            'data': result
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/clusters/<int:cluster_id>', methods=['GET'])
def get_cluster_by_id(cluster_id):
    try:
        cluster_data = enriched_data[enriched_data['Cluster'] == cluster_id]
        profile = cluster_profiles[cluster_profiles['Cluster'] == cluster_id].to_dict('records')
        centroid = cluster_centroids[cluster_centroids['Cluster'] == cluster_id].to_dict('records')
        
        return jsonify({
            'success': True,
            'cluster_id': cluster_id,
            'profile': profile[0] if profile else {},
            'centroid': centroid[0] if centroid else {},
            'data': cluster_data.to_dict('records'),
            'count': len(cluster_data)
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/statistics', methods=['GET'])
def get_statistics():
    try:
        return jsonify({
            'success': True,
            'data': summary_stats
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/predictions', methods=['GET'])
def get_predictions():
    try:
        df = predictions_2025.copy()
        
        if 'cluster' in request.args:
            cluster = int(request.args.get('cluster'))
            df = df[df['Cluster'] == cluster]
        
        if 'kabupaten' in request.args:
            kabupaten = request.args.get('kabupaten').lower()
            df = df[df['Kabupaten_Kota'].str.lower().str.contains(kabupaten)]
        
        return jsonify({
            'success': True,
            'count': len(df),
            'data': df.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/regions', methods=['GET'])
def get_regions():
    try:
        if 'Region' not in enriched_data.columns:
            return jsonify({
                'success': False,
                'error': 'Region data not available'
            }), 404
        
        regions = enriched_data.groupby('Region').agg({
            'Kabupaten_Kota': 'count',
            'Total_Buah': 'mean' if 'Total_Buah' in enriched_data.columns else lambda x: 0,
            'Total_Sayur': 'mean' if 'Total_Sayur' in enriched_data.columns else lambda x: 0
        }).reset_index()
        
        regions.columns = ['Region', 'Count', 'Avg_Buah', 'Avg_Sayur']
        
        return jsonify({
            'success': True,
            'data': regions.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/search', methods=['GET'])
def search_kabupaten():
    try:
        query = request.args.get('q', '').lower()
        
        if not query:
            return jsonify({
                'success': False,
                'error': 'Query parameter q is required'
            }), 400
        
        results = enriched_data[
            enriched_data['Kabupaten_Kota'].str.lower().str.contains(query)
        ]
        
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'data': results.to_dict('records')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/visualization', methods=['GET'])
def get_visualization_data():
    try:
        return jsonify({
            'success': True,
            'data': viz_data
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

if __name__ == '__main__':
    print("="*60)
    print("STARTING FLASK API SERVER")
    print("="*60)
    print("\\nServer Configuration:")
    print("  Host: 0.0.0.0")
    print("  Port: 5000")
    print("  Debug: True")
    print("  CORS: Enabled")
    print("\\nAPI Base URL: http://localhost:5000/api")
    print("\\nPress CTRL+C to stop the server")
    print("="*60)
    
    app.run(host='0.0.0.0', port=5000, debug=True)
'''

# Write to file
with open('api.py', 'w', encoding='utf-8') as f:
    f.write(flask_code)

print("✓ Flask API saved to api.py")
print("\n" + "="*60)
print("TO RUN THE API SERVER:")
print("="*60)
print("\nOption 1 - Development mode:")
print("  python api.py")
print("\nOption 2 - Production mode (install gunicorn first):")
print("  pip install gunicorn")
print("  gunicorn -w 4 -b 0.0.0.0:5000 api:app")
print("\nOption 3 - Run in background (Windows PowerShell):")
print("  Start-Process python -ArgumentList 'api.py' -WindowStyle Hidden")
print("\n" + "="*60)
print("\nAPI will be available at: http://localhost:5000/api")
print("Test with: http://localhost:5000/api/health")
print("="*60)

✓ Flask API saved to api.py

TO RUN THE API SERVER:

Option 1 - Development mode:
  python api.py

Option 2 - Production mode (install gunicorn first):
  pip install gunicorn
  gunicorn -w 4 -b 0.0.0.0:5000 api:app

Option 3 - Run in background (Windows PowerShell):
  Start-Process python -ArgumentList 'api.py' -WindowStyle Hidden


API will be available at: http://localhost:5000/api
Test with: http://localhost:5000/api/health


: 

## 9. API Testing & Examples

Test API endpoints menggunakan requests library.

In [19]:
import requests

BASE_URL = 'http://localhost:5000/api'

print("="*60)
print("API ENDPOINT TESTING")
print("="*60)

# Test 1: Health Check
print("\n1. Testing /api/health...")
try:
    response = requests.get(f'{BASE_URL}/health')
    print(f"   Status: {response.status_code}")
    print(f"   Response: {response.json()}")
except Exception as e:
    print(f"   Error: {e}")

# Test 2: Get Statistics
print("\n2. Testing /api/statistics...")
try:
    response = requests.get(f'{BASE_URL}/statistics')
    print(f"   Status: {response.status_code}")
    data = response.json()
    if data['success']:
        print(f"   Total Kabupaten: {data['data']['overview']['total_kabupaten']}")
        print(f"   Total Clusters: {data['data']['overview']['total_clusters']}")
except Exception as e:
    print(f"   Error: {e}")

# Test 3: Get All Clusters
print("\n3. Testing /api/clusters...")
try:
    response = requests.get(f'{BASE_URL}/clusters?year=2024')
    print(f"   Status: {response.status_code}")
    data = response.json()
    if data['success']:
        print(f"   Data count: {data['count']}")
        print(f"   Sample: {data['data'][0] if data['data'] else 'No data'}")
except Exception as e:
    print(f"   Error: {e}")

# Test 4: Get Specific Cluster
print("\n4. Testing /api/clusters/0...")
try:
    response = requests.get(f'{BASE_URL}/clusters/0')
    print(f"   Status: {response.status_code}")
    data = response.json()
    if data['success']:
        print(f"   Cluster ID: {data['cluster_id']}")
        print(f"   Data count: {data['count']}")
        print(f"   Profile: {data['profile']}")
except Exception as e:
    print(f"   Error: {e}")

# Test 5: Get Predictions
print("\n5. Testing /api/predictions...")
try:
    response = requests.get(f'{BASE_URL}/predictions?cluster=0')
    print(f"   Status: {response.status_code}")
    data = response.json()
    if data['success']:
        print(f"   Predictions count: {data['count']}")
        if data['data']:
            sample = data['data'][0]
            print(f"   Sample prediction for {sample['Kabupaten_Kota']}:")
            print(f"     Predicted Buah 2025: Rp {sample['Predicted_Buah_2025']:,.0f}")
            print(f"     Growth Rate: {sample['Growth_Rate_Buah']:.2f}%")
except Exception as e:
    print(f"   Error: {e}")

# Test 6: Search
print("\n6. Testing /api/search?q=jakarta...")
try:
    response = requests.get(f'{BASE_URL}/search?q=jakarta')
    print(f"   Status: {response.status_code}")
    data = response.json()
    if data['success']:
        print(f"   Results found: {data['count']}")
        if data['data']:
            print(f"   First result: {data['data'][0]['Kabupaten_Kota']}")
except Exception as e:
    print(f"   Error: {e}")

print("\n" + "="*60)
print("✓ API Testing Complete!")
print("="*60)

API ENDPOINT TESTING

1. Testing /api/health...
   Error: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/health (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001993A7543D0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

2. Testing /api/statistics...
   Error: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/health (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001993A7543D0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

2. Testing /api/statistics...
   Error: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/statistics (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001993AB33790>: Failed to establish a n

## 10. Integration Guide for Frontend

### React/TypeScript Integration Example

```typescript
// services/apiService.ts
const API_BASE_URL = 'http://localhost:5000/api';

export const apiService = {
  // Get all clusters
  getClusters: async (filters?: { year?: number; cluster?: number }) => {
    const params = new URLSearchParams(filters as any);
    const response = await fetch(`${API_BASE_URL}/clusters?${params}`);
    return response.json();
  },

  // Get specific cluster
  getClusterById: async (id: number) => {
    const response = await fetch(`${API_BASE_URL}/clusters/${id}`);
    return response.json();
  },

  // Get statistics
  getStatistics: async () => {
    const response = await fetch(`${API_BASE_URL}/statistics`);
    return response.json();
  },

  // Get predictions
  getPredictions: async (cluster?: number) => {
    const params = cluster ? `?cluster=${cluster}` : '';
    const response = await fetch(`${API_BASE_URL}/predictions${params}`);
    return response.json();
  },

  // Search kabupaten
  searchKabupaten: async (query: string) => {
    const response = await fetch(`${API_BASE_URL}/search?q=${encodeURIComponent(query)}`);
    return response.json();
  }
};
```

### Usage in React Components

```typescript
// components/ClusterDashboard.tsx
import { useEffect, useState } from 'react';
import { apiService } from '../services/apiService';

function ClusterDashboard() {
  const [statistics, setStatistics] = useState(null);
  const [loading, setLoading] = useState(true);

  useEffect(() => {
    const fetchData = async () => {
      try {
        const stats = await apiService.getStatistics();
        if (stats.success) {
          setStatistics(stats.data);
        }
      } catch (error) {
        console.error('Error fetching statistics:', error);
      } finally {
        setLoading(false);
      }
    };

    fetchData();
  }, []);

  if (loading) return <div>Loading...</div>;

  return (
    <div>
      <h1>Cluster Analysis Dashboard</h1>
      <p>Total Kabupaten: {statistics?.overview.total_kabupaten}</p>
      <p>Total Clusters: {statistics?.overview.total_clusters}</p>
    </div>
  );
}
```

## 11. Deployment Notes

### Production Deployment

#### Option 1: Using Gunicorn (Recommended)
```bash
# Install gunicorn
pip install gunicorn

# Run with gunicorn
gunicorn -w 4 -b 0.0.0.0:5000 api:app
```

#### Option 2: Using Docker
```dockerfile
FROM python:3.9-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 5000

CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:5000", "api:app"]
```

#### Option 3: Deploy to Cloud
- **Heroku**: `git push heroku main`
- **Railway**: Connect GitHub repo
- **Google Cloud Run**: Deploy containerized app
- **AWS EC2**: Traditional server deployment

### Environment Variables
Create `.env` file:
```
FLASK_ENV=production
SECRET_KEY=your-secret-key
API_KEY=your-gemini-api-key
```

### Security Considerations
1. Use HTTPS in production
2. Implement rate limiting
3. Add authentication for sensitive endpoints
4. Validate and sanitize all inputs
5. Use environment variables for secrets

### Performance Optimization
1. Enable caching (Redis/Memcached)
2. Use database instead of CSV for large datasets
3. Implement pagination for large responses
4. Add request compression (gzip)
5. Use CDN for static files

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from datetime import datetime

: 

## 2. Load Data

**Data Sources**:
- Clustering results
- Cluster profiles
- Cluster centroids
- Integrated data (2023-2024)

In [None]:
# Paths
modeling_dir = Path('data/modeling')
cleaned_dir = Path('data/cleaned')
output_dir = Path('frontend/public/data/modeling')
output_dir.mkdir(parents=True, exist_ok=True)

# Load modeling results
df_clustering = pd.read_csv(modeling_dir / 'clustering_results.csv')
df_profiles = pd.read_csv(modeling_dir / 'cluster_profiles.csv')
df_centroids = pd.read_csv(modeling_dir / 'cluster_centroids.csv')

# Load integrated data
df_integrated = pd.read_csv(cleaned_dir / 'data_integrated_wide.csv')

print("="*60)
print("DATA LOADED SUCCESSFULLY")
print("="*60)
print(f"Clustering results: {df_clustering.shape}")
print(f"Cluster profiles: {df_profiles.shape}")
print(f"Cluster centroids: {df_centroids.shape}")
print(f"Integrated data: {df_integrated.shape}")

DATA LOADED SUCCESSFULLY
Clustering results: (1028, 9)
Cluster profiles: (4, 20)
Cluster centroids: (4, 6)
Integrated data: (1028, 47)


: 

## 3. Generate Summary Statistics

**Tujuan**: Membuat ringkasan statistik untuk dashboard cards.

**Metrics**:
- Total kabupaten/kota
- Number of clusters
- Average expenditure per category
- Total expenditure across Indonesia
- Year-over-year growth

In [None]:
# Calculate summary statistics
summary_stats = {
    "metadata": {
        "generated_at": datetime.now().isoformat(),
        "total_kabupaten": int(df_clustering['Kabupaten_Kota'].nunique()),
        "total_clusters": int(df_profiles['Cluster'].nunique()),
        "data_year": int(df_clustering['Tahun'].iloc[0])
    },
    "expenditure": {
        "buah": {
            "mean": float(df_clustering['Total_Buah'].mean()),
            "median": float(df_clustering['Total_Buah'].median()),
            "min": float(df_clustering['Total_Buah'].min()),
            "max": float(df_clustering['Total_Buah'].max()),
            "std": float(df_clustering['Total_Buah'].std())
        },
        "sayur": {
            "mean": float(df_clustering['Total_Sayur'].mean()),
            "median": float(df_clustering['Total_Sayur'].median()),
            "min": float(df_clustering['Total_Sayur'].min()),
            "max": float(df_clustering['Total_Sayur'].max()),
            "std": float(df_clustering['Total_Sayur'].std())
        },
        "total": {
            "mean": float(df_clustering['Total_Buah'].mean() + df_clustering['Total_Sayur'].mean()),
            "weekly_national": float((df_clustering['Total_Buah'] + df_clustering['Total_Sayur']).sum()),
            "monthly_national": float((df_clustering['Total_Buah'] + df_clustering['Total_Sayur']).sum() * 4),
            "yearly_national": float((df_clustering['Total_Buah'] + df_clustering['Total_Sayur']).sum() * 52)
        }
    },
    "clusters": []
}

# Add cluster information
for _, row in df_profiles.iterrows():
    cluster_info = {
        "id": int(row['Cluster']),
        "size": int(row['Count']),
        "percentage": float(row['Count'] / len(df_clustering) * 100),
        "centroid": {
            "buah": float(row['Total_Buah_mean']),
            "sayur": float(row['Total_Sayur_mean']),
            "total": float(row['Total_Buah_mean'] + row['Total_Sayur_mean'])
        }
    }
    summary_stats['clusters'].append(cluster_info)

# Calculate YoY growth if 2023 data exists
if 'Total_Pengeluaran_2023' in df_integrated.columns and 'Total_Pengeluaran_2024' in df_integrated.columns:
    avg_2023 = df_integrated['Total_Pengeluaran_2023'].mean()
    avg_2024 = df_integrated['Total_Pengeluaran_2024'].mean()
    growth = ((avg_2024 - avg_2023) / avg_2023) * 100
    summary_stats['growth'] = {
        "yoy_percentage": float(growth),
        "avg_2023": float(avg_2023),
        "avg_2024": float(avg_2024)
    }

print("✓ Summary statistics generated")
print(f"  Total Kabupaten: {summary_stats['metadata']['total_kabupaten']}")
print(f"  Total Clusters: {summary_stats['metadata']['total_clusters']}")
print(f"  Average Expenditure: Rp {summary_stats['expenditure']['total']['mean']:,.0f}/minggu")

✓ Summary statistics generated
  Total Kabupaten: 514
  Total Clusters: 4
  Average Expenditure: Rp 45,561/minggu


: 

## 4. Generate Visualization Data

**Tujuan**: Menyiapkan data untuk berbagai chart di frontend.

**Chart Types**:
1. Cluster distribution (pie/donut chart)
2. Expenditure comparison (bar chart)
3. Regional distribution (map data)
4. Time series (trend line)
5. Scatter plot data (buah vs sayur)

In [None]:
visualization_data = {
    "cluster_distribution": [],
    "expenditure_comparison": [],
    "regional_distribution": [],
    "scatter_plot": [],
    "top_regions": []
}

# 1. Cluster distribution for pie chart
for _, row in df_profiles.iterrows():
    visualization_data["cluster_distribution"].append({
        "cluster": f"Cluster {int(row['Cluster'])}",
        "count": int(row["Count"]),
        "percentage": float(row["Count"] / len(df_clustering) * 100)
    })

# 2. Expenditure comparison for bar chart
for _, row in df_profiles.iterrows():
    visualization_data["expenditure_comparison"].append({
        "cluster": f"Cluster {int(row['Cluster'])}",
        "buah": float(row["Total_Buah_mean"]),
        "sayur": float(row["Total_Sayur_mean"]),
        "total": float(row["Total_Buah_mean"] + row["Total_Sayur_mean"])
    })

# 3. Regional distribution
# First, check if Region column exists, if not create a mapping from kabupaten to region
if 'Region' not in df_clustering.columns:
    # Define region mapping based on province prefixes
    def get_region(kabupaten):
        kabupaten_lower = str(kabupaten).lower()
        if any(x in kabupaten_lower for x in ['aceh', 'sumatera', 'medan', 'padang', 'palembang', 'lampung', 'bengkulu', 'jambi', 'riau', 'bangka', 'belitung']):
            return 'Sumatera'
        elif any(x in kabupaten_lower for x in ['jakarta', 'bogor', 'depok', 'tangerang', 'bekasi', 'banten', 'jawa barat', 'bandung', 'cirebon', 'sukabumi', 'tasikmalaya']):
            return 'Jawa Barat'
        elif any(x in kabupaten_lower for x in ['jawa tengah', 'semarang', 'solo', 'surakarta', 'magelang', 'purwokerto', 'tegal', 'pekalongan', 'yogyakarta']):
            return 'Jawa Tengah & DIY'
        elif any(x in kabupaten_lower for x in ['jawa timur', 'surabaya', 'malang', 'kediri', 'blitar', 'jember', 'banyuwangi', 'madiun', 'madura']):
            return 'Jawa Timur'
        elif any(x in kabupaten_lower for x in ['bali', 'nusa tenggara', 'lombok', 'sumbawa', 'flores', 'timor', 'kupang']):
            return 'Bali & Nusa Tenggara'
        elif any(x in kabupaten_lower for x in ['kalimantan', 'pontianak', 'banjarmasin', 'samarinda', 'balikpapan', 'palangkaraya']):
            return 'Kalimantan'
        elif any(x in kabupaten_lower for x in ['sulawesi', 'makassar', 'manado', 'palu', 'kendari', 'gorontalo']):
            return 'Sulawesi'
        elif any(x in kabupaten_lower for x in ['maluku', 'ambon', 'ternate', 'papua', 'jayapura', 'sorong', 'merauke', 'nabire', 'timika']):
            return 'Maluku & Papua'
        else:
            return 'Lainnya'
    
    df_clustering['Region'] = df_clustering['Kabupaten_Kota'].apply(get_region)

# Group by region for regional distribution
regional_stats = df_clustering.groupby('Region').agg({
    'Kabupaten_Kota': 'count',
    'Total_Buah': 'mean',
    'Total_Sayur': 'mean',
    'Total_Pengeluaran': 'mean'
}).reset_index()

regional_stats.columns = ['region', 'count', 'avg_buah', 'avg_sayur', 'avg_total']

for _, row in regional_stats.iterrows():
    visualization_data['regional_distribution'].append({
        "region": str(row['region']),
        "count": int(row['count']),
        "avg_buah": float(row['avg_buah']),
        "avg_sayur": float(row['avg_sayur']),
        "avg_total": float(row['avg_total'])
    })

# 4. Scatter plot data (sample 100 points for performance)
sample_data = df_clustering.sample(n=min(100, len(df_clustering)), random_state=42)
for _, row in sample_data.iterrows():
    visualization_data['scatter_plot'].append({
        "kabupaten": str(row['Kabupaten_Kota']),
        "buah": float(row['Total_Buah']),
        "sayur": float(row['Total_Sayur']),
        "cluster": int(row['Cluster']),
        "region": str(row['Region'])
    })

# 5. Top regions by expenditure
top_regions = df_clustering.nlargest(10, 'Total_Pengeluaran')[['Kabupaten_Kota', 'Region', 'Total_Buah', 'Total_Sayur', 'Total_Pengeluaran', 'Cluster']]

for _, row in top_regions.iterrows():
    visualization_data['top_regions'].append({
        "kabupaten": str(row['Kabupaten_Kota']),
        "region": str(row['Region']),
        "buah": float(row['Total_Buah']),
        "sayur": float(row['Total_Sayur']),
        "total": float(row['Total_Pengeluaran']),
        "cluster": int(row['Cluster'])
    })

print("✓ Visualization data generated")
print(f"  Cluster distribution: {len(visualization_data['cluster_distribution'])} entries")
print(f"  Expenditure comparison: {len(visualization_data['expenditure_comparison'])} entries")
print(f"  Regional distribution: {len(visualization_data['regional_distribution'])} entries")
print(f"  Scatter plot: {len(visualization_data['scatter_plot'])} points")
print(f"  Top regions: {len(visualization_data['top_regions'])} regions")


NameError: name 'df_profiles' is not defined

: 

Saya akan membuat file API notebook untuk implementasi hasil modeling ke frontend. 

Created [](file:///d%3A/Perkuliahan/2025-2026/Analisa%20Big%20Data/ProjectABD/api.ipynb)

File api.ipynb berhasil dibuat! Notebook ini menyediakan:

**Fitur Utama**:
1. **Summary Statistics** - Statistik ringkasan untuk dashboard cards (JSON)
2. **Visualization Data** - Data terstruktur untuk berbagai chart (JSON)
3. **Predictions 2025** - Prediksi pengeluaran tahun depan berdasarkan trend (CSV)
4. **Auto-copy** - Otomatis copy semua file ke folder data

**Output Files**:
- `summary_statistics.json` - Metadata & statistik agregat
- `visualization_data.json` - Data untuk pie chart, bar chart, scatter plot
- `predictions_2025.csv` - Forecast untuk tahun 2025
- Copy semua clustering results ke frontend

Jalankan notebook ini setelah modeling selesai untuk mempersiapkan data yang siap dikonsumsi frontend!

Made changes.

Saya akan membuat file API notebook untuk implementasi hasil modeling ke frontend. 

Created [](file:///d%3A/Perkuliahan/2025-2026/Analisa%20Big%20Data/ProjectABD/api.ipynb)

File api.ipynb berhasil dibuat! Notebook ini menyediakan:

**Fitur Utama**:
1. **Summary Statistics** - Statistik ringkasan untuk dashboard cards (JSON)
2. **Visualization Data** - Data terstruktur untuk berbagai chart (JSON)
3. **Predictions 2025** - Prediksi pengeluaran tahun depan berdasarkan trend (CSV)
4. **Auto-copy** - Otomatis copy semua file ke folder data

**Output Files**:
- `summary_statistics.json` - Metadata & statistik agregat
- `visualization_data.json` - Data untuk pie chart, bar chart, scatter plot
- `predictions_2025.csv` - Forecast untuk tahun 2025
- Copy semua clustering results ke frontend

Jalankan notebook ini setelah modeling selesai untuk mempersiapkan data yang siap dikonsumsi frontend!

Made changes.

## 5. Generate Predictions for 2025

**Tujuan**: Membuat prediksi pengeluaran untuk tahun 2025 berdasarkan trend historis.

**Metode**: Simple linear growth model berdasarkan data 2023-2024.

**Assumptions**:
- Pertumbuhan linear dari 2023 ke 2024 akan berlanjut ke 2025
- Tidak ada shock ekonomi atau perubahan drastis
- Pattern konsumsi tetap konsisten

In [None]:
# Calculate growth rates from integrated data
df_predictions = df_clustering.copy()

# Get 2023 data if available
if 'Total_Pengeluaran_2023' in df_integrated.columns:
    # Merge with integrated data to get 2023 values
    df_with_history = df_clustering.merge(
        df_integrated[['Kabupaten_Kota', 'Total_Pengeluaran_2023']],
        on='Kabupaten_Kota',
        how='left'
    )
    
    # Calculate growth rate
    df_with_history['Growth_Rate'] = (
        (df_with_history['Pengeluaran_Buah'] + df_with_history['Pengeluaran_Sayur'] - df_with_history['Total_Pengeluaran_2023']) 
        / df_with_history['Total_Pengeluaran_2023']
    )
    
    # Fill NaN growth rates with average
    avg_growth_rate = df_with_history['Growth_Rate'].mean()
    df_with_history['Growth_Rate'].fillna(avg_growth_rate, inplace=True)
    
    # Predict 2025 (apply same growth rate)
    df_predictions['Pengeluaran_Buah_2025'] = df_with_history['Pengeluaran_Buah'] * (1 + df_with_history['Growth_Rate'])
    df_predictions['Pengeluaran_Sayur_2025'] = df_with_history['Pengeluaran_Sayur'] * (1 + df_with_history['Growth_Rate'])
else:
    # If no 2023 data, assume 5% growth (conservative estimate)
    default_growth = 0.05
    df_predictions['Pengeluaran_Buah_2025'] = df_predictions['Pengeluaran_Buah'] * (1 + default_growth)
    df_predictions['Pengeluaran_Sayur_2025'] = df_predictions['Pengeluaran_Sayur'] * (1 + default_growth)

df_predictions['Total_Pengeluaran_2025'] = df_predictions['Pengeluaran_Buah_2025'] + df_predictions['Pengeluaran_Sayur_2025']
df_predictions['Tahun'] = 2025

# Select columns for export
predictions_export = df_predictions[[
    'Kabupaten_Kota', 'Region', 'Cluster',
    'Pengeluaran_Buah_2025', 'Pengeluaran_Sayur_2025', 'Total_Pengeluaran_2025', 'Tahun'
]].copy()

predictions_export.rename(columns={
    'Pengeluaran_Buah_2025': 'Pengeluaran_Buah',
    'Pengeluaran_Sayur_2025': 'Pengeluaran_Sayur',
    'Total_Pengeluaran_2025': 'Total_Pengeluaran'
}, inplace=True)

print("✓ Predictions generated for 2025")
print(f"  Total predictions: {len(predictions_export)}")
print(f"  Average predicted buah: Rp {predictions_export['Pengeluaran_Buah'].mean():,.0f}")
print(f"  Average predicted sayur: Rp {predictions_export['Pengeluaran_Sayur'].mean():,.0f}")
print(f"  Average predicted total: Rp {predictions_export['Total_Pengeluaran'].mean():,.0f}")

NameError: name 'df_clustering' is not defined

: 

## 6. Export Files

**Output Files**:
1. `summary_statistics.json` - Statistik ringkasan
2. `visualization_data.json` - Data untuk chart
3. `predictions_2025.csv` - Prediksi tahun 2025
4. Copy clustering files ke frontend public folder

In [None]:
print("="*60)
print("EXPORTING FILES")
print("="*60)

# 1. Export summary statistics JSON
with open(output_dir / 'summary_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2, ensure_ascii=False)
print(f"✓ Exported: summary_statistics.json")

# 2. Export visualization data JSON
with open(output_dir / 'visualization_data.json', 'w', encoding='utf-8') as f:
    json.dump(visualization_data, f, indent=2, ensure_ascii=False)
print(f"✓ Exported: visualization_data.json")

# 3. Export predictions CSV
predictions_export.to_csv(output_dir / 'predictions_2025.csv', index=False)
print(f"✓ Exported: predictions_2025.csv ({predictions_export.shape})")

# 4. Copy clustering files to frontend public folder
import shutil

files_to_copy = [
    'clustering_results.csv',
    'cluster_profiles.csv',
    'cluster_centroids.csv'
]

for filename in files_to_copy:
    src = modeling_dir / filename
    dst = output_dir / filename
    shutil.copy2(src, dst)
    print(f"✓ Copied: {filename}")

# 5. Copy integrated data to frontend
frontend_cleaned_dir = Path('frontend/public/data/cleaned')
frontend_cleaned_dir.mkdir(parents=True, exist_ok=True)

shutil.copy2(cleaned_dir / 'data_integrated_wide.csv', frontend_cleaned_dir / 'data_integrated_wide.csv')
print(f"✓ Copied: data_integrated_wide.csv")

print("\n" + "="*60)
print("✓ ALL FILES EXPORTED SUCCESSFULLY")
print("="*60)
print(f"\nOutput directory: {output_dir.absolute()}")
print(f"\nFiles ready for frontend integration:")
print(f"  - summary_statistics.json")
print(f"  - visualization_data.json")
print(f"  - predictions_2025.csv")
print(f"  - clustering_results.csv")
print(f"  - cluster_profiles.csv")
print(f"  - cluster_centroids.csv")

: 

## 7. Validation & Preview

**Tujuan**: Validasi data yang diekspor dan preview sample data.

In [None]:
print("="*60)
print("DATA VALIDATION & PREVIEW")
print("="*60)

# Validate JSON files
print("\n1. Summary Statistics Sample:")
print(f"   Total Kabupaten: {summary_stats['metadata']['total_kabupaten']}")
print(f"   Total Clusters: {summary_stats['metadata']['total_clusters']}")
print(f"   Average Buah: Rp {summary_stats['expenditure']['buah']['mean']:,.0f}")
print(f"   Average Sayur: Rp {summary_stats['expenditure']['sayur']['mean']:,.0f}")

print("\n2. Visualization Data Sample:")
print(f"   Cluster distribution entries: {len(visualization_data['cluster_distribution'])}")
print(f"   First cluster: {visualization_data['cluster_distribution'][0]}")

print("\n3. Predictions 2025 Sample:")
print(predictions_export.head())

print("\n4. File Sizes:")
for file in output_dir.glob('*'):
    size_kb = file.stat().st_size / 1024
    print(f"   {file.name}: {size_kb:.2f} KB")

print("\n" + "="*60)
print("✓ VALIDATION COMPLETE - Data ready for frontend")
print("="*60)

: 