In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import os
import sys
from pathlib import Path

# Geospatial libraries
import geopandas as gpd
from shapely.geometry import Point
import rasterio
from rasterio.transform import from_bounds

# Interpolation libraries
from pykrige.ok import OrdinaryKriging
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Plot styling: reset matplotlib to its stock style first, then apply seaborn's
# "husl" colour palette on top of it (the order matters, so it is kept).
plt.style.use('default')
sns.set_palette("husl")

# Startup banner: one print call, newline-separated, same two output lines.
print(
    "‚úÖ All libraries imported successfully!",
    f"üìÅ Working directory: {os.getcwd()}",
    sep="\n",
)

## 📊 Step 1: Dataset Selection and Exploration

In [None]:
# File selection widget
data_dir = '/app/data'

# List the CSV files on offer. A missing data directory is treated the same as
# an empty one (os.listdir would otherwise raise and abort this cell before any
# widget exists). The listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the numbering below and the batch
# processing order vary between runs.
if os.path.isdir(data_dir):
    available_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
else:
    available_files = []

if not available_files:
    print("‚ùå No CSV files found in /app/data directory")
else:
    print(f"üìÇ Found {len(available_files)} CSV files:")
    for i, file in enumerate(available_files, 1):
        file_path = os.path.join(data_dir, file)
        file_size = os.path.getsize(file_path) / 1024  # KB
        print(f"   {i}. {file} ({file_size:.1f} KB)")

# Create file selection dropdown
file_selector = widgets.Dropdown(
    options=available_files,
    description='Select CSV:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Create load button
load_button = widgets.Button(
    description='üîÑ Load Dataset',
    button_style='info',
    layout=widgets.Layout(width='150px')
)

# Output area for dataset info
output_area = widgets.Output()

# Global variables holding the currently loaded dataset and its file name.
# Re-running this cell resets both to "nothing loaded".
current_df = None
current_filename = None

def load_dataset(b):
    """Load the CSV chosen in ``file_selector`` and print a summary of it.

    Button-click callback; ``b`` is the clicked button and is not used.
    All output is written into ``output_area``, which is cleared first.

    On success the globals ``current_df`` and ``current_filename`` are updated
    together. If reading the file fails, both keep their previous values so
    the "current" name never points at a file that is not the one held in
    ``current_df`` (downstream steps build output file names from it).
    """
    global current_df, current_filename
    with output_area:
        clear_output()

        selected_name = file_selector.value
        if not selected_name:
            print("‚ö†Ô∏è Please select a file first")
            return

        try:
            loaded_df = pd.read_csv(os.path.join(data_dir, selected_name))

            # Commit only after the read succeeded, and commit both at once.
            current_df, current_filename = loaded_df, selected_name

            print(f"‚úÖ Successfully loaded: {current_filename}")
            print(f"üìä Shape: {current_df.shape[0]} rows √ó {current_df.shape[1]} columns")
            print(f"\nüìã Column Information:")

            # One line per column: dtype, non-null count and missing share
            for i, col in enumerate(current_df.columns, 1):
                column = current_df[col]
                dtype = column.dtype
                non_null = column.count()
                null_pct = column.isnull().mean() * 100
                print(f"   {i:2d}. {col:<20} | {str(dtype):<10} | {non_null:>6} non-null ({null_pct:5.1f}% missing)")

            print(f"\nüîç First 5 rows:")
            display(current_df.head())

            # Basic statistics for numeric columns
            numeric_cols = current_df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                print(f"\nüìà Summary Statistics for Numeric Columns:")
                display(current_df[numeric_cols].describe())

        except Exception as e:
            # Deliberately broad: this is a UI callback, so any failure is
            # reported in the output area instead of being lost.
            print(f"‚ùå Error loading file: {str(e)}")

# Hook the button up to its handler, then render the controls row followed by
# the output area that load_dataset writes into.
load_button.on_click(load_dataset)

display(
    widgets.HBox([file_selector, load_button]),
    output_area,
)

## 🎯 Step 2: Variable Selection for Interpolation

In [None]:
# Variable selection widgets
def create_variable_selectors():
    """Build and display the Step 2 controls for the loaded dataset.

    Reads the global ``current_df``; prints a notice and returns if no dataset
    is loaded. Otherwise it (re)initialises the global ``interpolation_config``
    to its empty defaults and displays dropdowns for latitude, longitude, an
    optional date column and the method, a multi-select for the numeric
    variables, a grid-resolution slider and a validate button.

    ``interpolation_config`` is only filled in when the validate button is
    clicked AND the selection passes every check, so a failed validation never
    leaves a half-checked configuration behind for the interpolation step.
    """
    import re  # local import: only the column-name heuristic below needs it

    if current_df is None:
        print("‚ö†Ô∏è Please load a dataset first!")
        return

    # Get column options
    all_columns = list(current_df.columns)
    numeric_columns = list(current_df.select_dtypes(include=[np.number]).columns)

    def guess_column(terms):
        """Return the column whose name best matches ``terms``, or None.

        Exact names and delimited tokens (``station_lat``, ``Y``) win over
        plain substring hits, and substring matching ignores terms shorter
        than three characters so that e.g. 'y' no longer matches 'humidity'.
        """
        lowered = [(col, str(col).lower()) for col in all_columns]
        for col, name in lowered:
            tokens = re.split(r'[^a-z0-9]+', name)
            if name in terms or any(token in terms for token in tokens):
                return col
        for col, name in lowered:
            if any(len(term) >= 3 and term in name for term in terms):
                return col
        return None

    # Coordinate selectors
    print("üåç Select Coordinate Columns:")

    lat_selector = widgets.Dropdown(
        options=[None] + all_columns,
        description='Latitude:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    lon_selector = widgets.Dropdown(
        options=[None] + all_columns,
        description='Longitude:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    # Try to auto-detect coordinate columns (the user can still override)
    lat_guess = guess_column(('latitude', 'lat', 'y', 'northing'))
    lon_guess = guess_column(('longitude', 'lon', 'lng', 'x', 'easting'))

    if lat_guess is not None:
        lat_selector.value = lat_guess
    if lon_guess is not None:
        lon_selector.value = lon_guess

    # Variables to interpolate
    print("\nüìä Select Variables to Interpolate:")

    variable_selector = widgets.SelectMultiple(
        options=numeric_columns,
        description='Variables:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px', height='150px')
    )

    # Date/time column (optional)
    date_selector = widgets.Dropdown(
        options=[None] + all_columns,
        description='Date Column:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    # Try to auto-detect date column
    date_guess = guess_column(('date', 'time', 'day', 'month', 'year'))
    if date_guess is not None:
        date_selector.value = date_guess

    # Interpolation method
    method_selector = widgets.Dropdown(
        options=['Kriging', 'IDW', 'Random Forest', 'Linear'],
        value='Kriging',
        description='Method:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='200px')
    )

    # Grid resolution (number of grid nodes per axis)
    resolution_selector = widgets.IntSlider(
        value=100,
        min=50,
        max=500,
        step=50,
        description='Grid Resolution:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    # Validate and process button
    validate_button = widgets.Button(
        description='‚úÖ Validate Selection',
        button_style='success',
        layout=widgets.Layout(width='200px')
    )

    # Output area for validation
    validation_output = widgets.Output()

    # Store selections globally; reset to "nothing configured" every time the
    # selectors are rebuilt.
    global interpolation_config
    interpolation_config = {
        'lat_col': None,
        'lon_col': None,
        'variables': [],
        'date_col': None,
        'method': 'Kriging',
        'resolution': 100
    }

    def validate_selection(b):
        """Check the current widget selection and, if valid, commit it."""
        with validation_output:
            clear_output()

            lat_col = lat_selector.value
            lon_col = lon_selector.value
            chosen_vars = list(variable_selector.value)

            # Validate selections
            errors = []

            if not lat_col:
                errors.append("‚ùå Please select a latitude column")
            if not lon_col:
                errors.append("‚ùå Please select a longitude column")
            if lat_col and lon_col and lat_col == lon_col:
                errors.append("‚ùå Latitude and longitude must be different columns")
            if not chosen_vars:
                errors.append("‚ùå Please select at least one variable to interpolate")

            if errors:
                for error in errors:
                    print(error)
                return

            # Validate data. Every summary line is formatted up front, so a
            # non-numeric coordinate column (the float format raises) is
            # caught before anything is committed or printed.
            try:
                lat_data = current_df[lat_col].dropna()
                lon_data = current_df[lon_col].dropna()

                coordinate_lines = [
                    f"   Latitude: {lat_col} (range: {lat_data.min():.4f} to {lat_data.max():.4f})",
                    f"   Longitude: {lon_col} (range: {lon_data.min():.4f} to {lon_data.max():.4f})",
                ]

                variable_lines = []
                for var in chosen_vars:
                    var_data = current_df[var].dropna()
                    variable_lines.append(
                        f"   ‚Ä¢ {var}: {len(var_data)} valid points (range: {var_data.min():.3f} to {var_data.max():.3f})"
                    )
            except Exception as e:
                print(f"‚ùå Validation error: {str(e)}")
                return

            # Update config only now that the selection is known to be usable
            interpolation_config.update({
                'lat_col': lat_col,
                'lon_col': lon_col,
                'variables': chosen_vars,
                'date_col': date_selector.value,
                'method': method_selector.value,
                'resolution': resolution_selector.value
            })

            print(f"‚úÖ Configuration validated successfully!")
            print(f"\nüìç Coordinate System:")
            for line in coordinate_lines:
                print(line)

            print(f"\nüìä Variables to interpolate: {len(chosen_vars)}")
            for line in variable_lines:
                print(line)

            print(f"\n‚öôÔ∏è Interpolation Settings:")
            print(f"   Method: {method_selector.value}")
            print(f"   Grid Resolution: {resolution_selector.value}x{resolution_selector.value}")
            if date_selector.value:
                print(f"   Date Column: {date_selector.value}")

            print(f"\nüöÄ Ready to run spatial interpolation!")

    validate_button.on_click(validate_selection)

    # Display widgets
    display(widgets.VBox([
        widgets.HBox([lat_selector, lon_selector]),
        variable_selector,
        widgets.HBox([date_selector, method_selector]),
        resolution_selector,
        validate_button
    ]))

    display(validation_output)

# Button to show variable selectors
show_selectors_btn = widgets.Button(
    description='üéØ Configure Variables',
    button_style='warning',
    layout=widgets.Layout(width='200px')
)

# Everything the click handler prints or displays is routed into a dedicated
# Output widget. Output produced directly inside a widget callback is not
# shown by some front-ends, and clearing the area first stops repeated clicks
# from stacking duplicate sets of selectors.
selectors_output = widgets.Output()


def _on_show_selectors_clicked(b):
    """Click handler: rebuild the variable selectors inside ``selectors_output``."""
    with selectors_output:
        clear_output()
        create_variable_selectors()


show_selectors_btn.on_click(_on_show_selectors_clicked)
display(show_selectors_btn, selectors_output)

## 🚀 Step 3: Run Spatial Interpolation

In [None]:
def _idw_surface(lons, lats, values, grid_lon, grid_lat, power=2):
    """Inverse-distance-weighted surface evaluated on a meshgrid.

    Works one grid row at a time, so the distance matrix is only
    (grid columns x data points) instead of one Python iteration per node.
    """
    surface = np.empty(grid_lat.shape, dtype=float)
    for row in range(grid_lat.shape[0]):
        d_lat = grid_lat[row][:, None] - lats[None, :]
        d_lon = grid_lon[row][:, None] - lons[None, :]
        # Clamp tiny distances to avoid dividing by zero on top of a data point
        distances = np.maximum(np.sqrt(d_lat ** 2 + d_lon ** 2), 1e-10)
        weights = 1.0 / distances ** power
        surface[row] = weights.dot(values) / weights.sum(axis=1)
    return surface


def _interpolate_surface(method, lons, lats, values, grid_lons, grid_lats, grid_lon, grid_lat):
    """Interpolate ``values`` onto the grid; return ``(surface, method_used)``.

    Any failure of the requested method, or an unrecognised method name, falls
    back to IDW. The caller's configuration is never modified; the method that
    actually produced the surface is reported through the return value.
    """
    try:
        if method == 'Kriging':
            ok = OrdinaryKriging(
                lons, lats, values,
                variogram_model='linear',
                verbose=False,
                enable_plotting=False
            )
            z, _variance = ok.execute('grid', grid_lons, grid_lats)
            print(f"   ‚úÖ Kriging interpolation completed")
            # Plain ndarray (masked cells, if any, become NaN) for plotting/export
            return np.ma.filled(z, np.nan), 'Kriging'

        if method == 'Linear':
            from scipy.interpolate import griddata  # only needed for this method
            # Nodes outside the convex hull of the data (e.g. the buffer ring)
            # are returned as NaN by griddata.
            surface = griddata(
                np.column_stack([lons, lats]), values, (grid_lon, grid_lat), method='linear'
            )
            print(f"   ‚úÖ Linear interpolation completed")
            return surface, 'Linear'

        if method == 'Random Forest':
            # Coordinates are the only features; fixed seed for repeatable output
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(np.column_stack([lons, lats]), values)
            nodes = np.column_stack([grid_lon.ravel(), grid_lat.ravel()])
            surface = model.predict(nodes).reshape(grid_lon.shape)
            print(f"   ‚úÖ Random Forest interpolation completed")
            return surface, 'Random Forest'

        if method != 'IDW':
            print(f"   ‚ö†Ô∏è Unsupported method: {method}")
            print(f"   üîÑ Falling back to IDW...")
    except Exception as e:
        print(f"   ‚ùå {method} failed: {e}")
        print(f"   üîÑ Falling back to IDW...")

    surface = _idw_surface(lons, lats, values, grid_lon, grid_lat)
    print(f"   ‚úÖ IDW interpolation completed")
    return surface, 'IDW'


def _write_geotiff(output_file, surface, grid_lon, grid_lat):
    """Write ``surface`` as a single-band EPSG:4326 GeoTIFF."""
    height, width = surface.shape

    # Grid nodes are cell centres, so the raster's outer edges lie half a cell
    # beyond the first and last node on every side.
    half_dx = (grid_lon.max() - grid_lon.min()) / (2 * max(width - 1, 1))
    half_dy = (grid_lat.max() - grid_lat.min()) / (2 * max(height - 1, 1))
    transform = from_bounds(
        grid_lon.min() - half_dx, grid_lat.min() - half_dy,
        grid_lon.max() + half_dx, grid_lat.max() + half_dy,
        width, height
    )

    with rasterio.open(
        output_file, 'w',
        driver='GTiff',
        height=height,
        width=width,
        count=1,
        dtype=surface.dtype,
        crs='EPSG:4326',
        transform=transform,
    ) as dst:
        # Row 0 of the grid is the southernmost latitude, whereas the
        # transform from from_bounds puts row 0 at the northern edge.
        dst.write(np.ascontiguousarray(np.flipud(surface)), 1)


def run_spatial_interpolation():
    """Interpolate every configured variable of the loaded dataset.

    Reads the globals ``current_df``, ``current_filename`` and
    ``interpolation_config``. For each variable with at least three valid
    points it builds a regular grid over the data extent (plus a 10% buffer),
    interpolates with the configured method (falling back to IDW on failure),
    plots up to three variables, and writes one GeoTIFF per variable to
    ``/app/outputs``.

    On success the per-variable results are stored in the global
    ``interpolation_results``. Results of an earlier run are discarded up
    front, so a failed run never leaves another dataset's results behind.
    Errors are printed with a traceback rather than raised. Returns None.
    """
    global interpolation_results

    if current_df is None:
        print("‚ö†Ô∏è Please load a dataset first!")
        return

    if 'interpolation_config' not in globals() or not interpolation_config['lat_col']:
        print("‚ö†Ô∏è Please configure variables first!")
        return

    config = interpolation_config
    resolution = config['resolution']

    # Results from a previous run belong to a previous dataset/configuration.
    globals().pop('interpolation_results', None)

    print(f"üöÄ Starting spatial interpolation for {current_filename}...")
    print(f"üìä Processing {len(config['variables'])} variables using {config['method']}")

    try:
        # Prepare data
        df_clean = current_df.dropna(subset=[config['lat_col'], config['lon_col']])

        results = {}

        for variable in config['variables']:
            print(f"\nüîÑ Processing {variable}...")

            # Get clean data for this variable
            var_data = df_clean.dropna(subset=[variable])

            if len(var_data) < 3:
                print(f"‚ö†Ô∏è Insufficient data points for {variable} ({len(var_data)} points)")
                continue

            # Extract coordinates and values as plain float arrays; a
            # non-numeric column fails here with a clear conversion error.
            lats = var_data[config['lat_col']].to_numpy(dtype=float)
            lons = var_data[config['lon_col']].to_numpy(dtype=float)
            values = var_data[variable].to_numpy(dtype=float)

            # Create interpolation grid over the data extent plus a 10% buffer
            lat_min, lat_max = lats.min(), lats.max()
            lon_min, lon_max = lons.min(), lons.max()
            lat_buffer = (lat_max - lat_min) * 0.1
            lon_buffer = (lon_max - lon_min) * 0.1

            grid_lats = np.linspace(lat_min - lat_buffer, lat_max + lat_buffer, resolution)
            grid_lons = np.linspace(lon_min - lon_buffer, lon_max + lon_buffer, resolution)
            grid_lon, grid_lat = np.meshgrid(grid_lons, grid_lats)

            # The method is resolved per variable; a fallback for one variable
            # must not change the configuration used for the next one.
            interpolated, method_used = _interpolate_surface(
                config['method'], lons, lats, values,
                grid_lons, grid_lats, grid_lon, grid_lat
            )

            # Store results
            results[variable] = {
                'interpolated': interpolated,
                'grid_lat': grid_lat,
                'grid_lon': grid_lon,
                'original_lats': lats,
                'original_lons': lons,
                'original_values': values,
                'n_points': len(values),
                'method': method_used
            }

        # Generate visualizations
        print(f"\nüìä Generating visualizations...")

        n_vars = len(results)
        if n_vars == 0:
            print("‚ùå No variables were successfully interpolated")
            return

        # squeeze=False always yields a 2-D axes array, so the single-variable
        # case is indexed exactly like the multi-variable one.
        fig, axes = plt.subplots(2, min(n_vars, 3), figsize=(15, 10), squeeze=False)

        for idx, (var_name, result) in enumerate(list(results.items())[:3]):  # Max 3 variables for display
            ax1 = axes[0, idx]
            ax2 = axes[1, idx]

            # Plot interpolated surface with the sample points on top
            im1 = ax1.contourf(result['grid_lon'], result['grid_lat'], result['interpolated'],
                               levels=20, cmap='viridis')
            ax1.scatter(result['original_lons'], result['original_lats'],
                        c=result['original_values'], s=50, cmap='viridis',
                        edgecolors='white', linewidth=1)
            ax1.set_title(f'{var_name} - Interpolated Surface\n({result["n_points"]} data points)')
            ax1.set_xlabel('Longitude')
            ax1.set_ylabel('Latitude')
            plt.colorbar(im1, ax=ax1, shrink=0.8)

            # Plot original data points
            scatter = ax2.scatter(result['original_lons'], result['original_lats'],
                                  c=result['original_values'], s=80, cmap='plasma',
                                  edgecolors='black', linewidth=1)
            ax2.set_title(f'{var_name} - Original Data Points')
            ax2.set_xlabel('Longitude')
            ax2.set_ylabel('Latitude')
            plt.colorbar(scatter, ax=ax2, shrink=0.8)

        plt.tight_layout()
        plt.show()
        # Release the figure; batch runs would otherwise accumulate open figures
        plt.close(fig)

        # Save results
        output_dir = '/app/outputs'
        os.makedirs(output_dir, exist_ok=True)

        # Save interpolated surfaces as GeoTIFF
        for var_name, result in results.items():
            output_file = f"{output_dir}/{current_filename.replace('.csv', '')}_{var_name}_interpolated.tif"
            _write_geotiff(output_file, result['interpolated'], result['grid_lon'], result['grid_lat'])
            print(f"üíæ Saved: {output_file}")

        print(f"\n‚úÖ Spatial interpolation completed successfully!")
        print(f"üìÅ Results saved to: {output_dir}")

        # Store results globally for further analysis
        interpolation_results = results

    except Exception as e:
        print(f"‚ùå Error during interpolation: {str(e)}")
        import traceback
        traceback.print_exc()

# Create run button
run_button = widgets.Button(
    description='üöÄ Run Interpolation',
    button_style='success',
    layout=widgets.Layout(width='200px', height='40px')
)

# Progress messages, tracebacks and figures produced by the click handler are
# routed into a dedicated Output widget; output emitted directly from a widget
# callback is not shown by some front-ends. The area is cleared per run.
run_output = widgets.Output()


def _on_run_clicked(b):
    """Click handler: run the interpolation with its output in ``run_output``."""
    with run_output:
        clear_output()
        run_spatial_interpolation()


run_button.on_click(_on_run_clicked)
display(run_button, run_output)

## 💾 Step 4: Export Results and Generate Reports

In [None]:
def export_results(output_dir='/app/outputs'):
    """Write a text report and a down-sampled grid CSV per interpolated variable.

    Reads the globals ``interpolation_results``, ``interpolation_config`` and
    ``current_filename``; prints a notice and returns if no results exist.

    Args:
        output_dir: Directory the files are written to (created if missing).
            Defaults to the directory the interpolation step writes to.

    Files written (``<base>`` is the dataset file name without ``.csv``):
        ``<base>_interpolation_report.txt`` and, per variable,
        ``<base>_<variable>_grid_sample.csv``.
    """
    if 'interpolation_results' not in globals():
        print("‚ö†Ô∏è No interpolation results to export. Please run interpolation first.")
        return

    print("üíæ Exporting results and generating reports...")

    os.makedirs(output_dir, exist_ok=True)
    base_name = current_filename.replace('.csv', '')

    # Generate summary report
    report_file = f"{output_dir}/{base_name}_interpolation_report.txt"

    # Explicit encoding: column names are free text and may be non-ASCII
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(f"CHEAQI Spatial Interpolation Report\n")
        f.write(f"===================================\n\n")
        f.write(f"Dataset: {current_filename}\n")
        f.write(f"Processing Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write(f"Configuration:\n")
        f.write(f"- Latitude Column: {interpolation_config['lat_col']}\n")
        f.write(f"- Longitude Column: {interpolation_config['lon_col']}\n")
        f.write(f"- Interpolation Method: {interpolation_config['method']}\n")
        f.write(f"- Grid Resolution: {interpolation_config['resolution']}x{interpolation_config['resolution']}\n\n")

        f.write(f"Variables Processed:\n")
        for var_name, result in interpolation_results.items():
            original_values = result['original_values']
            f.write(f"\n{var_name}:\n")
            f.write(f"  - Data Points: {result['n_points']}\n")
            f.write(f"  - Value Range: {original_values.min():.3f} to {original_values.max():.3f}\n")
            f.write(f"  - Mean: {original_values.mean():.3f}\n")
            f.write(f"  - Std Dev: {original_values.std():.3f}\n")
            # Only present when the result records which method produced it
            if 'method' in result:
                f.write(f"  - Method Used: {result['method']}\n")
            f.write(f"  - Output File: {base_name}_{var_name}_interpolated.tif\n")

    print(f"üìÑ Report saved: {report_file}")

    # Export CSV with a sample of the interpolated grid
    for var_name, result in interpolation_results.items():
        surface = result['interpolated']

        # Keep roughly 10 nodes per axis. The step is derived from the stored
        # grid itself, not from the current config, which may have been
        # changed since the interpolation ran.
        row_step = max(1, surface.shape[0] // 10)
        col_step = max(1, surface.shape[1] // 10)

        sample_df = pd.DataFrame({
            'latitude': result['grid_lat'][::row_step, ::col_step].flatten(),
            'longitude': result['grid_lon'][::row_step, ::col_step].flatten(),
            f'{var_name}_interpolated': surface[::row_step, ::col_step].flatten()
        })

        sample_file = f"{output_dir}/{base_name}_{var_name}_grid_sample.csv"
        sample_df.to_csv(sample_file, index=False)
        print(f"üìä Grid sample saved: {sample_file}")

    print(f"\n‚úÖ All results exported successfully!")
    print(f"üìÅ Check the outputs directory for all files.")

# Export button
export_button = widgets.Button(
    description='üíæ Export Results',
    button_style='info',
    layout=widgets.Layout(width='200px')
)

# Messages printed by the click handler are routed into a dedicated Output
# widget; output emitted directly from a widget callback is not shown by some
# front-ends. The area is cleared on every export.
export_output = widgets.Output()


def _on_export_clicked(b):
    """Click handler: export the results with its output in ``export_output``."""
    with export_output:
        clear_output()
        export_results()


export_button.on_click(_on_export_clicked)
display(export_button, export_output)

## 🔄 Step 5: Batch Processing (Process All CSV Files)

Use this section to apply the same interpolation configuration to all CSV files automatically.

In [None]:
def process_all_files():
    """Apply the current interpolation configuration to every listed CSV file.

    For each file in ``available_files`` the file is loaded into the globals
    ``current_df`` / ``current_filename`` (they keep pointing at the last file
    processed), the interpolation and export steps are run, and the outcome is
    recorded. A console summary is printed and a timestamped summary file is
    written to ``/app/outputs``.

    A file counts as successful only if the interpolation step actually
    published results for it. That step reports its own errors instead of
    raising, so success is detected through the global
    ``interpolation_results``, which is cleared before each file.
    """
    global current_df, current_filename

    if 'interpolation_config' not in globals() or not interpolation_config['lat_col']:
        print("‚ö†Ô∏è Please configure interpolation settings first using a sample file!")
        return

    print("üîÑ Starting batch processing of all CSV files...")
    print(f"üìä Configuration will be applied to all {len(available_files)} files\n")

    batch_results = {}

    for i, filename in enumerate(available_files, 1):
        print(f"\n{'='*60}")
        print(f"üìÇ Processing file {i}/{len(available_files)}: {filename}")
        print(f"{'='*60}")

        try:
            # Load current file
            file_path = os.path.join(data_dir, filename)
            df = pd.read_csv(file_path)

            # Update global variables for current file
            current_df = df
            current_filename = filename

            print(f"‚úÖ Loaded {filename}: {df.shape[0]} rows √ó {df.shape[1]} columns")

            # Check if required columns exist
            missing_cols = [
                interpolation_config[key]
                for key in ('lat_col', 'lon_col')
                if interpolation_config[key] not in df.columns
            ]

            available_vars = [var for var in interpolation_config['variables'] if var in df.columns]
            missing_vars = [var for var in interpolation_config['variables'] if var not in df.columns]

            if missing_cols:
                print(f"‚ö†Ô∏è Missing coordinate columns: {missing_cols}")
                batch_results[filename] = {'status': 'failed', 'reason': f'Missing columns: {missing_cols}'}
                continue

            if not available_vars:
                print(f"‚ö†Ô∏è No target variables found in {filename}")
                batch_results[filename] = {'status': 'failed', 'reason': 'No target variables found'}
                continue

            if missing_vars:
                print(f"‚ö†Ô∏è Missing variables (will skip): {missing_vars}")
                print(f"üìä Processing available variables: {available_vars}")

            # Forget the previous file's results so they can neither be
            # exported under this file's name nor be mistaken for success.
            globals().pop('interpolation_results', None)
            file_results = None

            # Temporarily update config for this file; always restored, even
            # if the export step raises.
            original_vars = interpolation_config['variables'].copy()
            interpolation_config['variables'] = available_vars
            try:
                # Run interpolation
                run_spatial_interpolation()
                file_results = globals().get('interpolation_results')

                # Export results (only if there is something to export)
                if file_results:
                    export_results()
            finally:
                # Restore original config
                interpolation_config['variables'] = original_vars

            if not file_results:
                print(f"‚ùå {filename}: interpolation produced no results")
                batch_results[filename] = {'status': 'failed', 'reason': 'Interpolation produced no results'}
                continue

            # Variables present in the file but without a result (e.g. too few
            # valid points) count as skipped.
            processed_vars = [var for var in available_vars if var in file_results]
            skipped_vars = missing_vars + [var for var in available_vars if var not in file_results]

            batch_results[filename] = {
                'status': 'success',
                'variables_processed': processed_vars,
                'variables_skipped': skipped_vars
            }

            print(f"‚úÖ {filename} processed successfully!")

        except Exception as e:
            print(f"‚ùå Error processing {filename}: {str(e)}")
            batch_results[filename] = {'status': 'failed', 'reason': str(e)}

    # Generate batch summary
    print(f"\n{'='*80}")
    print(f"üìã BATCH PROCESSING SUMMARY")
    print(f"{'='*80}")

    successful = sum(1 for r in batch_results.values() if r['status'] == 'success')
    failed = len(batch_results) - successful

    print(f"‚úÖ Successful: {successful}/{len(available_files)} files")
    print(f"‚ùå Failed: {failed}/{len(available_files)} files")

    print(f"\nüìä Detailed Results:")
    for filename, result in batch_results.items():
        if result['status'] == 'success':
            vars_processed = len(result.get('variables_processed', []))
            vars_skipped = len(result.get('variables_skipped', []))
            print(f"   ‚úÖ {filename}: {vars_processed} variables processed, {vars_skipped} skipped")
        else:
            print(f"   ‚ùå {filename}: {result['reason']}")

    # Save batch summary. The directory may not exist yet when every file
    # failed before anything was written.
    os.makedirs('/app/outputs', exist_ok=True)
    summary_file = f"/app/outputs/batch_processing_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("CHEAQI Batch Processing Summary\n")
        f.write("================================\n\n")
        f.write(f"Processing Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total Files: {len(available_files)}\n")
        f.write(f"Successful: {successful}\n")
        f.write(f"Failed: {failed}\n\n")

        f.write("Configuration Used:\n")
        for key, value in interpolation_config.items():
            f.write(f"  {key}: {value}\n")

        f.write("\nResults:\n")
        for filename, result in batch_results.items():
            f.write(f"\n{filename}:\n")
            f.write(f"  Status: {result['status']}\n")
            if result['status'] == 'success':
                f.write(f"  Variables Processed: {result.get('variables_processed', [])}\n")
                f.write(f"  Variables Skipped: {result.get('variables_skipped', [])}\n")
            else:
                f.write(f"  Reason: {result['reason']}\n")

    print(f"\nüíæ Summary saved: {summary_file}")
    print(f"\nüéâ Batch processing completed!")

# Batch processing button
batch_button = widgets.Button(
    description='üîÑ Process All Files',
    button_style='warning',
    layout=widgets.Layout(width='200px', height='40px')
)

# The batch log and figures produced by the click handler are routed into a
# dedicated Output widget; output emitted directly from a widget callback is
# not shown by some front-ends. The area is cleared on every batch run.
batch_output = widgets.Output()


def _on_batch_clicked(b):
    """Click handler: run the batch with its output in ``batch_output``."""
    with batch_output:
        clear_output()
        process_all_files()


batch_button.on_click(_on_batch_clicked)
display(batch_button)

# Show warning
display(HTML("""
<div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 5px; padding: 15px; margin: 10px 0;">
    <strong>‚ö†Ô∏è Important:</strong> Make sure to configure your interpolation settings using a sample file first before running batch processing!
</div>
"""))

# The batch log appears below the warning banner
display(batch_output)