<a href="https://colab.research.google.com/github/your-repo/your-project/blob/main/v2/nb/process_raw_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab helpers: `drive` is used to mount Google Drive, `runtime` to manage the Colab VM
from google.colab import drive, runtime
# Import required modules
import os
import glob
import time
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
import json


In [2]:
# Mount Google Drive at /content/drive; the input and output directories
# configured in this notebook live under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
"""Main function to process all years in two steps."""
# --- Run configuration -------------------------------------------------------
# Where the unscaled per-year batches are read from, and where the scaled
# copies (plus the combined scaling-parameter JSON) are written to.
BASE_INPUT_DIR = '/content/drive/My Drive/Colab Notebooks/wildfire/new_data/processed'
BASE_OUTPUT_DIR = '/content/drive/My Drive/Colab Notebooks/wildfire/new_data/processed_scaled'

# Years to process in this run. The full range is kept for reference:
# YEARS = [2019, 2020, 2021, 2022, 2023, 2024]
YEARS = [2024]

# Which scaler to fit: 'standard' or 'robust'
SCALER_TYPE = 'standard'

# Columns that get scaled. The order matters: scaling parameters are stored
# positionally and matched back to these names by index.
NUMERIC_FEATURES = [
    'LST_Day_1km',
    'LST_Night_1km',
    'Emis_31',
    'Emis_32',
    'dewpoint_temperature_2m',
    'temperature_2m',
    'soil_temperature_level_1',
    'surface_net_thermal_radiation',
    'u_component_of_wind_10m',
    'v_component_of_wind_10m',
    'surface_pressure',
    'total_precipitation',
    'elevation',
    'NDVI',
]

In [4]:
def load_and_combine_features(input_dir, batch_pattern="batch_*.parquet"):
    """Load the numeric feature columns from every batch file in a directory.

    Args:
        input_dir: Directory containing the parquet batch files.
        batch_pattern: Glob pattern (relative to ``input_dir``) selecting the
            batch files to load.

    Returns:
        A 2-D numpy array with the rows of all batches stacked along axis 0
        and one column per entry of ``NUMERIC_FEATURES`` (in that order).

    Raises:
        ValueError: If no file in ``input_dir`` matches ``batch_pattern``.
    """
    print(f"\nLoading features from {input_dir}")

    # Sort so batches are always processed (and logged) in a stable order;
    # glob itself returns files in arbitrary filesystem order.
    batch_files = sorted(glob.glob(os.path.join(input_dir, batch_pattern)))
    total_files = len(batch_files)

    # Fail early with an actionable message instead of numpy's
    # "need at least one array to concatenate".
    if not batch_files:
        raise ValueError(
            f"No batch files matching '{batch_pattern}' found in {input_dir}"
        )

    all_features = []
    for i, batch_file in enumerate(batch_files, 1):
        print(f"Processing batch {i}/{total_files}: {os.path.basename(batch_file)}")
        # Only the feature columns are needed here, so avoid reading the rest
        # of the file into memory.
        df = pd.read_parquet(batch_file, columns=NUMERIC_FEATURES)
        # Re-select to guarantee the column order matches NUMERIC_FEATURES.
        all_features.append(df[NUMERIC_FEATURES].values)

    return np.concatenate(all_features, axis=0)

def compute_scaling_parameters(features, scaler_type='robust'):
    """Fit a scaler on ``features`` and export its parameters as plain lists.

    Args:
        features: 2-D array-like passed straight to the scaler's ``fit``.
        scaler_type: ``'robust'`` selects ``RobustScaler``; any other value
            selects ``StandardScaler``.

    Returns:
        A ``(scaler, params)`` tuple. ``params`` is a dict holding the fitted
        location (``'center_'`` for robust, ``'mean_'`` otherwise) and
        ``'scale_'`` as lists, plus ``'features'`` and ``'scaler_type'``.
    """
    use_robust = scaler_type == 'robust'

    scaler = RobustScaler() if use_robust else StandardScaler()
    scaler.fit(features)

    # The two scalers expose their fitted location under different names.
    location_key = 'center_' if use_robust else 'mean_'

    params = {
        location_key: getattr(scaler, location_key).tolist(),
        'scale_': scaler.scale_.tolist(),
        # Metadata so the parameters can be interpreted later on
        'features': NUMERIC_FEATURES,
        'scaler_type': scaler_type,
    }

    return scaler, params

def transform_and_save_batch(batch_file, scaler, output_dir):
    """Scale the numeric features of one batch file and write the result.

    Args:
        batch_file: Path of the parquet batch to read.
        scaler: Object whose ``transform`` is applied to the feature matrix.
        output_dir: Directory in which the scaled batch is written under the
            same file name as ``batch_file``.

    Returns:
        The path of the written parquet file.
    """
    batch_df = pd.read_parquet(batch_file)

    # Scale the feature matrix in one call ...
    scaled = scaler.transform(batch_df[NUMERIC_FEATURES].values)

    # ... then write each scaled column back over its original; every other
    # column in the frame is left as it was.
    for column_name, column_values in zip(NUMERIC_FEATURES, scaled.T):
        batch_df[column_name] = column_values

    destination = os.path.join(output_dir, os.path.basename(batch_file))
    batch_df.to_parquet(destination)
    return destination

def collect_yearly_statistics(input_dir, scaler_type='robust'):
    """Fit a scaler on one year's batches and return its parameters.

    Args:
        input_dir: Directory holding the year's batch files; its final path
            component is recorded as the year label.
        scaler_type: Forwarded to ``compute_scaling_parameters``.

    Returns:
        The parameter dict from ``compute_scaling_parameters`` extended with
        ``'year'`` and ``'n_samples'`` (number of rows loaded).
    """
    year_label = os.path.basename(input_dir)
    print(f"\nCollecting statistics for year: {year_label}")

    feature_matrix = load_and_combine_features(input_dir)

    # Only the exported parameters are needed; the fitted scaler is discarded.
    print("Computing scaling parameters...")
    _fitted_scaler, stats = compute_scaling_parameters(feature_matrix, scaler_type)

    stats.update(year=year_label, n_samples=len(feature_matrix))
    return stats

def combine_yearly_statistics(yearly_stats):
    """Combine per-year scaling statistics into one global parameter set.

    Args:
        yearly_stats: Non-empty list of dicts as produced by
            ``collect_yearly_statistics``. Each needs ``'features'``,
            ``'scaler_type'``, ``'n_samples'``, ``'scale_'`` and either
            ``'center_'`` (robust) or ``'mean_'`` (standard).

    Returns:
        Dict with the combined location (``'center_'`` or ``'mean_'``),
        ``'scale_'``, ``'features'``, ``'scaler_type'`` and ``'total_samples'``.

    Raises:
        ValueError: If ``yearly_stats`` is empty, or if the years do not all
            share the same feature list and scaler type.
    """
    if not yearly_stats:
        raise ValueError("No yearly statistics provided")

    # Get feature names and scaler type from first year
    features = yearly_stats[0]['features']
    scaler_type = yearly_stats[0]['scaler_type']

    # Parameters are combined positionally, so every year must describe the
    # same features in the same order and come from the same kind of scaler.
    for stats in yearly_stats[1:]:
        if stats['scaler_type'] != scaler_type or list(stats['features']) != list(features):
            raise ValueError(
                "Yearly statistics are inconsistent: all years must share the "
                "same features and scaler_type"
            )

    total_samples = sum(stats['n_samples'] for stats in yearly_stats)
    # One weight per year, proportional to its share of all samples.
    weights = np.array(
        [stats['n_samples'] / total_samples for stats in yearly_stats],
        dtype=float,
    )
    # Shape (n_years, n_features)
    scales = np.array([stats['scale_'] for stats in yearly_stats], dtype=float)

    if scaler_type == 'robust':
        # Medians and IQRs cannot be merged exactly from per-year summaries;
        # a sample-weighted average is used as an approximation.
        centers = np.array([stats['center_'] for stats in yearly_stats], dtype=float)
        combined_params = {
            'center_': (weights @ centers).tolist(),
            'scale_': (weights @ scales).tolist()
        }
    else:
        means = np.array([stats['mean_'] for stats in yearly_stats], dtype=float)
        combined_mean = weights @ means

        # Pooled (population) variance = weighted within-year variance
        #                              + weighted spread of the yearly means.
        # Omitting the second term underestimates the global scale whenever
        # the yearly means differ.
        # NOTE(review): scale_ ** 2 is taken as each year's variance; a scaler
        # that substitutes 1.0 for a zero scale would make this inexact for
        # features that are constant within a year.
        within_var = weights @ (scales ** 2)
        between_var = weights @ ((means - combined_mean) ** 2)
        combined_var = within_var + between_var

        combined_params = {
            'mean_': combined_mean.tolist(),
            'scale_': np.sqrt(combined_var).tolist()
        }

    # Add metadata
    combined_params['features'] = features
    combined_params['scaler_type'] = scaler_type
    combined_params['total_samples'] = total_samples

    return combined_params

def create_scaler_from_params(params):
    """Rebuild a scaler from a stored parameter dict without refitting.

    Args:
        params: Dict with ``'scaler_type'``, ``'scale_'`` and the location
            entry (``'center_'`` when the type is ``'robust'``, ``'mean_'``
            otherwise).

    Returns:
        A ``RobustScaler`` or ``StandardScaler`` whose location and scale
        attributes are set from ``params``.
    """
    is_robust = params['scaler_type'] == 'robust'
    scaler = RobustScaler() if is_robust else StandardScaler()

    # Each scaler keeps its location under a differently named attribute.
    location_attr = 'center_' if is_robust else 'mean_'
    setattr(scaler, location_attr, np.array(params[location_attr]))
    scaler.scale_ = np.array(params['scale_'])

    return scaler

In [5]:
# Driver cell: (1) obtain global scaling parameters, either from the cached
# JSON file or by fitting on every configured year, then (2) apply them to
# each year's batch files and write the scaled copies to BASE_OUTPUT_DIR.
print(f"Starting data scaling process at {datetime.now()}")
print(f"Input directory: {BASE_INPUT_DIR}")
print(f"Output directory: {BASE_OUTPUT_DIR}")
print(f"Scaler type: {SCALER_TYPE}")

os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
combined_stats_file = os.path.join(BASE_OUTPUT_DIR, 'combined_scaling_params.json')

# Check if combined scaling parameters already exist
if os.path.exists(combined_stats_file):
    print(f"\nFound existing scaling parameters at: {combined_stats_file}")
    with open(combined_stats_file, 'r') as f:
        combined_stats = json.load(f)

    # The cached parameters are applied positionally to NUMERIC_FEATURES and
    # decide which scaler class is rebuilt, so refuse to continue when they
    # were produced under a different configuration than the current one.
    cached_scaler_type = combined_stats.get('scaler_type')
    cached_features = combined_stats.get('features')
    if cached_scaler_type != SCALER_TYPE or cached_features != NUMERIC_FEATURES:
        raise ValueError(
            f"Cached scaling parameters in {combined_stats_file} do not match the "
            f"current configuration (cached scaler_type={cached_scaler_type!r}, "
            f"cached features={cached_features!r}). Delete the file to recompute, "
            f"or restore the matching SCALER_TYPE / NUMERIC_FEATURES."
        )
else:
    # Step 1: Collect statistics from all years
    print("\nStep 1: Collecting statistics from all years...")
    yearly_stats = []

    for year in YEARS:
        input_dir = os.path.join(BASE_INPUT_DIR, str(year))
        if not os.path.exists(input_dir):
            print(f"Warning: Directory for year {year} not found, skipping...")
            continue

        year_stats = collect_yearly_statistics(input_dir, SCALER_TYPE)
        yearly_stats.append(year_stats)

    # Combine statistics from all years
    print("\nCombining statistics from all years...")
    combined_stats = combine_yearly_statistics(yearly_stats)

    # Save combined statistics
    with open(combined_stats_file, 'w') as f:
        json.dump(combined_stats, f, indent=2)
    print(f"Saved combined scaling parameters to: {combined_stats_file}")

# Step 2: Apply scaling to each year using combined statistics
print("\nStep 2: Applying scaling to each year...")
scaler = create_scaler_from_params(combined_stats)

for year in YEARS:
    input_dir = os.path.join(BASE_INPUT_DIR, str(year))
    if not os.path.exists(input_dir):
        # Same message as in step 1, so a skipped year is visible even when
        # the cached parameters were used and step 1 never ran.
        print(f"Warning: Directory for year {year} not found, skipping...")
        continue

    output_dir = os.path.join(BASE_OUTPUT_DIR, str(year))
    os.makedirs(output_dir, exist_ok=True)

    print(f"\nProcessing year: {year}")
    start_time = time.time()

    # Process each batch file; sorted so the order (and the progress log) is
    # stable instead of depending on filesystem listing order.
    batch_files = sorted(glob.glob(os.path.join(input_dir, "batch_*.parquet")))
    total_batches = len(batch_files)

    print(f"Processing {total_batches} batch files...")
    for i, batch_file in enumerate(batch_files, 1):
        print(f"Transforming batch {i}/{total_batches}: {os.path.basename(batch_file)}")
        transform_and_save_batch(batch_file, scaler, output_dir)

    # Copy location mapping file if it exists (carried over unscaled)
    mapping_file = os.path.join(input_dir, 'location_mapping.parquet')
    if os.path.exists(mapping_file):
        output_mapping = os.path.join(output_dir, 'location_mapping.parquet')
        pd.read_parquet(mapping_file).to_parquet(output_mapping)

    duration = time.time() - start_time
    print(f"Year {year} processing completed in {duration:.2f} seconds")

print(f"\nScaling process completed at {datetime.now()}")
print(f"Scaled data saved to: {BASE_OUTPUT_DIR}")
print(f"Combined scaling parameters saved to: {combined_stats_file}")

Starting data scaling process at 2025-02-20 13:16:00.127372
Input directory: /content/drive/My Drive/Colab Notebooks/wildfire/new_data/processed
Output directory: /content/drive/My Drive/Colab Notebooks/wildfire/new_data/processed_scaled
Scaler type: standard

Found existing scaling parameters at: /content/drive/My Drive/Colab Notebooks/wildfire/new_data/processed_scaled/combined_scaling_params.json

Step 2: Applying scaling to each year...

Processing year: 2024
Processing 16 batch files...
Transforming batch 1/16: batch_0001.parquet
Transforming batch 2/16: batch_0002.parquet
Transforming batch 3/16: batch_0003.parquet
Transforming batch 4/16: batch_0004.parquet
Transforming batch 5/16: batch_0005.parquet
Transforming batch 6/16: batch_0006.parquet
Transforming batch 7/16: batch_0007.parquet
Transforming batch 8/16: batch_0008.parquet
Transforming batch 9/16: batch_0009.parquet
Transforming batch 10/16: batch_0010.parquet
Transforming batch 11/16: batch_0011.parquet
Transforming batc

In [6]:
# Final step, run after the scaling cell has finished.
# NOTE(review): presumably this releases the Colab runtime so the VM stops
# consuming resources — confirm against the google.colab documentation.
runtime.unassign()