<a href="https://colab.research.google.com/github/Manvithtech07/Forest-Fire-Detection-And-Prediction/blob/main/notebooks/data_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.3/22.3 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1.2 cligj-0.7.2 rasterio-1.4.3


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written
# through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

import rasterio
import numpy as np
import pandas as pd
import os  # NOTE(review): not referenced in the cells visible here — confirm before removing
import random

# To ignore common warnings
# NOTE(review): with no category or module argument this filter hides *every*
# warning raised afterwards, including ones from rasterio/pandas that may matter.
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
# Root folder of the project inside the mounted Drive.
GDRIVE_PROJECT_PATH = "/content/drive/My Drive/"

# The root already ends with "/", so strip it before appending sub-folders;
# formatting it directly produced "My Drive//data".
_project_root = GDRIVE_PROJECT_PATH.rstrip("/")

# Folder holding the generated datasets, and the sub-folder with the .tif rasters.
DATA_PATH = f"{_project_root}/data"
FEATURES_PATH = f"{DATA_PATH}/features"

## 1. Load All Feature Rasters

We load all eight rasters (`.tif` files) — seven feature maps plus the `dNBR` target — into NumPy arrays. Only the first band of each file is read, and the arrays are stored in a dictionary so we can easily access them by name.

In [None]:
# Raster files to load from FEATURES_PATH. Each one is stored in `features_data`
# under its file name with the extension removed.
feature_filenames = [
    "ndvi_pre.tif",
    "ndvi_post.tif",
    "nbr_pre.tif",
    "nbr_post.tif",
    "elevation.tif",
    "slope.tif",
    "aspect.tif",
    "dNBR.tif" # target variable
]

# The burned/unburned pixel coordinates used for sampling are computed on the dNBR
# array, so dNBR's grid is the one every other raster has to be aligned to.
MASTER_FEATURE = "dNBR"

# feature name -> 2-D array holding band 1 of the corresponding raster
features_data = {}

# Raster metadata (height, width, transform, crs, ...) of the master grid
template_meta = None

try:
    for filename in feature_filenames:
        feature_name = filename.split('.')[0]

        file_path = f"{FEATURES_PATH}/{filename}"

        with rasterio.open(file_path) as src:
            features_data[feature_name] = src.read(1)

            # Keep the metadata of the master raster rather than of whichever
            # file happens to be listed first.
            if feature_name == MASTER_FEATURE:
                template_meta = src.meta

        print(f" Loaded {filename} as '{feature_name}'")

    print("Shape of our data:", features_data['dNBR'].shape)

except Exception as e:
    print(f"ERROR: Could not load files. Check your 'feature_filenames' list. {e}")
    # Every later cell needs the complete dictionary; stop here instead of letting
    # the notebook continue with partially loaded data.
    raise

 Loaded ndvi_pre.tif as 'ndvi_pre'
 Loaded ndvi_post.tif as 'ndvi_post'
 Loaded nbr_pre.tif as 'nbr_pre'
 Loaded nbr_post.tif as 'nbr_post'
 Loaded elevation.tif as 'elevation'
 Loaded slope.tif as 'slope'
 Loaded aspect.tif as 'aspect'
 Loaded dNBR.tif as 'dNBR'
Shape of our data: (2337, 3669)


## 2. Create Pixel Masks for Sampling

Now we use our `dNBR` data to find all the "Burned" and "Unburned" pixels. This is the core of our stratified sampling.

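Following the USGS burn-severity classes, we label a pixel as "Burned" when **`dNBR > 0.27`**, the lower bound of the moderate-low severity class. Pixels in the low-severity range (roughly 0.10–0.27) are therefore counted as "Unburned".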

We will:
1.  Create a "Burned" list of all pixel coordinates `(row, col)` that meet this condition.
2.  Create an "Unburned" list of all pixels that *don't* meet this condition (but are still valid data).

In [None]:
# dNBR value a pixel has to exceed to be labelled "Burned".
BURN_THRESHOLD = 0.27
dnbr_data = features_data['dNBR']

# A pixel is usable only when its dNBR value is not NaN.
valid_pixels_mask = np.logical_not(np.isnan(dnbr_data))
print(f"Total valid (non-cloud) pixels: {np.count_nonzero(valid_pixels_mask)}")

# Valid pixels above the threshold, and their (row, col) coordinates.
burned_mask = valid_pixels_mask & (dnbr_data > BURN_THRESHOLD)
burned_pixels = np.transpose(np.nonzero(burned_mask))
print(f"Found {burned_pixels.shape[0]} 'Burned' pixels.")

# Valid pixels at or below the threshold, and their (row, col) coordinates.
unburned_mask = valid_pixels_mask & (dnbr_data <= BURN_THRESHOLD)
unburned_pixels = np.transpose(np.nonzero(unburned_mask))
print(f"Found {unburned_pixels.shape[0]} 'Unburned' pixels.")



Total valid (non-cloud) pixels: 4057872
Found 24539 'Burned' pixels.
Found 4033333 'Unburned' pixels.


## 3. Perform Stratified Sampling

Now we create our final, balanced dataset: a random sample of up to 5,000 "Burned" points and up to 5,000 "Unburned" points, giving 10,000 samples in total when each class has at least 5,000 pixels.

For each sample point, we will:
1.  Get its `(row, col)` coordinate.
2.  Extract the value from all 8 of our feature maps at that *exact* coordinate.
3.  Store these 8 values, together with the `Burned` label (1 or 0), as one row in our dataset.
4.  Finally, we will save this dataset as a `.csv` file.

In [None]:
from rasterio.warp import reproject, Resampling

def align_to_master(array, src_transform, src_crs, master_meta,
                    src_nodata=None, dst_nodata=np.nan):
    """Warp a single-band array onto the grid described by ``master_meta``.

    Parameters
    ----------
    array : 2-D array
        Source band.
    src_transform, src_crs
        Georeferencing of ``array`` (affine transform and CRS).
    master_meta : dict
        Metadata of the target grid; the keys ``'height'``, ``'width'``,
        ``'transform'`` and ``'crs'`` are read.
    src_nodata : float, optional
        Value in ``array`` to treat as missing. ``None`` (default) passes no
        source nodata value, as before.
    dst_nodata : float, optional
        Value left in output cells the source does not cover. Defaults to NaN
        so those cells can be removed later with ``dropna()``; when no nodata
        value is given, rasterio documents 0 as the fill value, which cannot be
        told apart from a real measurement.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(height, width)``, bilinearly resampled.
    """
    height = master_meta['height']
    width = master_meta['width']

    # Start from a fully "missing" grid rather than uninitialised memory.
    dst = np.full((height, width), dst_nodata, dtype=np.float32)

    reproject(
        source=array,
        destination=dst,
        src_transform=src_transform,
        src_crs=src_crs,
        src_nodata=src_nodata,
        dst_transform=master_meta['transform'],
        dst_crs=master_meta['crs'],
        dst_nodata=dst_nodata,
        resampling=Resampling.bilinear
    )
    return dst

# Re-open every raster, warp its first band onto the grid described by
# `template_meta`, and collect the results under the same feature names.
aligned_features = {}

for filename in feature_filenames:
    feature_name, path = filename.split('.')[0], f"{FEATURES_PATH}/{filename}"

    with rasterio.open(path) as src:
        array = src.read(1)
        aligned = align_to_master(
            array,
            src_transform=src.transform,
            src_crs=src.crs,
            master_meta=template_meta,
        )

    aligned_features[feature_name] = aligned

# From here on, `features_data` refers to the aligned arrays.
features_data = aligned_features


In [None]:
N_SAMPLES_PER_CLASS = 5000
# Fixed seed so that re-running the notebook draws the same pixels.
RANDOM_SEED = 42


def _sample_class(pixel_coords, label, class_name, n_requested, rng):
    """Draw pixels of one class and read every feature at each drawn pixel.

    Parameters
    ----------
    pixel_coords : array of shape (k, 2)
        ``(row, col)`` coordinates of all pixels belonging to the class.
    label : int
        Value stored in the ``'Burned'`` column for these samples.
    class_name : str
        Name used in the progress messages.
    n_requested : int
        Desired sample size; fewer are drawn when the class has fewer pixels.
    rng : random.Random
        Random generator used to draw the sample without replacement.

    Returns
    -------
    tuple
        ``(n_drawn, indices, records)`` where ``indices`` index into
        ``pixel_coords`` and ``records`` is a list with one dict per sample.
    """
    print(f"Sampling {n_requested} '{class_name}' pixels")

    n_drawn = min(len(pixel_coords), n_requested)
    indices = rng.sample(range(len(pixel_coords)), n_drawn)

    records = []
    for i in indices:
        row, col = pixel_coords[i]
        record = {name: data_array[row, col] for name, data_array in features_data.items()}
        record['Burned'] = label
        records.append(record)

    print(f"Sampled {n_drawn} '{class_name}' pixels.")
    return n_drawn, indices, records


# The (row, col) pairs were derived from the dNBR mask, so they only point at the
# intended pixels in arrays that have exactly the mask's shape.
for feature_name, data_array in features_data.items():
    if data_array.shape != burned_mask.shape:
        raise ValueError(
            f"Feature '{feature_name}' has shape {data_array.shape}, but the pixel "
            f"coordinates were computed on a grid of shape {burned_mask.shape}."
        )

rng = random.Random(RANDOM_SEED)

n_burned, sampled_burned_indices, burned_records = _sample_class(
    burned_pixels, 1, "Burned", N_SAMPLES_PER_CLASS, rng
)
n_unburned, sampled_unburned_indices, unburned_records = _sample_class(
    unburned_pixels, 0, "Unburned", N_SAMPLES_PER_CLASS, rng
)

# Burned samples first, then unburned ones.
training_data_list = burned_records + unburned_records

print("\nConverting data to pandas DataFrame")
df = pd.DataFrame(training_data_list)

# Drop samples where any feature is missing.
df_clean = df.dropna()

print(f"Total rows before cleaning: {len(df)}")
print(f"Total rows after cleaning: {len(df_clean)}")

csv_output_path = f"{DATA_PATH}/training_data.csv"
df_clean.to_csv(csv_output_path, index=False)

Sampling 5000 'Burned' pixels
Sampled 5000 'Burned' pixels.
Sampling 5000 'Unburned' pixels
Sampled 5000 'Unburned' pixels.

Converting data to pandas DataFrame
Total rows before cleaning: 10000
Total rows after cleaning: 10000


In [None]:
# Preview the sampled rows. Note that `df` is the frame from before `dropna()`;
# the CSV was written from `df_clean`.
df

Unnamed: 0,ndvi_pre,ndvi_post,nbr_pre,nbr_post,elevation,slope,aspect,dNBR,Burned
0,0.374372,0.135473,0.330744,0.029732,291.253357,89.948647,53.870060,0.301013,1
1,0.328452,0.142593,0.279187,0.008456,294.846497,89.984947,194.088989,0.270731,1
2,0.294952,0.108847,0.274831,-0.011318,325.681427,89.973442,65.138115,0.286149,1
3,0.336803,0.136065,0.311538,0.016770,313.951141,89.978157,227.153778,0.294768,1
4,0.326934,0.128385,0.289117,-0.001431,355.860138,89.987427,168.040558,0.290548,1
...,...,...,...,...,...,...,...,...,...
9995,0.217841,0.201744,0.130376,0.175150,967.651001,89.999069,180.178940,-0.044774,0
9996,0.109236,0.052495,-0.032191,-0.078170,291.378021,89.989197,146.856308,0.045979,0
9997,0.262132,0.129594,0.205681,0.055074,218.332596,89.980705,43.266441,0.150607,0
9998,0.313092,0.215166,0.269641,0.181862,308.936493,89.988815,40.758434,0.087779,0
