# GISTDA Wildfire Machine Learning Training

## Import Libraries and Read Raster (GeoTIFF) Files

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import rasterio
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import display
from dask import delayed, compute
from rasterio.windows import Window

#pd.set_option("display.max_columns", None)  # To show all columns in a pandas DataFrame

# Root folder that is walked recursively to find the training rasters (*.tif)
raster_train_file_path = 'Raster_Train'

# Edge length, in pixels, of the square windows used when reading a raster
CHUNK_SIZE = 1024

@delayed
def read_raster_in_chunks(raster_path, file, root):
    """Read one raster window by window and return its pixels as a long table.

    The raster is traversed in CHUNK_SIZE x CHUNK_SIZE windows (clipped at the
    right and bottom edges). Windows in which every value is zero are skipped.

    Args:
        raster_path: Path of the raster, handed to ``rasterio.open``.
        file: File name, stored verbatim in the ``raster_file`` column.
        root: Directory holding the file; its basename fills ``subfolder``.

    Returns:
        A DataFrame with one row per pixel and the columns ``raster_file``,
        ``subfolder``, ``x``, ``y`` followed by one column per band named
        ``B01``, ``B02``, ... An empty DataFrame is returned when no window
        contained data. Note that ``x`` carries the pixel's row index and
        ``y`` its column index within the raster.
    """
    with rasterio.open(raster_path) as src:
        total_rows, total_cols = src.height, src.width
        band_names = [f'B{band_no:02d}' for band_no in range(1, src.count + 1)]
        subfolder = os.path.basename(root)
        frames = []

        for row_start in range(0, total_rows, CHUNK_SIZE):
            for col_start in range(0, total_cols, CHUNK_SIZE):
                window = Window(
                    col_off=col_start,
                    row_off=row_start,
                    width=min(CHUNK_SIZE, total_cols - col_start),
                    height=min(CHUNK_SIZE, total_rows - row_start),
                )

                # A single read returns every band of the window
                data = src.read(window=window)

                # Nothing but zeros in this window: skip it
                if not np.any(data):
                    continue

                n_rows, n_cols = data[0].shape

                # Absolute pixel indices of the window, row-major ("ij") order
                row_idx, col_idx = np.meshgrid(
                    np.arange(row_start, row_start + n_rows),
                    np.arange(col_start, col_start + n_cols),
                    indexing="ij",
                )

                columns = {
                    'raster_file': file,
                    'subfolder': subfolder,
                    'x': row_idx.flatten(),
                    'y': col_idx.flatten(),
                }
                # One flattened column per band, in band order
                for band_name, band in zip(band_names, data):
                    columns[band_name] = band.flatten()

                frames.append(pd.DataFrame(columns))

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

# Create one delayed read task per .tif file under the training folder.
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so both are sorted to make the row order of final_df reproducible
# (the seeded .sample() used for downsampling later depends on row order).
dask_dfs = []
for root, dirs, files in os.walk(raster_train_file_path):
    dirs.sort()  # in-place sort controls the order in which os.walk descends
    for file in sorted(files):
        if file.endswith('.tif'):
            dask_dfs.append(
                read_raster_in_chunks(os.path.join(root, file), file, root)
            )

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or holds no rasters.
if not dask_dfs:
    raise FileNotFoundError(
        f"No .tif files found under {raster_train_file_path!r}"
    )

# Compute all tasks
dataframes = compute(*dask_dfs)

# Combine all DataFrames
final_df = pd.concat(dataframes, ignore_index=True)

# Debug prints
print("DataFrame shape:", final_df.shape)
print("\nDataFrame columns:", final_df.columns.tolist())
print("\nSample of data:")
print(final_df.head())

DataFrame shape: (41746432, 20)

DataFrame columns: ['raster_file', 'subfolder', 'x', 'y', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12', 'B13', 'B14', 'B15', 'B16']

Sample of data:
                               raster_file     subfolder  x  y    B01     B02  \
0  T47QLA_20210226T035719_chunk_0_1792.tif  Raster_Train  0  0  835.0  1664.0   
1  T47QLA_20210226T035719_chunk_0_1792.tif  Raster_Train  0  1  835.0  1668.0   
2  T47QLA_20210226T035719_chunk_0_1792.tif  Raster_Train  0  2  832.0  1670.0   
3  T47QLA_20210226T035719_chunk_0_1792.tif  Raster_Train  0  3  832.0  1641.0   
4  T47QLA_20210226T035719_chunk_0_1792.tif  Raster_Train  0  4  832.0  1647.0   

      B03     B04     B05     B06     B07     B08     B09     B10     B11  \
0  1722.0  1979.0  2090.0  2131.0  2249.0  2310.0  2341.0  2361.0  3256.0   
1  1744.0  1972.0  2090.0  2131.0  2249.0  2319.0  2341.0  2361.0  3256.0   
2  1726.0  1942.0  2104.0  2147.0  2245.0  2284.0  2343.0  234

## Exploratory Data Analysis (EDA) & Feature Engineering

In [2]:
# final_df is already an in-memory pandas DataFrame, so no conversion (dask or
# otherwise) is needed. `ddf` is kept only as an alias because the name may be
# referenced elsewhere in the notebook.
ddf = final_df

# Keep the band columns and the label: drop the file identifiers, the pixel
# coordinates and band B13. drop() returns a new frame; final_df is untouched.
df = ddf.drop(columns=['raster_file', 'subfolder', 'x', 'y', 'B13'])
display(df)

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B09,B10,B11,B12,B14,B15,B16
0,835.0,1664.0,1722.0,1979.0,2090.0,2131.0,2249.0,2310.0,2341.0,2361.0,3256.0,3100.0,0.077174,-0.145833,1.0
1,835.0,1668.0,1744.0,1972.0,2090.0,2131.0,2249.0,2319.0,2341.0,2361.0,3256.0,3100.0,0.080867,-0.141521,1.0
2,832.0,1670.0,1726.0,1942.0,2104.0,2147.0,2245.0,2284.0,2343.0,2348.0,3273.0,3093.0,0.080928,-0.139152,1.0
3,832.0,1641.0,1723.0,1952.0,2104.0,2147.0,2245.0,2257.0,2343.0,2348.0,3273.0,3093.0,0.072464,-0.134171,1.0
4,832.0,1647.0,1747.0,1972.0,2074.0,2118.0,2228.0,2251.0,2328.0,2348.0,3280.0,3152.0,0.066067,-0.126063,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41746427,1085.0,1134.0,1266.0,1198.0,1543.0,2526.0,3090.0,3751.0,3487.0,3547.0,2400.0,1568.0,0.515862,-0.495316,0.0
41746428,1086.0,1147.0,1444.0,1298.0,1896.0,3828.0,4213.0,4459.0,4742.0,4033.0,2887.0,1841.0,0.549071,-0.510757,0.0
41746429,1086.0,1221.0,1661.0,1351.0,1896.0,3828.0,4213.0,5526.0,4742.0,4033.0,2887.0,1841.0,0.607096,-0.537777,0.0
41746430,1093.0,1220.0,1552.0,1369.0,1978.0,3880.0,4664.0,5294.0,5052.0,4033.0,3059.0,1919.0,0.589074,-0.546597,0.0


In [3]:
## Rename the Sentinel-2 band columns and the burn label
# Target names, listed in the same left-to-right order as df's current columns
new_col_names = [
    f'Band_{band_id}'
    for band_id in ('1', '2', '3', '4', '5', '6', '7', '8', '8A', '9', '11', '12')
] + ['NDVI', 'NDWI', 'Burn_Label']

# Positional rename: the i-th column receives the i-th name, so the list
# length must equal the number of columns in df
df.columns = new_col_names
display(df)

Unnamed: 0,Band_1,Band_2,Band_3,Band_4,Band_5,Band_6,Band_7,Band_8,Band_8A,Band_9,Band_11,Band_12,NDVI,NDWI,Burn_Label
0,835.0,1664.0,1722.0,1979.0,2090.0,2131.0,2249.0,2310.0,2341.0,2361.0,3256.0,3100.0,0.077174,-0.145833,1.0
1,835.0,1668.0,1744.0,1972.0,2090.0,2131.0,2249.0,2319.0,2341.0,2361.0,3256.0,3100.0,0.080867,-0.141521,1.0
2,832.0,1670.0,1726.0,1942.0,2104.0,2147.0,2245.0,2284.0,2343.0,2348.0,3273.0,3093.0,0.080928,-0.139152,1.0
3,832.0,1641.0,1723.0,1952.0,2104.0,2147.0,2245.0,2257.0,2343.0,2348.0,3273.0,3093.0,0.072464,-0.134171,1.0
4,832.0,1647.0,1747.0,1972.0,2074.0,2118.0,2228.0,2251.0,2328.0,2348.0,3280.0,3152.0,0.066067,-0.126063,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41746427,1085.0,1134.0,1266.0,1198.0,1543.0,2526.0,3090.0,3751.0,3487.0,3547.0,2400.0,1568.0,0.515862,-0.495316,0.0
41746428,1086.0,1147.0,1444.0,1298.0,1896.0,3828.0,4213.0,4459.0,4742.0,4033.0,2887.0,1841.0,0.549071,-0.510757,0.0
41746429,1086.0,1221.0,1661.0,1351.0,1896.0,3828.0,4213.0,5526.0,4742.0,4033.0,2887.0,1841.0,0.607096,-0.537777,0.0
41746430,1093.0,1220.0,1552.0,1369.0,1978.0,3880.0,4664.0,5294.0,5052.0,4033.0,3059.0,1919.0,0.589074,-0.546597,0.0


## Check Burn Class

In [4]:
# Count the rows of each class, then replace the numeric codes with names
burn_counts = (
    df['Burn_Label']
    .value_counts()
    .rename(index={0: 'Unburn', 1: 'Burn'})
)

# Show the labelled counts
print(burn_counts)

Burn_Label
Unburn    38294669
Burn       3451763
Name: count, dtype: int64


### Downsampling

In [5]:
# Split by class directly from df, so this cell does not depend on the
# burn_counts variable of another cell (which is reassigned further down).
burn_rows = df[df['Burn_Label'] == 1]
unburn_rows = df[df['Burn_Label'] == 0]

# Rows kept per class = size of the smaller class. With burned pixels in the
# minority this equals the burned-row count, exactly as before.
burn_count = min(len(burn_rows), len(unburn_rows))

# Seeded sampling keeps the selection reproducible for a given row order
unburn_sample = unburn_rows.sample(n=burn_count, random_state=42)

# Only reached when burned pixels are the majority; previously that case
# raised in .sample() because more rows were requested than available.
if len(burn_rows) > burn_count:
    burn_rows = burn_rows.sample(n=burn_count, random_state=42)

downsampled_df = pd.concat([burn_rows, unburn_sample])

# Check Burn Records
burn_counts = downsampled_df['Burn_Label'].value_counts().rename(index={1: 'Burn', 0: 'Unburn'})

# Display the counts with labels
print(burn_counts)

Burn_Label
Burn      3451763
Unburn    3451763
Name: count, dtype: int64


## Pre-Processing

### Remove infinite values

In [6]:
# Turn +/-inf into NaN first, so that a single dropna() removes every row
# containing either an infinite or a missing value
downsampled_df = (
    downsampled_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Show the cleaned frame
display(downsampled_df)

Unnamed: 0,Band_1,Band_2,Band_3,Band_4,Band_5,Band_6,Band_7,Band_8,Band_8A,Band_9,Band_11,Band_12,NDVI,NDWI,Burn_Label
0,835.0,1664.0,1722.0,1979.0,2090.0,2131.0,2249.0,2310.0,2341.0,2361.0,3256.0,3100.0,0.077174,-0.145833,1.0
1,835.0,1668.0,1744.0,1972.0,2090.0,2131.0,2249.0,2319.0,2341.0,2361.0,3256.0,3100.0,0.080867,-0.141521,1.0
2,832.0,1670.0,1726.0,1942.0,2104.0,2147.0,2245.0,2284.0,2343.0,2348.0,3273.0,3093.0,0.080928,-0.139152,1.0
3,832.0,1641.0,1723.0,1952.0,2104.0,2147.0,2245.0,2257.0,2343.0,2348.0,3273.0,3093.0,0.072464,-0.134171,1.0
4,832.0,1647.0,1747.0,1972.0,2074.0,2118.0,2228.0,2251.0,2328.0,2348.0,3280.0,3152.0,0.066067,-0.126063,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14849422,1490.0,1525.0,1590.0,1638.0,1822.0,2243.0,2439.0,2694.0,2654.0,3051.0,3142.0,2470.0,0.243767,-0.257703,0.0
13273428,1504.0,1600.0,1728.0,1928.0,2161.0,2343.0,2537.0,2662.0,2750.0,2663.0,3329.0,2732.0,0.159913,-0.212756,0.0
37950561,1512.0,1463.0,1612.0,1429.0,1834.0,3284.0,3810.0,4095.0,4152.0,4052.0,2401.0,1641.0,0.482621,-0.435080,0.0
6646490,1671.0,1676.0,1858.0,1592.0,2223.0,4430.0,5505.0,5008.0,5802.0,4832.0,3608.0,2234.0,0.517576,-0.458782,0.0


### Separate Burn_Label from DataFrame

In [7]:
# Pull the target out as a one-column DataFrame of 32-bit integers
burn_label = downsampled_df[['Burn_Label']].astype('int32')

# What remains in downsampled_df are the feature columns only.
# NOTE: running this cell a second time fails, because Burn_Label is gone.
downsampled_df = downsampled_df.drop(columns='Burn_Label')

display(burn_label)

Unnamed: 0,Burn_Label
0,1
1,1
2,1
3,1
4,1
...,...
14849422,0
13273428,0
37950561,0
6646490,0


### Normalize Data with MinMax Scaler

In [8]:
# Remember the feature names; the scaler returns a bare NumPy array
cols_norm = list(downsampled_df.columns)

# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the K-fold split
# made during training, so validation rows influence the min/max values.
scaler = MinMaxScaler()
df_norm = pd.DataFrame(scaler.fit_transform(downsampled_df), columns=cols_norm)

# Check df_norm shape after normalization
print("Shape of df_norm after normalization:", df_norm.shape)

# Re-attach the label column. Both indexes are reset so that the rows are
# matched by position rather than by the original (non-contiguous) index.
df_norm = pd.concat(
    [df_norm.reset_index(drop=True), burn_label.reset_index(drop=True)],
    axis=1,
    sort=False,
)
display(df_norm)

Shape of df_norm after normalization: (6903526, 14)


Unnamed: 0,Band_1,Band_2,Band_3,Band_4,Band_5,Band_6,Band_7,Band_8,Band_8A,Band_9,Band_11,Band_12,NDVI,NDWI,Burn_Label
0,0.162732,0.164735,0.092127,0.104737,0.187248,0.152113,0.146604,0.118611,0.144975,0.186958,0.191728,0.140177,0.270132,0.597490,1
1,0.162732,0.165495,0.094712,0.104065,0.187248,0.152113,0.146604,0.119458,0.144975,0.186958,0.191728,0.140177,0.274157,0.602261,1
2,0.162146,0.165875,0.092597,0.101182,0.189700,0.154138,0.146165,0.116163,0.145163,0.185078,0.193218,0.139719,0.274223,0.604882,1
3,0.162146,0.160365,0.092244,0.102143,0.189700,0.154138,0.146165,0.113621,0.145163,0.185078,0.193218,0.139719,0.264998,0.610393,1
4,0.162146,0.161505,0.095065,0.104065,0.184446,0.150468,0.144299,0.113057,0.143755,0.185078,0.193831,0.143576,0.258025,0.619364,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6903521,0.290537,0.138324,0.076616,0.071971,0.140305,0.166287,0.167453,0.154759,0.174345,0.286726,0.181739,0.098987,0.451710,0.473715,0
6903522,0.293268,0.152575,0.092832,0.099837,0.199685,0.178942,0.178207,0.151746,0.183354,0.230625,0.198125,0.116116,0.360313,0.523445,0
6903523,0.294829,0.126544,0.079201,0.051888,0.142407,0.298026,0.317897,0.286642,0.314910,0.431463,0.116807,0.044786,0.712050,0.277462,0
6903524,0.325854,0.167015,0.108108,0.067551,0.210545,0.443052,0.503896,0.372588,0.469738,0.544245,0.222573,0.083557,0.750149,0.251236,0


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from typing import Tuple, Dict, Any
import logging
import os

# Configure logging: records of level INFO and above are emitted, each
# prefixed with a timestamp and the level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class GPUConfig:
    """Helpers that decide whether TensorFlow trains on a GPU or on the CPU."""

    @staticmethod
    def _fall_back_to_cpu() -> None:
        """Hide all GPUs from TensorFlow so that subsequent ops run on the CPU.

        Setting CUDA_VISIBLE_DEVICES alone is not enough here: by the time
        this runs TensorFlow has normally already enumerated its devices, and
        the variable is only read when the CUDA runtime starts. The variable
        is still set (as before), and the devices are additionally hidden
        through the TensorFlow API.
        """
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        try:
            tf.config.set_visible_devices([], 'GPU')
        except Exception as e:
            # Raised, for example, once the TensorFlow runtime is initialized;
            # in that case the GPU cannot be hidden any more.
            logging.warning(
                f"Could not hide GPU devices; TensorFlow may still use them: {str(e)}"
            )

    @staticmethod
    def setup_gpu() -> bool:
        """
        Configure GPU settings for optimal performance.

        Memory growth is enabled on every GPU. If that raises RuntimeError, a
        fixed 4 GB memory limit on the first GPU is tried instead. When no GPU
        is found or both attempts fail, the GPUs are hidden and the CPU is used.

        Returns True if GPU is available and configured, False otherwise.
        """
        try:
            # Check if GPU is available
            if tf.test.is_built_with_cuda():
                # List physical devices
                physical_devices = tf.config.list_physical_devices('GPU')
                if physical_devices:
                    # Try to configure memory growth
                    try:
                        for device in physical_devices:
                            tf.config.experimental.set_memory_growth(device, True)
                        logging.info(f"GPU configured successfully: {physical_devices[0].name}")
                        return True
                    except RuntimeError as e:
                        logging.warning(f"Memory growth configuration failed: {str(e)}")
                        # Try setting memory limit instead
                        try:
                            tf.config.set_logical_device_configuration(
                                physical_devices[0],
                                [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * 4)]  # 4GB limit
                            )
                            logging.info("GPU configured with memory limit")
                            return True
                        except RuntimeError as e:
                            logging.warning(f"Memory limit configuration failed: {str(e)}")

            # If we get here, either no GPU or configuration failed
            logging.warning("No GPU available or configuration failed. Using CPU.")
            GPUConfig._fall_back_to_cpu()
            return False

        except Exception as e:
            logging.error(f"GPU configuration error: {str(e)}")
            GPUConfig._fall_back_to_cpu()
            return False

class FCNNModel:
    """Fully connected binary classifier together with its training callbacks.

    Attributes are set in ``__init__``:
        input_dim: number of input features.
        batch_size: batch size callers are expected to pass to ``fit``.
        model: the compiled Keras ``Sequential`` model.
        callbacks: list of Keras callbacks (early stopping, LR reduction).
    """

    def __init__(self, input_dim: int, batch_size: int = 32):
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.model = self._create_model()
        self.callbacks = self._create_callbacks()

    def _create_model(self) -> Sequential:
        """Create and compile the FCNN model.

        Side effect: sets the global Keras mixed-precision policy to
        'mixed_float16' when a GPU is visible to TensorFlow, else 'float32'.
        """
        # Decide on *visible* GPUs: a GPU that was hidden from TensorFlow is
        # still listed as a physical device, and float16 on the CPU only
        # slows training down.
        policy = 'mixed_float16' if tf.config.get_visible_devices('GPU') else 'float32'
        tf.keras.mixed_precision.set_global_policy(policy)

        model = Sequential([
            Dense(128, activation='relu', input_dim=self.input_dim),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.1),
            # Keep the output layer in float32 so the sigmoid probabilities and
            # the loss are numerically stable under the mixed_float16 policy
            Dense(1, activation='sigmoid', dtype='float32')
        ])

        # Create optimizer with appropriate settings
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        if policy == 'mixed_float16':
            # Loss scaling guards float16 gradients against underflow
            optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy', tf.keras.metrics.AUC()]
        )
        return model

    def _create_callbacks(self) -> list:
        """Create the training callbacks, both monitoring ``val_loss``.

        Returns:
            ``[EarlyStopping, ReduceLROnPlateau]``: training stops after 8
            epochs without an improvement of at least 0.001 (best weights are
            restored); the learning rate is halved after 4 such epochs, down
            to a minimum of 1e-5.
        """
        return [
            EarlyStopping(
                monitor='val_loss',
                patience=8,
                restore_best_weights=True,
                min_delta=0.001
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=4,
                min_lr=0.00001,
                min_delta=0.001
            )
        ]

class ModelTrainer:
    """K-fold cross-validation driver for ``FCNNModel``.

    Args:
        X: feature matrix, indexed by row with integer arrays.
        y: label vector with one entry per row of ``X``.
        n_splits: number of shuffled folds (seeded with 42).
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, n_splits: int = 5):
        self.X = X
        self.y = y
        self.n_splits = n_splits
        self.kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    @staticmethod
    def _predict_labels(model: Sequential, X: np.ndarray, batch_size: int = 32,
                        verbose: int = 1) -> np.ndarray:
        """Return hard 0/1 predictions (threshold 0.5) as a flat integer array."""
        return (model.predict(X, batch_size=batch_size, verbose=verbose) > 0.5).astype(int).ravel()

    def train_and_evaluate(self) -> Tuple[Sequential, float, float, Dict[str, Any]]:
        """Perform cross-validation training and evaluation with error handling.

        A fresh model is built for every fold. Reusing one model across folds
        would let it train on rows that later serve as validation data, which
        inflates the cross-validation scores.

        Returns:
            ``(model, mean_cv, std_cv, final_metrics)`` where ``model`` is the
            fold model with the highest validation accuracy, ``mean_cv`` and
            ``std_cv`` summarise the per-fold validation accuracies, and
            ``final_metrics`` holds the classification report and confusion
            matrix computed from the out-of-fold predictions.

        Raises:
            RuntimeError: if every fold failed.
        """
        scores = []
        best_model = None
        best_score = -np.inf
        # Out-of-fold predictions; -1 marks rows whose fold did not complete
        oof_pred = np.full(len(self.y), -1, dtype=int)

        for fold, (train_idx, val_idx) in enumerate(self.kf.split(self.X), 1):
            try:
                X_train, X_val = self.X[train_idx], self.X[val_idx]
                y_train, y_val = self.y[train_idx], self.y[val_idx]

                logging.info(f"Training fold {fold}/{self.n_splits}")

                # New, untrained model (and fresh callbacks) for this fold
                fcnn = FCNNModel(input_dim=self.X.shape[1])

                fcnn.model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=50,
                    batch_size=fcnn.batch_size,
                    callbacks=fcnn.callbacks,
                    verbose=1
                )

                # Evaluate the model on rows it has never been trained on
                score = fcnn.model.evaluate(X_val, y_val, verbose=0)
                fold_pred = self._predict_labels(
                    fcnn.model, X_val, batch_size=fcnn.batch_size, verbose=0
                )

                # Record results only once the whole fold has succeeded
                oof_pred[val_idx] = fold_pred
                scores.append(score[1])  # Accuracy
                if score[1] > best_score:
                    best_score = score[1]
                    best_model = fcnn.model

                logging.info(f"Fold {fold} accuracy: {score[1]:.4f}")

            except Exception as e:
                logging.error(f"Error in fold {fold}: {str(e)}")
                continue

        if not scores:
            raise RuntimeError("All folds failed to train")

        mean_cv = np.mean(scores)
        std_cv = np.std(scores)

        # Final evaluation on the out-of-fold predictions
        final_metrics = self._final_evaluation(best_model, y_pred=oof_pred)

        return best_model, mean_cv, std_cv, final_metrics

    def _final_evaluation(self, model: Sequential, y_pred: np.ndarray = None) -> Dict[str, Any]:
        """Perform final evaluation with error handling.

        Args:
            model: model used to predict on ``self.X`` when ``y_pred`` is None.
            y_pred: optional precomputed labels aligned with ``self.y``;
                entries equal to -1 are treated as missing and excluded.

        Returns:
            Dict with ``'classification_report'`` (dict form) and
            ``'confusion_matrix'``; both are None if the evaluation raised.
        """
        try:
            if y_pred is None:
                y_true = self.y
                y_pred = self._predict_labels(model, self.X, batch_size=32)
            else:
                covered = y_pred >= 0
                y_true, y_pred = self.y[covered], y_pred[covered]
            report = classification_report(y_true, y_pred, output_dict=True)
            cm = confusion_matrix(y_true, y_pred)
            return {
                'classification_report': report,
                'confusion_matrix': cm
            }
        except Exception as e:
            logging.error(f"Error in final evaluation: {str(e)}")
            return {
                'classification_report': None,
                'confusion_matrix': None
            }

def main(df_norm: pd.DataFrame, model_path: str = 'fcnn_model.h5') -> None:
    """Main execution function with enhanced error handling.

    Args:
        df_norm: table of feature columns plus a ``Burn_Label`` column.
        model_path: file the trained model is saved to.

    Raises:
        Exception: any error from setup or training is logged and re-raised;
            a failure while saving the model is only logged.
    """
    try:
        # Setup GPU and get status
        gpu_available = GPUConfig.setup_gpu()

        # Prepare data
        X = df_norm.drop(columns=['Burn_Label']).values
        y = df_norm['Burn_Label'].values

        # Initialize trainer
        trainer = ModelTrainer(X, y)

        # Train and evaluate model
        model, mean_cv, std_cv, metrics = trainer.train_and_evaluate()

        # Log results
        logging.info(f"Training completed {'with GPU' if gpu_available else 'on CPU'}")
        logging.info(f"Cross-validation accuracy: {mean_cv:.4f} (±{std_cv:.4f})")

        # Report the final evaluation as well; it was previously computed
        # and then discarded. Either entry is None if the evaluation failed.
        report = metrics.get('classification_report')
        if report is not None:
            for label, stats in report.items():
                if isinstance(stats, dict):
                    details = ", ".join(f"{key}={value:.4f}" for key, value in stats.items())
                    logging.info(f"Final evaluation [{label}]: {details}")
                else:
                    logging.info(f"Final evaluation [{label}]: {stats:.4f}")
        cm = metrics.get('confusion_matrix')
        if cm is not None:
            logging.info(f"Confusion matrix:\n{cm}")

        # Save the model with error handling
        try:
            model.save(model_path)
            logging.info("Model saved successfully")
        except Exception as e:
            logging.error(f"Error saving model: {str(e)}")

    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")
        raise

# Run the training pipeline when this code is executed as the main module
if __name__ == "__main__":
    # df_norm must already exist in the namespace: a DataFrame holding the
    # feature columns plus the 'Burn_Label' column that main() reads
    main(df_norm)

2024-11-20 11:11:04,859 - INFO - GPU configured successfully: /physical_device:GPU:0


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3050 Laptop GPU, compute capability 8.6


2024-11-20 11:11:04,962 - INFO - Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3050 Laptop GPU, compute capability 8.6
2024-11-20 11:11:06,576 - INFO - Training fold 1/5


Epoch 1/50

KeyboardInterrupt: 