# Soil Classification Model Training

## 1. Environment Setup
This section sets up the necessary libraries and configures GPU acceleration.

In [None]:
# This is the notebook used for training the model.

# --- 1. Imports and Environment Setup ---
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from PIL import Image
import shutil

In [None]:
# --- GPU and mixed precision setup for faster training and efficiency ---
# Detect GPUs once; everything below is skipped on CPU-only machines, so the
# global Keras dtype policy is only changed when a GPU is present.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    # Enable dynamic memory allocation for GPUs (memory growth on every device)
    # NOTE(review): TensorFlow documents that memory growth must be set before
    # the GPUs are initialized - keep this cell ahead of any GPU work.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
    # Enable XLA (JIT) compilation for speed
    tf.config.optimizer.set_jit(True)
    # Enable mixed precision for faster computation; the policy is set globally,
    # so it must run before the model is built in create_enhanced_densenet().
    from tensorflow.keras import mixed_precision
    mixed_precision.set_global_policy('mixed_float16')
    print(f"GPU acceleration enabled: {len(physical_devices)} GPU(s) found")
    print("Mixed precision enabled")
else:
    # CPU fallback: no memory-growth, XLA, or mixed-precision changes are made.
    print("No GPU found, using CPU")

In [None]:
# --- 2. Configuration ---
# Set random seeds for reproducibility (NumPy and TensorFlow only)
# NOTE(review): Python's built-in `random` module is not seeded here - confirm
# whether anything in the pipeline depends on it.
np.random.seed(42)
tf.random.set_seed(42)

# Model and data configuration
IMG_SIZE = 224         # Side length in pixels; images are resized to (IMG_SIZE, IMG_SIZE)
BATCH_SIZE = 64        # Images per batch for the train, validation and test generators
EPOCHS = 12            # Maximum number of training epochs (early stopping may end sooner)
NUM_CLASSES = 4        # Number of soil classes = width of the softmax output layer
LEARNING_RATE = 0.001  # Initial Adam learning rate (ReduceLROnPlateau may lower it)

# Paths to data directories and CSVs
# Raw inputs live under /kaggle/input; converted JPG copies are written to
# the processed directories under /kaggle/working.
TRAIN_DIR = '/kaggle/input/soilcl/soil_classification-2025/train'
TEST_DIR = '/kaggle/input/soilcl/soil_classification-2025/test'
TRAIN_CSV = '/kaggle/input/soilcl/soil_classification-2025/train_labels.csv'
TEST_CSV = '/kaggle/input/soilcl/soil_classification-2025/test_ids.csv'
PROCESSED_TRAIN_DIR = '/kaggle/working/train'
PROCESSED_TEST_DIR = '/kaggle/working/test'

# Ensure processed directories exist (no error if they are already there)
os.makedirs(PROCESSED_TRAIN_DIR, exist_ok=True)
os.makedirs(PROCESSED_TEST_DIR, exist_ok=True)



## 2. Data Preparation
We convert all images to JPG and split the data into training and validation sets.

In [None]:
# --- 3. Image Conversion Utility ---
def convert_to_jpg(source_dir, target_dir, file_mapping=None):
    """
    Copy or convert every regular file in source_dir into target_dir as JPG.

    Files whose extension is already .jpg/.jpeg (case-insensitive) are copied
    unchanged. Every other file is opened with PIL, converted to RGB and saved
    as '<stem>.jpg' with quality 95. If that conversion fails, the file is
    copied under its original name as a best-effort fallback; if the copy
    fails too, the file is reported and left out of the mapping.

    Args:
        source_dir: Directory to read from. Sub-directories are skipped.
        target_dir: Directory to write to. Created if it does not exist.
        file_mapping: Optional dict to extend in place; a new dict is
            created when omitted.

    Returns:
        The mapping {original filename: filename written to target_dir}.
    """
    if file_mapping is None:
        file_mapping = {}
    # Make the function usable on its own instead of relying on the caller
    # having created the output directory beforehand.
    os.makedirs(target_dir, exist_ok=True)
    for filename in os.listdir(source_dir):
        source_path = os.path.join(source_dir, filename)
        if not os.path.isfile(source_path):
            continue
        stem, file_ext = os.path.splitext(filename)
        if file_ext.lower() in ('.jpg', '.jpeg'):
            # Already JPG, just copy
            shutil.copy2(source_path, os.path.join(target_dir, filename))
            file_mapping[filename] = filename
            continue
        # Convert to JPG
        new_filename = stem + '.jpg'
        try:
            with Image.open(source_path) as img:
                # JPEG has no alpha/palette support, so normalise to RGB first.
                rgb_img = img.convert('RGB')
                rgb_img.save(os.path.join(target_dir, new_filename), 'JPEG', quality=95)
            file_mapping[filename] = new_filename
        except Exception as e:
            # Deliberately broad: PIL raises many exception types for bad files
            # and one unreadable image should not abort the whole conversion.
            print(f"Error converting {filename}: {e}")
            try:
                shutil.copy2(source_path, os.path.join(target_dir, filename))
                file_mapping[filename] = filename
            except Exception as copy_error:
                # `Exception` (not a bare except) so Ctrl-C / SystemExit still
                # propagate; include the reason so the failure is diagnosable.
                print(f"Could not process {filename}: {copy_error}")
    return file_mapping

In [None]:
# --- 4. Data Preparation and Augmentation ---
def prepare_data_optimized():
    """
    Build the training, validation and test generators plus class weights.

    Steps:
      1. Convert/copy the raw train and test images to the processed dirs.
      2. Read the label and test-id CSVs (columns `image_id`, `soil_type`) and
         add a `processed_image_id` column holding the converted filename.
      3. Make a stratified 80/20 train/validation split on `soil_type`.
      4. Create an augmenting generator for training and rescale-only
         generators for validation and test (both unshuffled).
      5. Compute balanced class weights with an extra boost for
         'Clay soil' (x3.0) and 'Black Soil' (x1.8).

    Returns:
        Tuple (train_generator, valid_generator, test_generator, train_data,
        val_data, test_df, class_weights), where class_weights maps the
        generator's class index to its loss weight.
    """
    print("Converting images to JPG format...")
    train_file_mapping = convert_to_jpg(TRAIN_DIR, PROCESSED_TRAIN_DIR)
    test_file_mapping = convert_to_jpg(TEST_DIR, PROCESSED_TEST_DIR)

    # Read CSVs with image IDs and labels
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)

    # Map original image IDs to processed (JPG) filenames; IDs that were not
    # seen during conversion are kept unchanged.
    train_df['processed_image_id'] = train_df['image_id'].map(lambda x: train_file_mapping.get(x, x))
    test_df['processed_image_id'] = test_df['image_id'].map(lambda x: test_file_mapping.get(x, x))

    # Stratified split into training and validation sets (80/20)
    train_data, val_data = train_test_split(
        train_df, test_size=0.2, random_state=42, stratify=train_df['soil_type'])

    print("Training class distribution:")
    print(train_data['soil_type'].value_counts())

    # Data augmentation for training set
    # NOTE(review): Keras appears to apply channel_shift_range before `rescale`,
    # i.e. on 0-255 pixel values, where a shift of 0.1 would be negligible -
    # confirm whether a larger value was intended.
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.2,
        zoom_range=0.3,
        horizontal_flip=True,
        vertical_flip=True,
        brightness_range=[0.7, 1.3],
        fill_mode='nearest',
        channel_shift_range=0.1
    )
    # Only rescaling for validation and test sets
    valid_datagen = ImageDataGenerator(rescale=1./255)
    test_datagen = ImageDataGenerator(rescale=1./255)

    # Repeat every training row three times so each epoch sees three
    # differently-augmented views of each image. The repetition is uniform, so
    # it does NOT change class proportions; imbalance is handled by the class
    # weights computed below.
    repeated_train_data = train_data.loc[np.repeat(train_data.index.values, 3)]

    # Create data generators
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=repeated_train_data,
        directory=PROCESSED_TRAIN_DIR,
        x_col='processed_image_id',
        y_col='soil_type',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='categorical'
    )
    # shuffle=False keeps predictions aligned with `valid_generator.classes`.
    valid_generator = valid_datagen.flow_from_dataframe(
        dataframe=val_data,
        directory=PROCESSED_TRAIN_DIR,
        x_col='processed_image_id',
        y_col='soil_type',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=False
    )
    # shuffle=False keeps predictions in the same row order as `test_df`.
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=PROCESSED_TEST_DIR,
        x_col='processed_image_id',
        y_col=None,
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode=None,
        shuffle=False
    )

    # Compute class weights to help with imbalance (extra boost for "Clay soil" and "Black Soil")
    minority_boost = {'Clay soil': 3.0, 'Black Soil': 1.8}
    class_weights = {}
    total_samples = len(train_data)
    soil_counts = train_data['soil_type'].value_counts()
    # Use the index stored in class_indices rather than the enumeration
    # position, so the weight is always keyed by the generator's real index.
    for soil_type, class_index in train_generator.class_indices.items():
        count = soil_counts.get(soil_type, 0)
        if count <= 0:
            # No samples -> no weight. Skipping here also avoids the KeyError
            # the boost step would raise on an entry that was never assigned.
            continue
        balanced_weight = (1 / count) * (total_samples / len(soil_counts))
        class_weights[class_index] = balanced_weight * minority_boost.get(soil_type, 1.0)
    return train_generator, valid_generator, test_generator, train_data, val_data, test_df, class_weights


## 3. Model Architecture
We use an ImageNet-pretrained DenseNet121 backbone, fine-tune its last 30 layers, and add a classification head of dense layers with dropout and batch normalization.

In [None]:
# --- 5. Model Architecture ---
def create_enhanced_densenet():
    """
    Build and compile the DenseNet121-based soil classifier.

    An ImageNet-pretrained DenseNet121 (without its top) is used as backbone.
    All backbone layers except the last 30 are frozen; the remaining 30 are
    fine-tuned together with a new head: global average pooling, batch
    normalization, and three dense layers (512/256/128, ReLU) with dropout,
    followed by a NUM_CLASSES-way softmax.

    Returns:
        A compiled Keras Model using Adam(LEARNING_RATE), categorical
        cross-entropy, and accuracy + macro F1 as metrics.
    """
    base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    # Freeze all layers except the last 30 for fine-tuning
    for layer in base_model.layers[:-30]:
        layer.trainable = False
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = BatchNormalization()(x)
    # Add dense and dropout layers for better generalization
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    # Output layer for classification. dtype='float32' overrides the global
    # 'mixed_float16' policy for this layer only, so the softmax probabilities
    # (and therefore loss and metrics) are computed in full precision. Under
    # the default float32 policy this is a no-op.
    predictions = Dense(NUM_CLASSES, activation='softmax', dtype='float32')(x)
    model = Model(inputs=base_model.input, outputs=predictions)
    # Use Adam optimizer and macro F1 as metric
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy', tf.keras.metrics.F1Score(average='macro')]
    )
    return model

In [None]:

# --- 6. Evaluation Function ---
def evaluate_enhanced_model(model, valid_generator, class_indices):
    """
    Evaluate the model on the validation set and print macro and per-class F1.

    Args:
        model: Trained model exposing `predict`.
        valid_generator: Validation iterator. It must be unshuffled, because
            predictions are compared positionally with
            `valid_generator.classes`.
        class_indices: Mapping {class name: class index} from the generator.

    Returns:
        The macro-averaged F1 score on the validation set.
    """
    valid_generator.reset()
    # One full pass over the generator. len() uses the generator's own batch
    # size, so this stays correct even if it differs from the global setting.
    y_pred_probs = model.predict(valid_generator, steps=len(valid_generator))
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = valid_generator.classes
    idx_to_class = {v: k for k, v in class_indices.items()}
    label_ids = sorted(idx_to_class)
    # Calculate macro and individual F1 scores
    f1_macro = f1_score(y_true, y_pred, average='macro')
    # Passing `labels` fixes the order and length of the per-class result, so
    # every score is paired with the right class name even when a class is
    # missing from both y_true and y_pred.
    f1_individual = f1_score(y_true, y_pred, labels=label_ids, average=None)
    class_f1_scores = {
        idx_to_class[label_id]: score
        for label_id, score in zip(label_ids, f1_individual)
    }
    print(f"Macro F1 Score: {f1_macro:.4f}")
    print("Individual F1 Scores:")
    for name, score in class_f1_scores.items():
        print(f" {name}: {score:.4f}")
    # Check if target F1 score is achieved
    if f1_macro >= 0.95:
        print("🎯 TARGET ACHIEVED: F1 Score ≥ 0.95!")
    else:
        print(f"📈 Progress: {f1_macro:.4f}/0.95 ({(f1_macro/0.95)*100:.1f}%)")
    return f1_macro


<a id="target-heading"></a>
## 4. Training
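The model is trained with class weighting, early stopping (patience 4, best weights restored), and a reduce-on-plateau learning-rate schedule; both callbacks monitor validation loss.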

In [None]:

# --- 7. Main Training Pipeline ---
def main_enhanced():
    """
    Run the end-to-end workflow.

    Prepares the data generators, builds the DenseNet121 model, trains it with
    class weights plus early-stopping and learning-rate callbacks, evaluates
    macro F1 on the validation split, then predicts the test set and writes
    'enhanced_f1_95_submission.csv' to the working directory.
    """
    print("Starting enhanced soil classification for F1 ≥ 0.95...")

    # Data: only the generators, the test frame and the weights are used here.
    (train_gen, val_gen, test_gen,
     _train_split, _val_split, test_frame, weights_by_class) = prepare_data_optimized()
    label_to_index = train_gen.class_indices
    print(f"Class indices: {label_to_index}")
    print(f"Enhanced class weights: {weights_by_class}")

    # Model
    print("Creating enhanced DenseNet121 model...")
    net = create_enhanced_densenet()
    print(f"Model parameters: {net.count_params():,}")

    # Callbacks: both watch validation loss.
    stop_early = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
    shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.00001)

    # Training
    print("Training enhanced DenseNet121...")
    net.fit(
        train_gen,
        steps_per_epoch=len(train_gen),
        validation_data=val_gen,
        validation_steps=len(val_gen),
        epochs=EPOCHS,
        callbacks=[stop_early, shrink_lr],
        class_weight=weights_by_class,
        verbose=1
    )

    # Validation metrics
    print("Evaluating enhanced model...")
    macro_f1 = evaluate_enhanced_model(net, val_gen, label_to_index)

    # Test-set inference
    print("Generating test predictions...")
    test_gen.reset()
    n_test_batches = int(np.ceil(test_gen.samples/BATCH_SIZE))
    test_probabilities = net.predict(test_gen, steps=n_test_batches)
    predicted_indices = np.argmax(test_probabilities, axis=1)

    # Translate predicted indices back into class names.
    index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))
    predicted_labels = [index_to_label[index] for index in predicted_indices]

    # Submission file: original image ids paired with predicted class names.
    pd.DataFrame({
        'image_id': test_frame['image_id'],
        'soil_type': predicted_labels
    }).to_csv('enhanced_f1_95_submission.csv', index=False)

    summary = (
        f"🎯 SUCCESS! F1 Score: {macro_f1:.4f} ≥ 0.95"
        if macro_f1 >= 0.95
        else f"📊 Result: F1 Score: {macro_f1:.4f} (Target: 0.95)"
    )
    print(summary)
    print("Enhanced submission saved as: enhanced_f1_95_submission.csv")


## 5. Evaluation and Submission
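Running the cell below executes the full pipeline: it trains the model, reports macro and per-class F1 on the validation split, and writes test-set predictions to `enhanced_f1_95_submission.csv`.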

In [None]:
# --- Entry point for script execution ---
# Runs the full pipeline (data preparation, training, evaluation, submission)
# only when this module is executed as __main__, not when it is imported.
if __name__ == "__main__":
    main_enhanced()