In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from PIL import Image
import shutil

2025-05-23 19:45:14.547633: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748029514.754031      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748029514.817386      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# GPU / runtime configuration (carried over from soil3.ipynb).
# This cell is safe to re-run on a live kernel: every step below is either
# idempotent or guarded.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Let TensorFlow grow GPU memory on demand instead of reserving it all.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPU runtime has been
        # initialized. Re-running this cell after a model was built ends up
        # here; keep the current setting instead of aborting the cell.
        print(f"Could not change GPU memory growth (runtime already initialized): {err}")

    # Enable XLA compilation for speed
    tf.config.optimizer.set_jit(True)

    # Enable mixed precision from soil3.ipynb
    from tensorflow.keras import mixed_precision
    mixed_precision.set_global_policy('mixed_float16')
    print(f"GPU acceleration enabled: {len(physical_devices)} GPU(s) found")
    print("Mixed precision enabled")
else:
    print("No GPU found, using CPU")

# Set seeds for reproducibility (NumPy global RNG and TensorFlow global seed)
np.random.seed(42)
tf.random.set_seed(42)

# Optimized configuration based on soil3.ipynb success
IMG_SIZE = 224  # Same as soil3.ipynb; images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 64  # Same as soil3.ipynb (proven optimal)
EPOCHS = 12  # Same as soil3.ipynb; upper bound, early stopping may end sooner
NUM_CLASSES = 4  # Width of the softmax output layer
LEARNING_RATE = 0.001  # Same as soil3.ipynb; initial Adam learning rate

# Paths: raw competition data (read-only input) ...
TRAIN_DIR = '/kaggle/input/soilcl/soil_classification-2025/train'
TEST_DIR = '/kaggle/input/soilcl/soil_classification-2025/test'
TRAIN_CSV = '/kaggle/input/soilcl/soil_classification-2025/train_labels.csv'
TEST_CSV = '/kaggle/input/soilcl/soil_classification-2025/test_ids.csv'
# ... and the writable directories that receive the JPEG-converted copies.
PROCESSED_TRAIN_DIR = '/kaggle/working/train'
PROCESSED_TEST_DIR = '/kaggle/working/test'

os.makedirs(PROCESSED_TRAIN_DIR, exist_ok=True)
os.makedirs(PROCESSED_TEST_DIR, exist_ok=True)

def convert_to_jpg(source_dir, target_dir, file_mapping=None):
    """Copy or convert every regular file of ``source_dir`` into ``target_dir``.

    Files whose extension is ``.jpg``/``.jpeg`` (case-insensitive) are copied
    unchanged under their original name. Every other file is opened with PIL,
    converted to RGB and saved as ``<stem>.jpg`` (JPEG, quality 95). If that
    conversion fails, the file is copied unchanged as a best-effort fallback.
    Directory entries that are not regular files are skipped.

    Args:
        source_dir: Directory to read files from (not traversed recursively).
        target_dir: Existing directory that receives the copies/conversions.
        file_mapping: Optional dict to update in place; a new dict is created
            when omitted.

    Returns:
        dict: ``{original file name: file name written to target_dir}`` for
        every file that was processed. Files that could neither be converted
        nor copied are reported on stdout and left out of the mapping.
    """
    if file_mapping is None:
        file_mapping = {}

    for filename in os.listdir(source_dir):
        source_path = os.path.join(source_dir, filename)

        if not os.path.isfile(source_path):
            continue

        stem, file_ext = os.path.splitext(filename)

        # Already JPEG: a plain copy is enough.
        if file_ext.lower() in ('.jpg', '.jpeg'):
            shutil.copy2(source_path, os.path.join(target_dir, filename))
            file_mapping[filename] = filename
            continue

        new_filename = stem + '.jpg'
        try:
            with Image.open(source_path) as img:
                # JPEG has no alpha/palette support, so normalise to RGB first.
                img.convert('RGB').save(
                    os.path.join(target_dir, new_filename), 'JPEG', quality=95)
            file_mapping[filename] = new_filename
        except Exception as e:
            print(f"Error converting {filename}: {e}")
            # Best-effort fallback: keep the file under its original name.
            try:
                shutil.copy2(source_path, os.path.join(target_dir, filename))
                file_mapping[filename] = filename
            except Exception as copy_error:
                # Only ordinary errors are absorbed here; KeyboardInterrupt and
                # SystemExit must still be able to stop a long conversion run.
                print(f"Could not process {filename}: {copy_error}")

    return file_mapping

def prepare_data_optimized():
    """Build the train/validation/test generators and the class weights.

    Steps, in order:
      1. Convert/copy the raw train and test images into the processed
         directories with ``convert_to_jpg``.
      2. Read the label and test-id CSVs and add a ``processed_image_id``
         column holding the file name each image has after conversion.
      3. Split the labelled rows 80/20, stratified on ``soil_type``.
      4. Create an augmenting generator for training (every training row is
         repeated three times) and rescale-only generators for validation
         and test, both unshuffled.
      5. Compute per-class loss weights from the (un-repeated) training split.

    Returns:
        tuple: ``(train_generator, valid_generator, test_generator,
        train_data, val_data, test_df, class_weights)`` where ``class_weights``
        maps class index to weight.
    """
    print("Converting images to JPG format...")
    train_file_mapping = convert_to_jpg(TRAIN_DIR, PROCESSED_TRAIN_DIR)
    test_file_mapping = convert_to_jpg(TEST_DIR, PROCESSED_TEST_DIR)

    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)

    # Ids missing from the mapping keep their original name.
    train_df['processed_image_id'] = train_df['image_id'].map(
        lambda x: train_file_mapping.get(x, x))
    test_df['processed_image_id'] = test_df['image_id'].map(
        lambda x: test_file_mapping.get(x, x))

    # Same validation split as soil3.ipynb
    train_data, val_data = train_test_split(
        train_df, test_size=0.2, random_state=42, stratify=train_df['soil_type'])

    print("Training class distribution:")
    print(train_data['soil_type'].value_counts())

    # Enhanced augmentation based on research + soil3.ipynb
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,  # Same as soil3.ipynb
        width_shift_range=0.3,  # Same as soil3.ipynb
        height_shift_range=0.3,  # Same as soil3.ipynb
        shear_range=0.2,
        zoom_range=0.3,
        horizontal_flip=True,
        vertical_flip=True,
        brightness_range=[0.7, 1.3],
        fill_mode='nearest',
        # NOTE(review): the channel shift appears to be applied to the raw
        # pixel values before `rescale`, so 0.1 on a 0-255 scale is probably
        # a no-op. Confirm the intended magnitude before changing it.
        channel_shift_range=0.1
    )

    valid_datagen = ImageDataGenerator(rescale=1./255)
    test_datagen = ImageDataGenerator(rescale=1./255)

    # Enhanced dataset repetition for better Clay soil performance:
    # every training row appears three times per epoch (increased from 2).
    repeated_train_data = train_data.loc[np.repeat(train_data.index.values, 3)]

    # Arguments shared by all three generators.
    common_flow_args = dict(
        x_col='processed_image_id',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
    )

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=repeated_train_data,
        directory=PROCESSED_TRAIN_DIR,
        y_col='soil_type',
        class_mode='categorical',
        **common_flow_args
    )

    # Validation and test must stay unshuffled so that predictions line up
    # with `valid_generator.classes` and with the rows of `test_df`.
    valid_generator = valid_datagen.flow_from_dataframe(
        dataframe=val_data,
        directory=PROCESSED_TRAIN_DIR,
        y_col='soil_type',
        class_mode='categorical',
        shuffle=False,
        **common_flow_args
    )

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=PROCESSED_TEST_DIR,
        y_col=None,
        class_mode=None,
        shuffle=False,
        **common_flow_args
    )

    # Enhanced class weights targeting Clay soil improvement
    class_weights = _compute_class_weights(train_data, train_generator.class_indices)

    return train_generator, valid_generator, test_generator, train_data, val_data, test_df, class_weights


# Extra multipliers applied on top of the balanced weights (from research):
# triple boost for Clay soil, enhanced boost for Black Soil.
_CLASS_WEIGHT_BOOSTS = {'Clay soil': 3.0, 'Black Soil': 1.8}


def _compute_class_weights(train_data, class_indices):
    """Return ``{class index: weight}`` for Keras' ``class_weight`` argument.

    The base weight is ``total_samples / (n_classes * class_count)``, then
    multiplied by the class' entry in ``_CLASS_WEIGHT_BOOSTS`` when present.
    The key is taken from ``class_indices`` itself rather than from the
    iteration position, so it is always the index the generator uses for
    that class. Classes without any training sample get no entry.
    """
    soil_counts = train_data['soil_type'].value_counts()
    total_samples = len(train_data)

    class_weights = {}
    for soil_type, class_index in class_indices.items():
        count = soil_counts.get(soil_type, 0)
        if count <= 0:
            continue
        weight = (1 / count) * (total_samples / len(soil_counts))
        class_weights[class_index] = weight * _CLASS_WEIGHT_BOOSTS.get(soil_type, 1.0)

    return class_weights

def create_enhanced_densenet():
    """Build and compile the DenseNet121-based soil classifier.

    An ImageNet-pretrained DenseNet121 (without its top) is used as the
    backbone; all but its last 30 layers are frozen. On top of it sits a
    global-average-pooling layer followed by a 512 -> 256 -> 128 dense head
    with batch normalisation and dropout, and a ``NUM_CLASSES``-way softmax.

    Returns:
        A compiled Keras ``Model`` (Adam with ``LEARNING_RATE``, categorical
        cross-entropy, accuracy and macro-F1 metrics).
    """
    base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

    # Same layer unfreezing as soil3.ipynb: only the last 30 backbone layers train.
    for layer in base_model.layers[:-30]:
        layer.trainable = False

    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = BatchNormalization()(x)

    # Enhanced architecture for better Clay soil classification
    x = Dense(512, activation='relu')(x)  # Increased from 256
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)

    # Keep the softmax output in float32. When the global policy is
    # 'mixed_float16' the layer would otherwise compute probabilities in
    # float16, which is numerically fragile for the loss and the metrics.
    # Under a float32 policy this argument changes nothing.
    predictions = Dense(NUM_CLASSES, activation='softmax', dtype='float32')(x)

    model = Model(inputs=base_model.input, outputs=predictions)

    # Same optimizer as soil3.ipynb
    optimizer = Adam(learning_rate=LEARNING_RATE)

    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy', tf.keras.metrics.F1Score(average='macro')]
    )

    return model

def evaluate_enhanced_model(model, valid_generator, class_indices):
    """Report macro and per-class F1 on the validation generator.

    Args:
        model: Trained model exposing ``predict``.
        valid_generator: Unshuffled validation generator; its ``classes``
            attribute is used as ground truth, so prediction order must match
            row order.
        class_indices: Mapping ``{class name: class index}``.

    Returns:
        The macro-averaged F1 score on the validation data.
    """
    valid_generator.reset()
    # Take the step count from the generator itself so it stays correct even
    # if the generator was built with a batch size other than BATCH_SIZE.
    y_pred_probs = model.predict(valid_generator, steps=len(valid_generator))
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = valid_generator.classes

    f1_macro = f1_score(y_true, y_pred, average='macro')

    # Ask for the per-class scores with explicit labels: without `labels`,
    # a class that occurs in neither y_true nor y_pred is dropped from the
    # result and every following score would be printed under the wrong name.
    class_labels = sorted(class_indices.values())
    f1_individual = f1_score(
        y_true, y_pred, labels=class_labels, average=None, zero_division=0)

    idx_to_class = {v: k for k, v in class_indices.items()}
    class_f1_scores = {
        idx_to_class[label]: score
        for label, score in zip(class_labels, f1_individual)
    }

    print(f"Macro F1 Score: {f1_macro:.4f}")
    print("Individual F1 Scores:")
    for name, score in class_f1_scores.items():
        print(f"  {name}: {score:.4f}")

    if f1_macro >= 0.95:
        print("🎯 TARGET ACHIEVED: F1 Score ≥ 0.95!")
    else:
        print(f"📈 Progress: {f1_macro:.4f}/0.95 ({(f1_macro/0.95)*100:.1f}%)")

    return f1_macro

def main_enhanced():
    """Run the whole pipeline: data prep, training, evaluation, submission.

    Trains the DenseNet121 classifier with early stopping and learning-rate
    reduction (both monitoring ``val_loss``), prints the validation F1
    scores, predicts the test set and writes
    ``enhanced_f1_95_submission.csv`` (columns ``image_id``, ``soil_type``)
    to the current working directory.

    Raises:
        ValueError: If the number of test predictions differs from the number
            of rows in the test id table.
    """
    print("Starting enhanced soil classification for F1 ≥ 0.95...")

    # Prepare data (the train/validation frames are not needed here)
    (train_generator, valid_generator, test_generator,
     _train_data, _val_data, test_df, class_weights) = prepare_data_optimized()

    class_indices = train_generator.class_indices
    print(f"Class indices: {class_indices}")
    print(f"Enhanced class weights: {class_weights}")

    # Create enhanced model
    print("Creating enhanced DenseNet121 model...")
    model = create_enhanced_densenet()
    print(f"Model parameters: {model.count_params():,}")

    # Same callbacks as soil3.ipynb
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.00001)
    ]

    # Train model
    print("Training enhanced DenseNet121...")
    model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        validation_data=valid_generator,
        validation_steps=len(valid_generator),
        epochs=EPOCHS,
        callbacks=callbacks,
        class_weight=class_weights,
        verbose=1
    )

    # Evaluate
    print("Evaluating enhanced model...")
    f1_score_result = evaluate_enhanced_model(model, valid_generator, class_indices)

    # Generate predictions
    print("Generating test predictions...")
    test_generator.reset()
    # Step count comes from the generator so it always matches its batch size.
    test_preds = model.predict(test_generator, steps=len(test_generator))
    test_classes = np.argmax(test_preds, axis=1)

    # Predictions are matched to `test_df` by position. If the generator
    # dropped any file, fail with a clear message rather than a cryptic
    # pandas length error.
    if len(test_classes) != len(test_df):
        raise ValueError(
            f"Got {len(test_classes)} test predictions for {len(test_df)} test ids; "
            "some test images were probably not found by the generator")

    # Create submission
    idx_to_class = {v: k for k, v in class_indices.items()}
    test_class_names = [idx_to_class[idx] for idx in test_classes]

    submission_df = pd.DataFrame({
        'image_id': test_df['image_id'],
        'soil_type': test_class_names
    })

    submission_df.to_csv('enhanced_f1_95_submission.csv', index=False)

    if f1_score_result >= 0.95:
        print(f"🎯 SUCCESS! F1 Score: {f1_score_result:.4f} ≥ 0.95")
    else:
        print(f"📊 Result: F1 Score: {f1_score_result:.4f} (Target: 0.95)")

    print("Enhanced submission saved as: enhanced_f1_95_submission.csv")

GPU acceleration enabled: 1 GPU(s) found
Mixed precision enabled


In [3]:
# Entry point: run the pipeline when this code is executed directly.
# In a notebook kernel `__name__` is "__main__", so running this cell always
# starts main_enhanced(); the guard only matters if the code is imported.
if __name__ == "__main__":
    main_enhanced()

Starting enhanced soil classification for F1 ≥ 0.95...
Converting images to JPG format...
Training class distribution:
soil_type
Alluvial soil    422
Red soil         211
Black Soil       185
Clay soil        159
Name: count, dtype: int64
Found 2931 validated image filenames belonging to 4 classes.
Found 245 validated image filenames belonging to 4 classes.
Found 341 validated image filenames.
Class indices: {'Alluvial soil': 0, 'Black Soil': 1, 'Clay soil': 2, 'Red soil': 3}
Enhanced class weights: {0: 0.5787914691943128, 1: 2.376486486486487, 2: 4.6084905660377355, 3: 1.1575829383886256}
Creating enhanced DenseNet121 model...


I0000 00:00:1748029538.550801      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Model parameters: 7,733,188
Training enhanced DenseNet121...


  self._warn_if_super_not_called()


Epoch 1/12


I0000 00:00:1748029557.711677      92 service.cc:148] XLA service 0x79cca8002c10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748029557.712188      92 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1748029557.738563      92 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1748029557.827211      92 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 2s/step - accuracy: 0.3892 - f1_score: 0.3829 - loss: 2.4751 - val_accuracy: 0.8612 - val_f1_score: 0.8577 - val_loss: 0.5208 - learning_rate: 0.0010
Epoch 2/12
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1s/step - accuracy: 0.8184 - f1_score: 0.8195 - loss: 0.6629 - val_accuracy: 0.9020 - val_f1_score: 0.9024 - val_loss: 0.2538 - learning_rate: 0.0010
Epoch 3/12
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1s/step - accuracy: 0.8471 - f1_score: 0.8463 - loss: 0.4609 - val_accuracy: 0.9184 - val_f1_score: 0.9200 - val_loss: 0.2640 - learning_rate: 0.0010
Epoch 4/12
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1s/step - accuracy: 0.9132 - f1_score: 0.9121 - loss: 0.3054 - val_accuracy: 0.9306 - val_f1_score: 0.9300 - val_loss: 0.3301 - learning_rate: 0.0010
Epoch 5/12
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1s/step - accuracy: 0.9182 - f

  self._warn_if_super_not_called()


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4s/step 
🎯 SUCCESS! F1 Score: 0.9780 ≥ 0.95
Enhanced submission saved as: enhanced_f1_95_submission.csv
