In [1]:
# Import essential libraries for data handling, image processing, model building, and evaluation
import os
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB1
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
warnings.filterwarnings('ignore')

2025-05-24 10:48:18.217800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748083698.240371      85 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748083698.247241      85 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Set random seeds for reproducibility.
# A single named constant replaces the repeated literal so the NumPy and
# TensorFlow generators can never drift apart when the seed is changed.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Data path configuration: locations of the competition files under /kaggle/input.
# Directory containing the training images; joined with each `image_id` from the labels CSV.
TRAIN_IMAGE_DIR = "/kaggle/input/soil-classification-part-2/soil_competition-2025/train"           
# CSV with one row per training image (`image_id`, `label` columns).
TRAIN_LABELS_PATH = "/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv"
# Directory containing the test images to be scored for the submission.
TEST_IMAGE_DIR = "/kaggle/input/soil-classification-part-2/soil_competition-2025/test"
# CSV listing the test image ids, in the order the submission must follow.
TEST_IDS_PATH = "/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv"

In [4]:
# Read the label table and show a preview plus the class balance.
# Each print pairs a heading with its table; `sep="\n"` keeps them on separate lines.
df_train_labels = pd.read_csv(TRAIN_LABELS_PATH)
print("Sample rows from train_labels.csv:", df_train_labels.head(), sep="\n")
print(
    f"\nLabel distribution (should be all 1s - soil):",
    df_train_labels['label'].value_counts(),
    sep="\n",
)

Sample rows from train_labels.csv:
           image_id  label
0  img_ed005410.jpg      1
1  img_0c5ecd2a.jpg      1
2  img_ed713bb5.jpg      1
3  img_12c58874.jpg      1
4  img_eff357af.jpg      1

Label distribution (should be all 1s - soil):
label
1    1222
Name: count, dtype: int64


In [5]:
# Accumulators for the image scan: usable image paths and a failure counter.
soil_image_paths, missing_count = [], 0

In [7]:
# Verify existence and validity of soil images by iterating over labels,
# checking if image files exist and can be opened properly,
# and track corrupted, missing, or skipped images.
#
# The accumulators are (re)initialised here so that re-running this cell never
# appends duplicate paths or double-counts failures.
SUPPORTED_MODES = ('RGB', 'L', 'RGBA')
soil_image_paths = []
missing_count = 0
skipped_count = 0  # files that open fine but use a colour mode outside SUPPORTED_MODES

for image_id in df_train_labels['image_id']:
    image_path = os.path.join(TRAIN_IMAGE_DIR, image_id)

    if not os.path.exists(image_path):
        print(f"Warning: {image_path} not found!")
        missing_count += 1
        continue

    try:
        # Only the mode is needed, so read it and let the context manager
        # close the file straight away.
        with Image.open(image_path) as img:
            image_mode = img.mode
    except Exception as e:
        print(f"Corrupted image: {image_path} ({e})")
        missing_count += 1
        continue

    if image_mode in SUPPORTED_MODES:
        soil_image_paths.append(image_path)
    else:
        # Previously these images were dropped silently, so the totals below
        # did not add up to the number of labelled rows.
        print(f"Skipped image with unsupported mode '{image_mode}': {image_path}")
        skipped_count += 1

print(f" Total soil images loaded: {len(soil_image_paths)}")
print(f" Missing/corrupted images: {missing_count}")
print(f" Skipped (unsupported mode): {skipped_count}")


📂 Loading soil images...
✅ Total soil images loaded: 1218
❌ Missing/corrupted images: 0


In [8]:
# Hold out 20% of the verified soil images for validation (fixed random_state).
train_paths, val_paths = train_test_split(soil_image_paths, test_size=0.2, random_state=42)

# One print call, one line per argument.
print(
    f"\n🔄 Dataset split:",
    f"   Training soil images: {len(train_paths)}",
    f"   Validation soil images: {len(val_paths)}",
    sep="\n",
)


🔄 Dataset split:
   Training soil images: 974
   Validation soil images: 244


In [9]:
# Image preprocessing
#  Preprocess images for feature extraction

def preprocess_images(paths, img_size=(224, 224)):
    """Load images from disk into one float32 batch for the feature extractor.

    Each file is opened, converted to RGB, resized to ``img_size`` with Lanczos
    resampling, turned into a float32 array and passed through EfficientNet's
    ``preprocess_input``. Files that raise during any of these steps are
    reported on stdout and left out of the result.

    Args:
        paths: iterable of image file paths.
        img_size: ``(width, height)`` passed to ``PIL.Image.resize``.

    Returns:
        ``(images, valid_paths)``: ``images`` is an array with one entry per
        successfully processed file and ``valid_paths`` lists those files in
        the same order. When nothing could be processed, ``images`` is an empty
        float32 array of shape ``(0, height, width, 3)`` so downstream code
        still receives a 4-D batch.
    """
    images = []
    valid_paths = []

    for path in paths:
        try:
            # The context manager releases the file handle as soon as the
            # pixels have been copied into the NumPy array.
            with Image.open(path) as img:
                resized = img.convert('RGB').resize(img_size, Image.Resampling.LANCZOS)
                img_array = np.array(resized, dtype=np.float32)

            # EfficientNet preprocessing
            img_array = tf.keras.applications.efficientnet.preprocess_input(img_array)
        except Exception as e:
            print(f"Error processing {path}: {e}")
            continue

        images.append(img_array)
        valid_paths.append(path)

    if not images:
        width, height = img_size
        return np.empty((0, height, width, 3), dtype=np.float32), valid_paths

    return np.array(images), valid_paths

In [10]:
# Turn both path lists into image batches; the second return value keeps only
# the paths that were actually loaded.
X_train, valid_train_paths = preprocess_images(train_paths)
X_val, valid_val_paths = preprocess_images(val_paths)

print(
    f"Training data shape: {X_train.shape}",
    f"Validation data shape: {X_val.shape}",
    sep="\n",
)


🔄 Preprocessing training images...
Training data shape: (974, 224, 224, 3)
Validation data shape: (244, 224, 224, 3)


In [11]:
#  Build Feature Extraction Model
#  Creating a feature extraction model using EfficientNet

def create_feature_extractor(input_shape=(224, 224, 3), feature_dim=512):
    """Build a frozen EfficientNetB1 backbone that maps images to feature vectors.

    The ImageNet-pretrained backbone (without its classification top) is
    frozen, its output is globally average-pooled, and the pooled vector is
    optionally projected through a ReLU ``Dense`` layer named ``'features'``.

    NOTE(review): the ``Dense`` projection is created with fresh weights and
    this function does not train it; no cell in the visible notebook fits the
    model either, so the projection is effectively random. Pass
    ``feature_dim=None`` to use the pooled backbone features directly.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.
        feature_dim: width of the ``Dense`` projection head. ``None`` skips the
            head and returns the pooled backbone output. The default of 512
            preserves the original behaviour.

    Returns:
        A ``tf.keras`` ``Model`` from the backbone input to the feature tensor.
    """
    base_model = EfficientNetB1(
        include_top=False,
        weights='imagenet',
        input_shape=input_shape
    )

    # Freeze base model for stable feature extraction
    base_model.trainable = False

    pooled = GlobalAveragePooling2D()(base_model.output)

    if feature_dim is None:
        features = pooled
    else:
        features = Dense(feature_dim, activation='relu', name='features')(pooled)

    return Model(inputs=base_model.input, outputs=features)

In [12]:
# Create feature extractor
# Built with create_feature_extractor's default arguments; the printed figure is
# the model's total parameter count, formatted with thousands separators.
feature_extractor = create_feature_extractor()
print(f"\n Feature extractor created with {feature_extractor.count_params():,} parameters")

I0000 00:00:1748083874.284092      85 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1748083874.284732      85 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb1_notop.h5
[1m27018416/27018416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step

🏗 Feature extractor created with 7,231,111 parameters


In [13]:
# Extract features from soil images
# Both splits go through the same extractor in batches of 32; the resulting
# arrays feed the soil statistics and distance computations further down.
print("\n🔍 Extracting features from soil images...")
train_features = feature_extractor.predict(X_train, batch_size=32, verbose=1)
val_features = feature_extractor.predict(X_val, batch_size=32, verbose=1)


🔍 Extracting features from soil images...


I0000 00:00:1748083886.440403     147 service.cc:148] XLA service 0x7967e40041e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748083886.441517     147 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1748083886.441537     147 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1748083887.400517     147 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 3/31[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 61ms/step

I0000 00:00:1748083894.166850     147 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 398ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 893ms/step


In [14]:
# Sanity-check the extracted feature matrices (rows = images, columns = features).
print(
    f"Training features shape: {train_features.shape}",
    f"Validation features shape: {val_features.shape}",
    sep="\n",
)

Training features shape: (974, 512)
Validation features shape: (244, 512)


In [15]:
#  Compute mean and covariance of soil features

def compute_soil_statistics(features):
    """Summarise a feature matrix by its mean vector and regularised covariance.

    Args:
        features: 2-D array with one row per sample and one column per feature.

    Returns:
        ``(soil_mean, soil_cov)``: the column-wise mean and the feature
        covariance matrix with ``1e-6`` added to every diagonal entry.
    """
    prototype = features.mean(axis=0)

    # rowvar=False treats columns as variables, i.e. the covariance of features.T
    covariance = np.cov(features, rowvar=False)

    # Small ridge on the diagonal
    ridge = 1e-6 * np.eye(covariance.shape[0])
    covariance = covariance + ridge

    return prototype, covariance

# Estimate the soil prototype (mean) and spread (covariance) from the training features.
soil_mean, soil_cov = compute_soil_statistics(train_features)
print(
    f"\n📊 Soil statistics computed:",
    f"   Mean feature vector shape: {soil_mean.shape}",
    f"   Covariance matrix shape: {soil_cov.shape}",
    sep="\n",
)


📊 Soil statistics computed:
   Mean feature vector shape: (512,)
   Covariance matrix shape: (512, 512)


In [16]:
# Define anomaly detection functions
#  Compute Mahalanobis distance from soil prototype

def mahalanobis_distance(features, mean, cov):
    """Mahalanobis distance of each feature vector from ``mean`` under ``cov``.

    Args:
        features: array of shape ``(n_samples, n_features)``; a single 1-D
            vector is also accepted and treated as one sample.
        mean: 1-D array of length ``n_features``.
        cov: ``(n_features, n_features)`` covariance matrix.

    Returns:
        1-D array of length ``n_samples`` with non-negative distances.
    """
    try:
        cov_inv = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Singular covariance: fall back to the pseudo-inverse. Only the
        # linear-algebra failure is caught so unrelated errors still surface.
        cov_inv = np.linalg.pinv(cov)

    diff = np.atleast_2d(np.asarray(features) - mean)

    # Row-wise quadratic form d @ cov_inv @ d, computed for all rows at once.
    squared = np.sum((diff @ cov_inv) * diff, axis=1)

    # Round-off (especially with a pseudo-inverse) can push a tiny value below
    # zero; clip so the square root never yields NaN.
    return np.sqrt(np.maximum(squared, 0.0))

In [17]:
# Compute Euclidean distance from soil prototype
def euclidean_distance(features, mean):
    """Return the L2 norm of each row of ``features`` after subtracting ``mean``."""
    offsets = np.subtract(features, mean)
    return np.linalg.norm(offsets, axis=1)

In [18]:
# Compute cosine similarity with soil prototype
def cosine_similarity_score(features, mean):
    """Cosine similarity between every row of ``features`` and the vector ``mean``.

    Both sides are scaled to (approximately) unit length first; ``1e-8`` is
    added to each norm so an all-zero vector does not divide by zero.
    """
    row_lengths = np.linalg.norm(features, axis=1, keepdims=True)
    unit_rows = features / (row_lengths + 1e-8)

    unit_prototype = mean / (np.linalg.norm(mean) + 1e-8)

    return unit_rows @ unit_prototype

In [19]:
# Compute baseline distances on training and validation features.
# Each metric is evaluated for the (train, val) pair in one statement; the
# generator variable stays local to the expression.

train_mahal_dist, val_mahal_dist = (
    mahalanobis_distance(split_features, soil_mean, soil_cov)
    for split_features in (train_features, val_features)
)

train_eucl_dist, val_eucl_dist = (
    euclidean_distance(split_features, soil_mean)
    for split_features in (train_features, val_features)
)

train_cosine_sim, val_cosine_sim = (
    cosine_similarity_score(split_features, soil_mean)
    for split_features in (train_features, val_features)
)


📏 Computing baseline distances on training data...


In [20]:
# Compute thresholds (95th percentile of training distances)
# NOTE(review): the training distances are measured against soil_mean/soil_cov,
# which were themselves estimated from the same training features, so these
# cut-offs are in-sample. Calibrating on the held-out validation distances may
# give more realistic thresholds - TODO confirm.
mahal_threshold = np.percentile(train_mahal_dist, 95)
eucl_threshold = np.percentile(train_eucl_dist, 95)
cosine_threshold = np.percentile(train_cosine_sim, 5)  # similarity: low values are the anomalous tail, hence the 5th percentile

In [21]:
# Report the three cut-offs to four decimals, one per line.
print(
    f"\n Computed thresholds:",
    f"   Mahalanobis threshold: {mahal_threshold:.4f}",
    f"   Euclidean threshold: {eucl_threshold:.4f}",
    f"   Cosine similarity threshold: {cosine_threshold:.4f}",
    sep="\n",
)


🎯 Computed thresholds:
   Mahalanobis threshold: 30.8685
   Euclidean threshold: 6.6573
   Cosine similarity threshold: 0.5514


In [22]:
class SoilAnomalyDetector:
    """One-class soil detector built on distances to a soil feature prototype.

    Images are embedded with ``feature_extractor``; each embedding is scored by
    Mahalanobis distance, Euclidean distance and cosine similarity relative to
    ``soil_mean`` (and ``soil_cov``). The three scores are normalised by their
    thresholds, blended, and squashed into a soil probability.
    """

    def __init__(self, feature_extractor, soil_mean, soil_cov, thresholds):
        # Model used to embed images
        self.feature_extractor = feature_extractor

        # Soil prototype statistics
        self.soil_mean, self.soil_cov = soil_mean, soil_cov

        # Per-metric normalisation constants
        self.mahal_threshold = thresholds['mahalanobis']
        self.eucl_threshold = thresholds['euclidean']
        self.cosine_threshold = thresholds['cosine']

    def extract_features(self, images):
        """Embed a batch of images with the wrapped extractor (batch size 32, silent)."""
        embeddings = self.feature_extractor.predict(images, batch_size=32, verbose=0)
        return embeddings

    def compute_anomaly_scores(self, features):
        """Return the three threshold-normalised anomaly scores for ``features``.

        Larger values mean "less soil-like" for all three: the distances are
        divided by their thresholds, and cosine similarity is converted to the
        relative shortfall below its threshold.
        """
        prototype = self.soil_mean

        mahal_scores = (
            mahalanobis_distance(features, prototype, self.soil_cov) / self.mahal_threshold
        )
        eucl_scores = euclidean_distance(features, prototype) / self.eucl_threshold

        similarity_shortfall = self.cosine_threshold - cosine_similarity_score(features, prototype)
        cosine_scores = similarity_shortfall / self.cosine_threshold

        return mahal_scores, eucl_scores, cosine_scores

    def predict(self, images):
        """Classify images as soil (1) or non-soil (0).

        Returns:
            ``(predictions, soil_probabilities, scores)`` where ``scores`` is a
            dict holding the individual and ensemble anomaly scores.
        """
        mahal, eucl, cosine = self.compute_anomaly_scores(self.extract_features(images))

        # Weighted blend of the three normalised scores
        ensemble = (0.4 * mahal + 0.4 * eucl + 0.2 * cosine)

        # Logistic squashing centred on an ensemble score of 1
        anomaly_probabilities = 1 / (1 + np.exp(-2 * (ensemble - 1)))
        soil_probabilities = 1 - anomaly_probabilities

        # Soil when the soil probability exceeds one half
        predictions = (soil_probabilities > 0.5).astype(int)

        score_breakdown = {
            'mahalanobis': mahal,
            'euclidean': eucl,
            'cosine': cosine,
            'ensemble': ensemble,
        }
        return predictions, soil_probabilities, score_breakdown

In [23]:
# Bundle the three per-metric cut-offs under the keys the detector looks up,
# then wire everything together with explicit keyword arguments.
thresholds = {
    'mahalanobis': mahal_threshold,
    'euclidean': eucl_threshold,
    'cosine': cosine_threshold,
}

anomaly_detector = SoilAnomalyDetector(
    feature_extractor=feature_extractor,
    soil_mean=soil_mean,
    soil_cov=soil_cov,
    thresholds=thresholds,
)

In [24]:
# Score the held-out soil images. Every one of them is soil, so the mean of the
# 0/1 predictions is the fraction recognised correctly.
print("\n Validating on soil images...")
val_predictions, val_probabilities, val_scores = anomaly_detector.predict(X_val)

soil_accuracy = np.mean(val_predictions)
print(
    f" Soil validation accuracy: {soil_accuracy:.4f} ({soil_accuracy*100:.2f}%)",
    f"   Average soil probability: {np.mean(val_probabilities):.4f}",
    f"   Soil images classified correctly: {np.sum(val_predictions)}/{len(val_predictions)}",
    sep="\n",
)


🧪 Validating on soil images...
✅ Soil validation accuracy: 0.7828 (78.28%)
   Average soil probability: 0.5714
   Soil images classified correctly: 191/244


In [30]:
#  Function to process test images and create submission
def _load_test_image(img_path, img_size=(224, 224)):
    """Read one image and return it as a float32 RGB array ready for the detector.

    The file is opened in a ``with`` block so its handle is released as soon as
    the pixel data has been copied into the NumPy array.
    """
    with Image.open(img_path) as img:
        resized = img.convert('RGB').resize(img_size, Image.Resampling.LANCZOS)
        img_array = np.array(resized, dtype=np.float32)
    return tf.keras.applications.efficientnet.preprocess_input(img_array)


def create_submission(detector, test_dir, test_ids_path, output_csv='submission.csv'):
    """Process test images using test_ids.csv and create submission file.

    Every id listed in ``test_ids_path`` receives exactly one label, stored at
    the id's own position, so the submission rows always line up with the ids
    even when some files are missing or unreadable (those default to label 0,
    i.e. non-soil, with probability 0.0).

    Args:
        detector: object whose ``predict(images)`` returns
            ``(predictions, soil_probabilities, scores)`` for a batch array.
        test_dir: directory containing the test images.
        test_ids_path: CSV with the image ids; the ``image_id`` column is used
            if present, else ``id``, else the first column.
        output_csv: path the submission CSV is written to.

    Returns:
        The submission ``DataFrame`` with columns ``image_id`` and ``label``,
        or ``None`` if the id file cannot be read or ``test_dir`` does not
        exist.
    """
    print(f"\n🔍 Processing test images from: {test_dir}")
    print(f"📋 Using test IDs from: {test_ids_path}")

    # Load test IDs
    try:
        test_ids_df = pd.read_csv(test_ids_path)
        print(f"✅ Loaded test_ids.csv with {len(test_ids_df)} entries")
        print("Sample test IDs:")
        print(test_ids_df.head())

        # Get the column name for image IDs
        if 'image_id' in test_ids_df.columns:
            test_files = test_ids_df['image_id'].tolist()
        elif 'id' in test_ids_df.columns:
            test_files = test_ids_df['id'].tolist()
        else:
            # Use the first column if standard names not found
            test_files = test_ids_df.iloc[:, 0].tolist()
            print(f"Using column '{test_ids_df.columns[0]}' as image IDs")

    except Exception as e:
        print(f"❌ Error loading test_ids.csv: {e}")
        return None

    if not os.path.exists(test_dir):
        print(f"❌ Test directory not found: {test_dir}")
        return None

    total_images = len(test_files)
    print(f"📊 Processing {total_images} test images from test_ids.csv")

    # Results are pre-filled with the "non-soil" default and indexed by the
    # position of each id in test_files. Writing predictions back by position
    # keeps labels aligned with ids no matter which files get skipped.
    batch_size = 32
    all_predictions = [0] * total_images
    all_probabilities = [0.0] * total_images
    missing_files = []

    for start in range(0, total_images, batch_size):
        batch_images = []
        batch_positions = []

        # Load batch images
        for position in range(start, min(start + batch_size, total_images)):
            filename = test_files[position]
            try:
                img_path = os.path.join(test_dir, filename)

                if not os.path.exists(img_path):
                    print(f"⚠️  Missing file: {filename}")
                    missing_files.append(filename)
                    continue

                batch_images.append(_load_test_image(img_path))
                batch_positions.append(position)

            except Exception as e:
                # Unreadable file: keep the default label and report it in the
                # missing/corrupted summary.
                print(f"Error processing {filename}: {e}")
                missing_files.append(filename)

        if batch_images:
            # Make predictions
            batch_preds, batch_probs, _ = detector.predict(np.array(batch_images))

            for position, pred, prob in zip(batch_positions, batch_preds, batch_probs):
                all_predictions[position] = int(pred)
                all_probabilities[position] = float(prob)

            print(f"   Processed batch {start // batch_size + 1}: {len(batch_images)} images")

    # Create submission DataFrame in the required format
    submission_df = pd.DataFrame({
        'image_id': test_files,  # Original order from test_ids.csv
        'label': all_predictions
    })

    # Save submission file
    submission_df.to_csv(output_csv, index=False)
    print(f"💾 Submission file saved to: {output_csv}")

    # Print summary (percentages fall back to 0 when there are no ids at all)
    soil_count = sum(all_predictions)
    non_soil_count = total_images - soil_count
    soil_pct = soil_count / total_images * 100 if total_images else 0.0
    non_soil_pct = non_soil_count / total_images * 100 if total_images else 0.0

    print(f"\n📈 Submission Summary:")
    print(f"   Total images in submission: {len(submission_df)}")
    print(f"   Predicted as Soil (label=1): {soil_count} ({soil_pct:.1f}%)")
    print(f"   Predicted as Non-Soil (label=0): {non_soil_count} ({non_soil_pct:.1f}%)")
    if total_images:
        print(f"   Average soil probability: {np.mean(all_probabilities):.4f}")
    if missing_files:
        print(f"   Missing/corrupted files: {len(missing_files)}")

    # Show sample of submission format
    print(f"\n📋 Sample submission format:")
    print(submission_df.head(10))
    print("...")
    print(submission_df.tail(5))

    # Verify submission format
    print(f"\n✅ Submission file validation:")
    print(f"   Shape: {submission_df.shape}")
    print(f"   Columns: {list(submission_df.columns)}")
    print(f"   Label distribution: {submission_df['label'].value_counts().to_dict()}")
    print(f"   No missing values: {not submission_df.isnull().any().any()}")

    return submission_df

In [31]:
#  Create submission file
# create_submission writes the CSV and returns the submission DataFrame; it
# returns None when the id file cannot be read or the test directory is absent,
# which the final summary cell checks for.
submission_results = create_submission(anomaly_detector, TEST_IMAGE_DIR, TEST_IDS_PATH, 'submission.csv')


🔍 Processing test images from: /kaggle/input/soil-classification-part-2/soil_competition-2025/test
📋 Using test IDs from: /kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv
✅ Loaded test_ids.csv with 967 entries
Sample test IDs:
                               image_id
0  6595f1266325552489c7d1635fafb88f.jpg
1  4b614841803d5448b59e2c6ca74ea664.jpg
2  ca30e008692a50638b43d944f46245c8.jpg
3  6a9046a219425f7599729be627df1c1a.jpg
4  97c1e0276d2d5c2f88dddbc87357611e.jpg
📊 Processing 967 test images from test_ids.csv
   Processed batch 1: 32 images
   Processed batch 2: 32 images
   Processed batch 3: 32 images
   Processed batch 4: 32 images
   Processed batch 5: 32 images
   Processed batch 6: 32 images
   Processed batch 7: 32 images
   Processed batch 8: 32 images
   Processed batch 9: 32 images
   Processed batch 10: 32 images
   Processed batch 11: 32 images
   Processed batch 12: 32 images
   Processed batch 13: 32 images
   Processed batch 14: 32 images
   Pr

In [32]:
#  Save the complete model
def save_soil_detector(detector, save_path='soil_anomaly_detector'):
    """Persist a detector as two files sharing the ``save_path`` prefix.

    The feature extractor is written via its own ``save`` method to
    ``<save_path>_feature_extractor.h5``; the soil statistics and the three
    thresholds go into ``<save_path>_parameters.npz``.
    """
    extractor_file = f'{save_path}_feature_extractor.h5'
    parameter_file = f'{save_path}_parameters.npz'

    # Model first, then the numeric parameters
    detector.feature_extractor.save(extractor_file)

    saved_parameters = {
        'soil_mean': detector.soil_mean,
        'soil_cov': detector.soil_cov,
        'mahal_threshold': detector.mahal_threshold,
        'eucl_threshold': detector.eucl_threshold,
        'cosine_threshold': detector.cosine_threshold,
    }
    np.savez(parameter_file, **saved_parameters)

    print(f"💾 Complete detector saved:")
    print(f"   Feature extractor: {save_path}_feature_extractor.h5")
    print(f"   Parameters: {save_path}_parameters.npz")

# Persist the detector, then print the run summary.
save_soil_detector(anomaly_detector)

print(
    f"\n🎯 One-Class Soil Detection Complete!",
    f"   Approach: Anomaly Detection (One-Class Classification)",
    f"   Training: Only soil images used",
    f"   Testing: Detects deviations from soil patterns",
    f"   Validation accuracy on soil: {soil_accuracy:.4f} ({soil_accuracy*100:.2f}%)",
    sep="\n",
)

# The submission step returns None on failure, so only summarise it when present.
if submission_results is not None:
    print(
        f"   Submission file created: submission.csv",
        f"   Format: image_id, label (1=soil, 0=non-soil)",
        sep="\n",
    )
    total_images = len(submission_results)
    soil_predictions = submission_results['label'].sum()
    print(f"   Predicted {soil_predictions}/{total_images} images as soil")
    

💾 Complete detector saved:
   Feature extractor: soil_anomaly_detector_feature_extractor.h5
   Parameters: soil_anomaly_detector_parameters.npz

🎯 One-Class Soil Detection Complete!
   Approach: Anomaly Detection (One-Class Classification)
   Training: Only soil images used
   Testing: Detects deviations from soil patterns
   Validation accuracy on soil: 0.7828 (78.28%)
   Submission file created: submission.csv
   Format: image_id, label (1=soil, 0=non-soil)
   Predicted 271/967 images as soil
