In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/soil-classification/soil_classification-2025/sample_submission.csv
/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv
/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv
/kaggle/input/soil-classification/soil_classification-2025/test/img_0f035b97.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_f13af256.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_15b41dbc.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_cfb4fc7a.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_683111fb.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_c4bd7b3e.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_4ccce0f8.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_86faa98d.jpg
/kaggle/input/soil-classification/soil_classification-2025/test/img_c448342c.jpg
/kaggle/input/soil-classification/soil_cla

In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Custom Dataset class
class SoilDataset(Dataset):
    def __init__(self, df, img_dir, transform=None, is_test=False):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        self.class_to_idx = {"Alluvial soil": 0, "Black Soil": 1, "Clay soil": 2, "Red soil": 3}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.df.iloc[idx]["image_id"])
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        if self.is_test:
            return image, 0  # Dummy label for test set

        label = self.class_to_idx[self.df.iloc[idx]["soil_type"]]
        return image, label


# Model architecture
class SoilClassifier(nn.Module):
    def __init__(self, num_classes=4):
        super(SoilClassifier, self).__init__()
        try:
            # Try to load pretrained ResNet50 with newer API
            weights = models.ResNet50_Weights.DEFAULT
            self.model = models.resnet50(weights=weights)
            print("Successfully loaded pretrained ResNet50")

            # Freeze early layers
            for param in list(self.model.parameters())[:-2]:
                param.requires_grad = False

            # Modify the final layer
            num_features = self.model.fc.in_features
            self.model.fc = nn.Sequential(
                nn.Linear(num_features, 512),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(512, num_classes),
            )
        except Exception as e:
            print(f"Failed to load pretrained model: {e}")
            print("Using a simpler CNN architecture instead")

            # Define a simpler CNN architecture
            self.model = nn.Sequential(
                # First Conv Block
                nn.Conv2d(3, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(32),
                nn.MaxPool2d(2),
                # Second Conv Block
                nn.Conv2d(32, 64, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(64),
                nn.MaxPool2d(2),
                # Third Conv Block
                nn.Conv2d(64, 128, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(128),
                nn.MaxPool2d(2),
                # Fourth Conv Block
                nn.Conv2d(128, 256, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(256),
                nn.MaxPool2d(2),
                # Adaptive pooling to handle different input sizes
                nn.AdaptiveAvgPool2d((1, 1)),
                # Flatten and Dense layers
                nn.Flatten(),
                nn.Linear(256, 512),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(512, num_classes),
            )

    def forward(self, x):
        return self.model(x)


def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(dataloader, desc="Training")
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # Update progress bar
        pbar.set_postfix({"loss": loss.item(), "acc": 100.0 * correct / total})

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc


def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        pbar = tqdm(dataloader, desc="Validating")
        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Update progress bar
            pbar.set_postfix({"loss": loss.item(), "acc": 100.0 * correct / total})

    val_loss = running_loss / len(dataloader)
    val_acc = 100.0 * correct / total
    return val_loss, val_acc


def main():
    # Data paths
    DATA_DIR = "/kaggle/input/soil-classification/soil_classification-2025"
    TRAIN_DIR = os.path.join(DATA_DIR, "train")
    TEST_DIR = os.path.join(DATA_DIR, "test")
    TRAIN_CSV = os.path.join(DATA_DIR, "train_labels.csv")
    TEST_CSV = os.path.join(DATA_DIR, "test_ids.csv")

    # Output paths - use /kaggle/working for outputs
    OUTPUT_DIR = "/kaggle/working"
    MODEL_PATH = os.path.join(OUTPUT_DIR, "best_model.pth")
    SUBMISSION_CSV = os.path.join(OUTPUT_DIR, "submission.csv")
    CONFUSION_MATRIX_PATH = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
    HISTORY_PLOT_PATH = os.path.join(OUTPUT_DIR, "training_history.png")

    # Load training data
    train_df = pd.read_csv(TRAIN_CSV)
    print("Training data shape:", train_df.shape)
    print("\nClass distribution:")
    print(train_df["soil_type"].value_counts())

    # Split into train and validation sets
    train_df, val_df = train_test_split(
        train_df, test_size=0.2, stratify=train_df["soil_type"], random_state=42
    )

    # Define transformations
    train_transform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    val_transform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    # Create datasets and dataloaders
    train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=train_transform)
    val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=val_transform)

    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # Initialize model, criterion, optimizer
    model = SoilClassifier().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=3, factor=0.1)

    # Training settings
    num_epochs = 20
    best_val_acc = 0
    patience = 5
    patience_counter = 0

    # Training history
    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    # Training loop
    for epoch in range(num_epochs):
        print(f"\nEpoch [{epoch + 1}/{num_epochs}]")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        # Update learning rate
        scheduler.step(val_loss)

        # Save history
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), MODEL_PATH)
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break

    # Load best model for evaluation
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()

    # Generate predictions for validation set
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc="Evaluating"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())

    # Plot confusion matrix
    class_names = list(train_dataset.class_to_idx.keys())
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.savefig(CONFUSION_MATRIX_PATH)
    plt.close()

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=class_names))

    # Generate predictions for test set
    test_df = pd.read_csv(TEST_CSV)
    test_dataset = SoilDataset(test_df, TEST_DIR, transform=val_transform, is_test=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    test_predictions = []
    with torch.no_grad():
        for inputs, _ in tqdm(test_loader, desc="Generating predictions"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            test_predictions.extend(predicted.cpu().numpy())

    # Create submission DataFrame
    idx_to_class = {v: k for k, v in test_dataset.class_to_idx.items()}
    submission_df = pd.DataFrame(
        {
            "image_id": test_df["image_id"],
            "soil_type": [idx_to_class[pred] for pred in test_predictions],
        }
    )

    # Save predictions
    submission_df.to_csv(SUBMISSION_CSV, index=False)
    print(f"Predictions saved to {SUBMISSION_CSV}")

    # Plot training history
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history["train_loss"], label="Train")
    plt.plot(history["val_loss"], label="Validation")
    plt.title("Loss")
    plt.xlabel("Epoch")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history["train_acc"], label="Train")
    plt.plot(history["val_acc"], label="Validation")
    plt.title("Accuracy")
    plt.xlabel("Epoch")
    plt.legend()

    plt.tight_layout()
    plt.savefig(HISTORY_PLOT_PATH)
    plt.close()


if __name__ == "__main__":
    main()


Using device: cpu
Training data shape: (1222, 2)

Class distribution:
soil_type
Alluvial soil    528
Red soil         264
Black Soil       231
Clay soil        199
Name: count, dtype: int64


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


Failed to load pretrained model: <urlopen error [Errno -3] Temporary failure in name resolution>
Using a simpler CNN architecture instead

Epoch [1/20]


Training: 100%|██████████| 31/31 [01:52<00:00,  3.63s/it, loss=0.179, acc=74.6]
Validating: 100%|██████████| 8/8 [00:11<00:00,  1.48s/it, loss=0.612, acc=79.6]


Train Loss: 0.5983, Train Acc: 74.62%
Val Loss: 0.5020, Val Acc: 79.59%

Epoch [2/20]


Training: 100%|██████████| 31/31 [01:43<00:00,  3.35s/it, loss=1.12, acc=82.8] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.56s/it, loss=0.499, acc=78.8]


Train Loss: 0.4459, Train Acc: 82.80%
Val Loss: 0.4876, Val Acc: 78.78%

Epoch [3/20]


Training: 100%|██████████| 31/31 [01:43<00:00,  3.35s/it, loss=0.485, acc=83.1]
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.57s/it, loss=0.441, acc=79.2]


Train Loss: 0.4543, Train Acc: 83.11%
Val Loss: 0.4379, Val Acc: 79.18%

Epoch [4/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.38s/it, loss=0.49, acc=86.8] 
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.64s/it, loss=0.353, acc=83.3]


Train Loss: 0.3715, Train Acc: 86.80%
Val Loss: 0.3370, Val Acc: 83.27%

Epoch [5/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.40s/it, loss=0.254, acc=86.4]
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.58s/it, loss=0.312, acc=86.1]


Train Loss: 0.3567, Train Acc: 86.39%
Val Loss: 0.3473, Val Acc: 86.12%

Epoch [6/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.36s/it, loss=0.157, acc=86.2]
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.67s/it, loss=0.295, acc=82]  


Train Loss: 0.3492, Train Acc: 86.18%
Val Loss: 0.4673, Val Acc: 82.04%

Epoch [7/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.38s/it, loss=0.497, acc=85.4] 
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.63s/it, loss=0.457, acc=88.6]


Train Loss: 0.3404, Train Acc: 85.36%
Val Loss: 0.3334, Val Acc: 88.57%

Epoch [8/20]


Training: 100%|██████████| 31/31 [01:46<00:00,  3.45s/it, loss=0.142, acc=84.6]
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.65s/it, loss=0.376, acc=83.7]


Train Loss: 0.3521, Train Acc: 84.65%
Val Loss: 0.3842, Val Acc: 83.67%

Epoch [9/20]


Training: 100%|██████████| 31/31 [01:46<00:00,  3.44s/it, loss=0.586, acc=86.5]
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.64s/it, loss=0.507, acc=85.7]


Train Loss: 0.3298, Train Acc: 86.49%
Val Loss: 0.4449, Val Acc: 85.71%

Epoch [10/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.37s/it, loss=0.219, acc=87.8]
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.66s/it, loss=0.336, acc=89]  


Train Loss: 0.2916, Train Acc: 87.82%
Val Loss: 0.2955, Val Acc: 88.98%

Epoch [11/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.38s/it, loss=0.124, acc=90]  
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.57s/it, loss=0.516, acc=86.5]


Train Loss: 0.2861, Train Acc: 89.97%
Val Loss: 0.3283, Val Acc: 86.53%

Epoch [12/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.41s/it, loss=0.557, acc=88.2] 
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.66s/it, loss=0.319, acc=89]  


Train Loss: 0.2849, Train Acc: 88.23%
Val Loss: 0.2491, Val Acc: 88.98%

Epoch [13/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.42s/it, loss=0.887, acc=89.2] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.59s/it, loss=0.513, acc=89.8]


Train Loss: 0.3031, Train Acc: 89.15%
Val Loss: 0.2902, Val Acc: 89.80%

Epoch [14/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.37s/it, loss=0.371, acc=88.9]
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.58s/it, loss=0.387, acc=87.8]


Train Loss: 0.2735, Train Acc: 88.95%
Val Loss: 0.2782, Val Acc: 87.76%

Epoch [15/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.41s/it, loss=0.531, acc=89.9]
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.63s/it, loss=0.402, acc=89.8]


Train Loss: 0.2609, Train Acc: 89.87%
Val Loss: 0.2749, Val Acc: 89.80%

Epoch [16/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.42s/it, loss=0.448, acc=89.7] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.58s/it, loss=0.448, acc=92.2]


Train Loss: 0.2662, Train Acc: 89.66%
Val Loss: 0.2721, Val Acc: 92.24%

Epoch [17/20]


Training: 100%|██████████| 31/31 [01:44<00:00,  3.36s/it, loss=0.301, acc=90.4] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.62s/it, loss=0.286, acc=91]   


Train Loss: 0.2460, Train Acc: 90.38%
Val Loss: 0.2157, Val Acc: 91.02%

Epoch [18/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.40s/it, loss=0.101, acc=92.6] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.59s/it, loss=0.327, acc=91.8] 


Train Loss: 0.1974, Train Acc: 92.63%
Val Loss: 0.2163, Val Acc: 91.84%

Epoch [19/20]


Training: 100%|██████████| 31/31 [01:45<00:00,  3.42s/it, loss=0.185, acc=92.2] 
Validating: 100%|██████████| 8/8 [00:12<00:00,  1.58s/it, loss=0.317, acc=91.8]


Train Loss: 0.1930, Train Acc: 92.22%
Val Loss: 0.2084, Val Acc: 91.84%

Epoch [20/20]


Training: 100%|██████████| 31/31 [01:46<00:00,  3.43s/it, loss=0.177, acc=93.4] 
Validating: 100%|██████████| 8/8 [00:13<00:00,  1.63s/it, loss=0.289, acc=92.2] 


Train Loss: 0.1779, Train Acc: 93.45%
Val Loss: 0.2008, Val Acc: 92.24%


Evaluating: 100%|██████████| 8/8 [00:12<00:00,  1.59s/it]



Classification Report:
               precision    recall  f1-score   support

Alluvial soil       0.99      0.88      0.93       106
   Black Soil       0.88      0.93      0.91        46
    Clay soil       0.78      0.97      0.87        40
     Red soil       0.98      0.96      0.97        53

     accuracy                           0.92       245
    macro avg       0.91      0.94      0.92       245
 weighted avg       0.93      0.92      0.92       245



Generating predictions: 100%|██████████| 11/11 [00:17<00:00,  1.58s/it]


Predictions saved to /kaggle/working/submission.csv
