In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/soil-classification-part-2/soil_competition-2025/train'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# 📌 SECTION 1: IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 📌 SECTION 2: LOAD DATA (FIXED)

train_csv_path = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv'
train_img_dir  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train'

test_csv_path  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv'
test_img_dir   = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test'

# Load and inspect the training data
train_df = pd.read_csv(train_csv_path)
train_df.columns = train_df.columns.str.strip()  # Clean any accidental spaces

print("Columns in train_df:", train_df.columns)  # ✅ See what the actual column names are

# Set up label mapping
class_mapping = {
    'Alluvial soil': 0,
    'Black Soil': 1,
    'Clay soil': 2,
    'Red soil': 3
}

# 🛠 Check the column name here — most likely it is 'soil Type' with space
# So use the correct name as seen in the print statement above:
train_df['label'] = train_df['soil_type'].map(class_mapping)

# 📌 SECTION 3: IMAGE TRANSFORMATIONS

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# 📌 SECTION 4: CUSTOM DATASET CLASS

class SoilDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None, is_test=False):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        image_id = self.dataframe.iloc[idx, 0]
        image_path = os.path.join(self.image_dir, image_id)
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        
        if self.is_test:
            return image, image_id
        else:
            label = self.dataframe.iloc[idx]['label']
            return image, label
# 📌 SECTION 5: TRAIN/VALIDATION SPLIT

train_split, val_split = train_test_split(train_df, test_size=0.2, stratify=train_df['label'])

train_dataset = SoilDataset(train_split, train_img_dir, transform)
val_dataset = SoilDataset(val_split, train_img_dir, transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
# 📌 SECTION 6: MODEL SETUP

model = models.resnet18(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, 4)  # 4 classes
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# 📌 SECTION 7: TRAINING LOOP

epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}")
# 📌 SECTION 8: VALIDATION

model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        outputs = model(images)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels.numpy())

print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=list(class_mapping.keys())))
# 📌 SECTION 9: PREDICTION ON TEST SET & SUBMISSION

test_df = pd.read_csv(test_csv_path)
test_dataset = SoilDataset(test_df, test_img_dir, transform, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
predictions = []
image_ids = []

with torch.no_grad():
    for images, ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        predictions.extend(preds)
        image_ids.extend(ids)

# Decode predictions
inverse_class_map = {v: k for k, v in class_mapping.items()}
predicted_labels = [inverse_class_map[p] for p in predictions]

submission_df = pd.DataFrame({'image_id': image_ids, 'soil_type': predicted_labels})
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv created successfully.")


In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 📌 SECTION 2: LOAD DATA (FIXED)

train_csv_path = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv'
train_img_dir  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train'

test_csv_path  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv'
test_img_dir   = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test'

# Load and inspect the training data
train_df = pd.read_csv(train_csv_path)
train_df.columns = train_df.columns.str.strip()  # Clean any accidental spaces

print("Columns in train_df:", train_df.columns)  # ✅ See what the actual column names are

# Set up label mapping
class_mapping = {
    'Alluvial soil': 0,
    'Black Soil': 1,
    'Clay soil': 2,
    'Red soil': 3
}

# 🛠 Check the column name here — most likely it is 'soil Type' with space
# So use the correct name as seen in the print statement above:
train_df['label'] = train_df['soil_type'].map(class_mapping)

In [None]:
train_csv_path = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv'
train_img_dir  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train'

test_csv_path  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv'
test_img_dir   = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test'

# Load and inspect the training data
train_df = pd.read_csv(train_csv_path)
train_df.columns = train_df.columns.str.strip()  # Clean any accidental spaces

print("Columns in train_df:", train_df.columns)  # ✅ See what the actual column names are

# Set up label mapping
class_mapping = {
    'Alluvial soil': 0,
    'Black Soil': 1,
    'Clay soil': 2,
    'Red soil': 3
}

# 🛠 Check the column name here — most likely it is 'soil Type' with space
# So use the correct name as seen in the print statement above:
train_df['label'] = train_df['soil_type'].map(class_mapping)

In [None]:
# Create a cleaned soil_classification.ipynb notebook file with the provided code
from pathlib import Path

notebook_code = """
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "import-libraries",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 1: IMPORT LIBRARIES\\n",
    "import os\\n",
    "import numpy as np\\n",
    "import pandas as pd\\n",
    "from PIL import Image\\n",
    "import matplotlib.pyplot as plt\\n",
    "from sklearn.metrics import classification_report\\n",
    "from sklearn.model_selection import train_test_split\\n",
    "\\n",
    "import torch\\n",
    "from torch.utils.data import Dataset, DataLoader\\n",
    "from torchvision import transforms, models\\n",
    "\\n",
    "device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "load-data",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 2: LOAD DATA (FIXED)\\n",
    "train_csv_path = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv'\\n",
    "train_img_dir  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train'\\n",
    "\\n",
    "test_csv_path  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv'\\n",
    "test_img_dir   = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test'\\n",
    "\\n",
    "# Load and clean training labels\\n",
    "train_df = pd.read_csv(train_csv_path)\\n",
    "train_df.columns = train_df.columns.str.strip()\\n",
    "\\n",
    "print(\\\"Columns in train_df:\\\", train_df.columns)\\n",
    "\\n",
    "# \\u2705 Define class mapping\\n",
    "class_mapping = {\\n",
    "    'Alluvial soil': 0,\\n",
    "    'Black Soil': 1,\\n",
    "    'Clay soil': 2,\\n",
    "    'Red soil': 3\\n",
    "}\\n",
    "train_df['label'] = train_df['soil Type'].map(class_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "transformations",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 3: IMAGE TRANSFORMATIONS\\n",
    "transform = transforms.Compose([\\n",
    "    transforms.Resize((224, 224)),\\n",
    "    transforms.RandomHorizontalFlip(),\\n",
    "    transforms.RandomRotation(10),\\n",
    "    transforms.ToTensor(),\\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406],\\n",
    "                         std=[0.229, 0.224, 0.225])\\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dataset-class",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 4: CUSTOM DATASET CLASS\\n",
    "class SoilDataset(Dataset):\\n",
    "    def __init__(self, dataframe, image_dir, transform=None, is_test=False):\\n",
    "        self.dataframe = dataframe\\n",
    "        self.image_dir = image_dir\\n",
    "        self.transform = transform\\n",
    "        self.is_test = is_test\\n",
    "\\n",
    "    def __len__(self):\\n",
    "        return len(self.dataframe)\\n",
    "\\n",
    "    def __getitem__(self, idx):\\n",
    "        image_id = self.dataframe.iloc[idx, 0]\\n",
    "        image_path = os.path.join(self.image_dir, image_id)\\n",
    "        image = Image.open(image_path).convert(\\\"RGB\\\")\\n",
    "        if self.transform:\\n",
    "            image = self.transform(image)\\n",
    "        if self.is_test:\\n",
    "            return image, image_id\\n",
    "        else:\\n",
    "            label = self.dataframe.iloc[idx]['label']\\n",
    "            return image, label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "train-validation-split",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 5: TRAIN/VALIDATION SPLIT\\n",
    "train_split, val_split = train_test_split(train_df, test_size=0.2, stratify=train_df['label'])\\n",
    "train_dataset = SoilDataset(train_split, train_img_dir, transform)\\n",
    "val_dataset = SoilDataset(val_split, train_img_dir, transform)\\n",
    "train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)\\n",
    "val_loader = DataLoader(val_dataset, batch_size=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "model-setup",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 6: MODEL SETUP\\n",
    "model = models.resnet18(pretrained=True)\\n",
    "model.fc = torch.nn.Linear(model.fc.in_features, 4)\\n",
    "model = model.to(device)\\n",
    "criterion = torch.nn.CrossEntropyLoss()\\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "training-loop",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 7: TRAINING LOOP\\n",
    "epochs = 5\\n",
    "for epoch in range(epochs):\\n",
    "    model.train()\\n",
    "    running_loss = 0.0\\n",
    "    for images, labels in train_loader:\\n",
    "        images, labels = images.to(device), labels.to(device)\\n",
    "        optimizer.zero_grad()\\n",
    "        outputs = model(images)\\n",
    "        loss = criterion(outputs, labels)\\n",
    "        loss.backward()\\n",
    "        optimizer.step()\\n",
    "        running_loss += loss.item()\\n",
    "    print(f\\\"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}\\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "validation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 8: VALIDATION\\n",
    "model.eval()\\n",
    "y_true, y_pred = [], []\\n",
    "with torch.no_grad():\\n",
    "    for images, labels in val_loader:\\n",
    "        images = images.to(device)\\n",
    "        outputs = model(images)\\n",
    "        preds = torch.argmax(outputs, dim=1).cpu().numpy()\\n",
    "        y_pred.extend(preds)\\n",
    "        y_true.extend(labels.numpy())\\n",
    "print(\\\"Classification Report:\\\")\\n",
    "print(classification_report(y_true, y_pred, target_names=list(class_mapping.keys())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "submission",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \\U0001F4CC SECTION 9: PREDICTION ON TEST SET & SUBMISSION\\n",
    "test_df = pd.read_csv(test_csv_path)\\n",
    "test_dataset = SoilDataset(test_df, test_img_dir, transform, is_test=True)\\n",
    "test_loader = DataLoader(test_dataset, batch_size=32)\\n",
    "model.eval()\\n",
    "predictions, image_ids = [], []\\n",
    "with torch.no_grad():\\n",
    "    for images, ids in test_loader:\\n",
    "        images = images.to(device)\\n",
    "        outputs = model(images)\\n",
    "        preds = torch.argmax(outputs, dim=1).cpu().numpy()\\n",
    "        predictions.extend(preds)\\n",
    "        image_ids.extend(ids)\\n",
    "inverse_class_map = {v: k for k, v in class_mapping.items()}\\n",
    "predicted_labels = [inverse_class_map[p] for p in predictions]\\n",
    "submission_df = pd.DataFrame({'image_id': image_ids, 'soil_type': predicted_labels})\\n",
    "submission_df.to_csv('submission.csv', index=False)\\n",
    "print(\\\"\\u2705 submission.csv created successfully.\\\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
"""

# Save as .ipynb file
notebook_path = Path("/mnt/data/soil_classification.ipynb")
notebook_path.write_text(notebook_code)
notebook_path.name


In [None]:

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": None,
   "id": "import-libraries",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \U0001F4CC SECTION 1: IMPORT LIBRARIES\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from PIL import Image\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import torch\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from torchvision import transforms, models\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "id": "load-data",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \U0001F4CC SECTION 2: LOAD DATA (FIXED)\n",
    "train_csv_path = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv'\n",
    "train_img_dir  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/train'\n",
    "\n",
    "test_csv_path  = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv'\n",
    "test_img_dir   = '/kaggle/input/soil-classification-part-2/soil_competition-2025/test'\n",
    "\n",
    "train_df = pd.read_csv(train_csv_path)\n",
    "train_df.columns = train_df.columns.str.strip()\n",
    "print(\"Columns in train_df:\", train_df.columns)\n",
    "class_mapping = {\n",
    "    'Alluvial soil': 0,\n",
    "    'Black Soil': 1,\n",
    "    'Clay soil': 2,\n",
    "    'Red soil': 3\n",
    "}\n",
    "train_df['label'] = train_df['soil Type'].map(class_mapping)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


import pandas as pd

# Example: replace these with your actual output values
image_ids = ['img_001.jpg', 'img_002.jpg', 'img_003.jpg']
predictions = [0, 2, 1]

# Mapping numeric prediction to soil types
inverse_class_map = {
    0: 'Alluvial soil',
    1: 'Black Soil',
    2: 'Clay soil',
    3: 'Red soil'
}

# Convert predictions to readable soil types
predicted_labels = [inverse_class_map[p] for p in predictions]

# Create and save DataFrame
submission_df = pd.DataFrame({
    'image_id': image_ids,
    'soil_type': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv created successfully.")


import pandas as pd

# Example: replace these with your actual output values
image_ids = ['img_001.jpg', 'img_002.jpg', 'img_003.jpg']
predictions = [0, 2, 1]

# Mapping numeric prediction to soil types
inverse_class_map = {
    0: 'Alluvial soil',
    1: 'Black Soil',
    2: 'Clay soil',
    3: 'Red soil'
}

# Convert predictions to readable soil types
predicted_labels = [inverse_class_map[p] for p in predictions]

# Create and save DataFrame
submission_df = pd.DataFrame({
    'image_id': image_ids,
    'soil_type': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv created successfully.")


In [None]:
# ✅ Final prediction step (after model inference)

# Convert predicted class labels to names
inverse_class_map = {v: k for k, v in class_mapping.items()}
predicted_labels = [inverse_class_map[p] for p in predictions]

# Create DataFrame
submission_df = pd.DataFrame({
    'image_id': image_ids,
    'soil_type': predicted_labels
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv created successfully.")


In [None]:
import pandas as pd

# Example: replace these with your actual output values
image_ids = ['img_001.jpg', 'img_002.jpg', 'img_003.jpg']
predictions = [0, 2, 1]

# Mapping numeric prediction to soil types
inverse_class_map = {
    0: 'Alluvial soil',
    1: 'Black Soil',
    2: 'Clay soil',
    3: 'Red soil'
}

# Convert predictions to readable soil types
predicted_labels = [inverse_class_map[p] for p in predictions]

# Create and save DataFrame
submission_df = pd.DataFrame({
    'image_id': image_ids,
    'soil_type': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv created successfully.")


In [4]:


import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications import EfficientNetB0

# Set random seed for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# 1. Data Preparation ----------------------------------------------------------

# Define paths
DATA_DIR = "/kaggle/input/soil-image-classification-challenge/"
TRAIN_DIR = os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/train")
TEST_DIR = os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/train")

# Load labels
train_df = pd.read_csv(os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv"))

# Binary classification: Convert to 0 (non-soil) and 1 (soil)
# Assuming label '0' is non-soil and '1' is soil in your dataset
# Adjust according to your actual label encoding

# 2. Data Preprocessing -------------------------------------------------------

# Image parameters
IMG_SIZE = (224, 224)  # Standard size for EfficientNet
BATCH_SIZE = 32
CHANNELS = 3

# Create data generators with augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2  # Using 20% for validation
)

# Generator for validation (no augmentation)
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Create generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col="image_id",
    y_col="soil_type",
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='training'
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col="image_id",
    y_col="soil_type",
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='validation'
)

# 3. Model Building -----------------------------------------------------------

# Use transfer learning with EfficientNetB0
base_model = EfficientNetB0(
    input_shape=(IMG_SIZE[0], IMG_SIZE[1], CHANNELS),
    include_top=False,
    weights='imagenet'
)

# Freeze the base model
base_model.trainable = False

# Build custom head
inputs = tf.keras.Input(shape=(IMG_SIZE[0], IMG_SIZE[1], CHANNELS))
x = base_model(inputs, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs, outputs)

# Compile the model
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

model.summary()

# 4. Model Training -----------------------------------------------------------

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_f1_score',
    save_best_only=True,
    mode='max'
)

# Custom callback for F1 score
class F1ScoreCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        val_pred = (self.model.predict(val_generator) > 0.5).astype("int32")
        val_true = val_generator.labels
        f1 = f1_score(val_true, val_pred)
        print(f" - val_f1: {f1:.4f}")
        logs['val_f1_score'] = f1

# Train the model
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=val_generator,
    callbacks=[early_stopping, checkpoint, F1ScoreCallback()]
)

# 5. Model Evaluation ---------------------------------------------------------

# Plot training history
def plot_history(history):
    plt.figure(figsize=(12, 4))
    
    # Plot accuracy
    plt.subplot(1, 3, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Val Accuracy')
    plt.title('Accuracy')
    plt.legend()
    
    # Plot loss
    plt.subplot(1, 3, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title('Loss')
    plt.legend()
    
    # Plot F1 score if available
    if 'val_f1_score' in history.history:
        plt.subplot(1, 3, 3)
        plt.plot(history.history['val_f1_score'], label='Val F1 Score')
        plt.title('F1 Score')
        plt.legend()
    
    plt.tight_layout()
    plt.show()

plot_history(history)

# Detailed classification report
val_pred = (model.predict(val_generator) > 0.5).astype("int32")
print(classification_report(val_generator.labels, val_pred))

# 6. Fine-tuning (Optional) --------------------------------------------------

# Unfreeze some layers for fine-tuning
base_model.trainable = True
for layer in base_model.layers[:100]:
    layer.trainable = False

# Recompile with lower learning rate
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

# Fine-tune for a few epochs
history_fine = model.fit(
    train_generator,
    epochs=10,
    initial_epoch=history.epoch[-1],
    validation_data=val_generator,
    callbacks=[early_stopping, checkpoint, F1ScoreCallback()]
)

# 7. Generate Predictions for Test Set ----------------------------------------

# Load best model
model = models.load_model('best_model.h5')

# Create test generator (no shuffle, no augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=TEST_DIR,
    x_col="image_id",
    y_col=None,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False
)

# Generate predictions
test_pred = (model.predict(test_generator) > 0.5).astype("int32")

# Create submission file
submission = pd.DataFrame({
    'image_id': test_df['image_id'],
    'soil_type': test_pred.flatten()
})

submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

KeyError: 'soil_type'

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from sklearn.metrics import f1_score, classification_report

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# 1. Data Loading with Robust Checks -------------------------------------------

DATA_DIR = "/kaggle/input/soil-classification-part-2/soil_competition-2025"
TRAIN_DIR = os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/train")
TEST_DIR = os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/test")

# Load training labels with column verification
try:
    train_df = pd.read_csv(os.path.join(DATA_DIR, "/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv"))
    
    # Check for expected columns - adjust based on your actual CSV
    if 'label' in train_df.columns:  # Common alternative column name
        train_df = train_df.rename(columns={'label': 'soil_type'})
    elif 'target' in train_df.columns:
        train_df = train_df.rename(columns={'target': 'soil_type'})
    
    # Verify we have the required columns
    assert 'image_id' in train_df.columns, "CSV missing 'image_id' column"
    assert 'soil_type' in train_df.columns, "CSV missing label column (tried 'soil_type', 'label', 'target')"
    
    print(f"Successfully loaded {len(train_df)} training samples")
    print("Sample data:")
    print(train_df.head())
    
except Exception as e:
    print(f"Error loading data: {e}")
    print("Available files in directory:")
    print(os.listdir(DATA_DIR))
    raise

# 2. Data Preparation ---------------------------------------------------------

# Convert labels to binary (0 or 1) if they aren't already
if set(train_df['soil_type'].unique()) != {0, 1}:
    print("Converting labels to binary format...")
    # Simple binary conversion - adjust based on your label encoding
    train_df['soil_type'] = train_df['soil_type'].apply(lambda x: 1 if x == 'soil' else 0)

# Image parameters
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# 3. Data Generators with Enhanced Error Handling -----------------------------

def create_data_generator(df, directory, is_train=True):
    datagen_args = {
        'rescale': 1./255,
        'validation_split': 0.2
    }
    
    if is_train:
        datagen_args.update({
            'rotation_range': 20,
            'width_shift_range': 0.2,
            'height_shift_range': 0.2,
            'shear_range': 0.2,
            'zoom_range': 0.2,
            'horizontal_flip': True
        })
    
    datagen = ImageDataGenerator(**datagen_args)
    
    try:
        generator = datagen.flow_from_dataframe(
            dataframe=df,
            directory=directory,
            x_col="image_id",
            y_col="soil_type",
            target_size=IMG_SIZE,
            class_mode='binary',
            batch_size=BATCH_SIZE,
            subset='training' if is_train else 'validation',
            validate_filenames=True
        )
        
        # Test loading one batch to verify everything works
        test_batch = next(generator)
        print(f"Successfully created generator with {len(generator)} batches")
        print(f"Batch shape: {test_batch[0].shape}")
        
        return generator
        
    except Exception as e:
        print(f"Error creating generator: {e}")
        print("Troubleshooting info:")
        print(f"- Directory exists: {os.path.exists(directory)}")
        if len(df) > 0:
            sample_file = os.path.join(directory, df.iloc[0]['image_id'])
            print(f"- Sample file exists: {os.path.exists(sample_file)} (path: {sample_file})")
        raise

# Create generators
try:
    train_generator = create_data_generator(train_df, TRAIN_DIR, is_train=True)
    val_generator = create_data_generator(train_df, TRAIN_DIR, is_train=False)
except Exception as e:
    print("Failed to create data generators")
    raise

# 4. Model Building -----------------------------------------------------------

def build_model():
    try:
        # Use EfficientNetB0 with pre-trained weights
        base_model = EfficientNetB0(
            input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3),
            include_top=False,
            weights='imagenet'
        )
        base_model.trainable = False

        # Create custom model head
        inputs = tf.keras.Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3))
        x = base_model(inputs, training=False)
        x = layers.GlobalAveragePooling2D()(x)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dropout(0.5)(x)
        outputs = layers.Dense(1, activation='sigmoid')(x)

        model = models.Model(inputs, outputs)

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy', 
                    tf.keras.metrics.Precision(name='precision'),
                    tf.keras.metrics.Recall(name='recall')]
        )

        print("Model successfully built")
        model.summary()
        return model

    except Exception as e:
        print(f"Error building model: {e}")
        raise

model = build_model()

# 5. Training with F1-Score Tracking ------------------------------------------

class F1ScoreCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_generator):
        super().__init__()
        self.val_generator = val_generator
    
    def on_epoch_end(self, epoch, logs=None):
        # Get all validation data
        val_true = []
        val_pred = []
        
        for i in range(len(self.val_generator)):
            x, y = self.val_generator[i]
            batch_pred = (self.model.predict(x, verbose=0) > 0.5).astype("int32")
            val_true.extend(y)
            val_pred.extend(batch_pred.flatten())
        
        f1 = f1_score(val_true, val_pred)
        print(f" - val_f1: {f1:.4f}")
        logs['val_f1'] = f1

# Training callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('best_model.h5', save_best_only=True),
    F1ScoreCallback(val_generator)
]

# Train the model
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=val_generator,
    callbacks=callbacks,
    verbose=1
)

# 6. Evaluation and Prediction ------------------------------------------------

# Load best model
model = models.load_model('best_model.h5')

# Evaluate on validation set
val_pred = (model.predict(val_generator) > 0.5).astype("int32")
print("\nClassification Report:")
print(classification_report(val_generator.labels, val_pred))

# Generate predictions for test set (if available)
if os.path.exists(TEST_DIR):
    test_df = pd.read_csv(os.path.join(DATA_DIR, "test_labels.csv"))
    
    test_datagen = ImageDataGenerator(rescale=1./255)
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=TEST_DIR,
        x_col="image_id",
        y_col=None,
        target_size=IMG_SIZE,
        class_mode=None,
        shuffle=False,
        batch_size=BATCH_SIZE
    )
    
    test_pred = (model.predict(test_generator) > 0.5).astype("int32")
    
    # Create submission file
    submission = pd.DataFrame({
        'image_id': test_df['image_id'],
        'soil_type': test_pred.flatten()
    })
    submission.to_csv('submission.csv', index=False)
    print("Submission file created: submission.csv")

2025-05-27 09:16:48.437504: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748337408.749266      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748337408.835468      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Successfully loaded 1222 training samples
Sample data:
           image_id  soil_type
0  img_ed005410.jpg          1
1  img_0c5ecd2a.jpg          1
2  img_ed713bb5.jpg          1
3  img_12c58874.jpg          1
4  img_eff357af.jpg          1
Converting labels to binary format...
Error creating generator: If class_mode="binary", y_col="soil_type" column values must be strings.
Troubleshooting info:
- Directory exists: True
- Sample file exists: True (path: /kaggle/input/soil-classification-part-2/soil_competition-2025/train/img_ed005410.jpg)
Failed to create data generators


TypeError: If class_mode="binary", y_col="soil_type" column values must be strings.