In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
import torch
from torchvision import models, transforms

# Paths
TRAIN_DIR = "/kaggle/input/soil-classification/soil_classification-2025/train"
TEST_DIR = "/kaggle/input/soil-classification/soil_classification-2025/test"
TRAIN_LABELS = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv")
TEST_IDS = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv")

# Encode labels
le = LabelEncoder()
TRAIN_LABELS['label'] = le.fit_transform(TRAIN_LABELS['soil_type'])

# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model for feature extraction
model_cnn = models.resnet18()  # no weights, just architecture
model_cnn.load_state_dict(torch.load("/kaggle/input/resnet/pytorch/default/1/resnet18-f37072fd.pth", map_location="cpu"))
model_cnn.fc = torch.nn.Identity()
model_cnn = model_cnn.to(device)
model_cnn.eval()

# Feature extractor function
def extract_features(image_paths):
    features = []
    for path in tqdm(image_paths, desc="Extracting features"):
        img = Image.open(path).convert("RGB")
        img_tensor = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            feat = model_cnn(img_tensor).cpu().numpy().flatten()
        features.append(feat)
    return np.array(features)

# Prepare paths
TRAIN_LABELS['filepath'] = TRAIN_LABELS['image_id'].apply(lambda x: os.path.join(TRAIN_DIR, x))
TEST_IDS['filepath'] = TEST_IDS['image_id'].apply(lambda x: os.path.join(TEST_DIR, x))

# Extract features
train_features = extract_features(TRAIN_LABELS['filepath'].values)
test_features = extract_features(TEST_IDS['filepath'].values)

# Train ML classifier
X = train_features
y = TRAIN_LABELS['label'].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_preds = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1} ---")
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X[train_idx], y[train_idx])
    val_preds = clf.predict(X[val_idx])
    print(classification_report(y[val_idx], val_preds, target_names=le.classes_))
    all_preds.append(f1_score(y[val_idx], val_preds, average='macro'))

print("Average F1 across folds:", np.mean(all_preds))

# Train on full data & predict test
final_model = RandomForestClassifier(n_estimators=200, random_state=42)
final_model.fit(X, y)
test_preds = final_model.predict(test_features)
test_labels = le.inverse_transform(test_preds)

# Save submission
submission = pd.DataFrame({
    'image_id': TEST_IDS['image_id'],
    'soil_type': test_labels
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved.")


Extracting features: 100%|██████████| 1222/1222 [01:32<00:00, 13.27it/s]
Extracting features: 100%|██████████| 341/341 [00:24<00:00, 13.83it/s]


--- Fold 1 ---
               precision    recall  f1-score   support

Alluvial soil       0.95      0.93      0.94       106
   Black Soil       0.95      0.91      0.93        46
    Clay soil       1.00      0.93      0.96        40
     Red soil       0.87      0.98      0.92        53

     accuracy                           0.94       245
    macro avg       0.94      0.94      0.94       245
 weighted avg       0.94      0.94      0.94       245

--- Fold 2 ---
               precision    recall  f1-score   support

Alluvial soil       0.95      0.94      0.95       106
   Black Soil       0.98      0.94      0.96        47
    Clay soil       0.90      0.90      0.90        40
     Red soil       0.93      0.98      0.95        52

     accuracy                           0.94       245
    macro avg       0.94      0.94      0.94       245
 weighted avg       0.94      0.94      0.94       245

--- Fold 3 ---
               precision    recall  f1-score   support

Alluvial soil