# In [None]:
#!/usr/bin/env python3

"""

Author: Annam.ai IIT Ropar
Team Name: SoilClassifiers
Team Members: Krishnopreya , Debapriya , Shweta, Namya, Nikhil
Leaderboard Rank:101

"""
import os
import sys
import torch
import numpy as np

# Add the src directory to the module search path so the project modules
# imported below (preprocessing, training, postprocessing) can be found.
# NOTE: 'src' is a relative path, so it is resolved against the current
# working directory -- run this script from the directory that contains src/.
sys.path.append('src')

from preprocessing import load_and_split_data, prepare_datasets
from training import train_model, SoilClassifier
from postprocessing import process_results

_DEFAULT_DATA_DIR = "/kaggle/input/soil-classification/soil_classification-2025"
_DEFAULT_OUTPUT_DIR = "/kaggle/working"


def _print_banner(title: str) -> None:
    """Print a blank line, then ``title`` framed by two 60-character rules."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


def main(
    data_dir: str = _DEFAULT_DATA_DIR,
    output_dir: str = _DEFAULT_OUTPUT_DIR,
    num_epochs: int = 20,
    batch_size: int = 32,
) -> None:
    """Run the soil classification training pipeline end to end.

    The pipeline seeds the torch and numpy RNGs, selects CUDA when
    available, then runs four steps: load/split the training CSV, build the
    train/validation/test datasets, train the model, and reload the saved
    checkpoint for post-processing. A summary is printed at the end.

    Calling ``main()`` with no arguments uses the Kaggle paths and the
    hyper-parameters that were previously hard-coded.

    Args:
        data_dir: Directory holding ``train/``, ``test/``,
            ``train_labels.csv`` and ``test_ids.csv``.
        output_dir: Directory where ``best_model.pth`` is stored; it is also
            passed to ``process_results``. Created if it does not exist.
        num_epochs: Forwarded to ``train_model``.
        batch_size: Forwarded to ``train_model``.
    """
    print("Starting Soil Classification Training Pipeline")
    print("=" * 60)

    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Check device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Input locations, all derived from data_dir
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "test")
    train_csv = os.path.join(data_dir, "train_labels.csv")
    test_csv = os.path.join(data_dir, "test_ids.csv")

    # Output locations; make sure the directory exists before anything is
    # written into it (a no-op when it is already there).
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, "best_model.pth")

    print(f"Data directory: {data_dir}")
    print(f"Output directory: {output_dir}")

    # Step 1: Load and split data
    _print_banner("STEP 1: Loading and splitting data")

    train_df, val_df = load_and_split_data(train_csv)
    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")

    # Step 2: Prepare datasets
    _print_banner("STEP 2: Preparing datasets")

    train_dataset, val_dataset, test_dataset, test_df = prepare_datasets(
        train_df, val_df, test_csv, train_dir, test_dir
    )

    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")

    # Step 3: Train model
    _print_banner("STEP 3: Training model")

    model, history = train_model(
        train_dataset,
        val_dataset,
        device,
        model_path,
        num_epochs=num_epochs,
        batch_size=batch_size,
    )

    print("Training completed!")

    # Step 4: Post-processing and evaluation
    _print_banner("STEP 4: Post-processing and evaluation")

    # Reload the checkpoint stored at model_path. map_location remaps the
    # saved tensors onto the device selected above, so loading does not
    # depend on the device the checkpoint was written from.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Process results
    submission_df = process_results(
        model, val_dataset, test_dataset, test_df, history, device, output_dir
    )

    _print_banner("TRAINING PIPELINE COMPLETED SUCCESSFULLY!")

    # Artifacts expected in output_dir, as (label, file name) pairs.
    artifacts = (
        ("Submission", "submission.csv"),
        ("Confusion Matrix", "confusion_matrix.png"),
        ("Metrics", "ml-metrics.json"),
        ("Training History", "training_history.png"),
        ("Architecture Info", "architecture.json"),
    )
    print("\nFiles generated:")
    print(f"- Model: {model_path}")
    for label, filename in artifacts:
        print(f"- {label}: {os.path.join(output_dir, filename)}")

    # max() over the per-epoch history is the best epoch, not the last one,
    # so label it accordingly.
    print(f"\nBest validation accuracy: {max(history['val_acc']):.2f}%")
    print(f"Submission shape: {submission_df.shape}")
    print("\nPrediction distribution:")
    print(submission_df['soil_type'].value_counts())

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
