# Data Exploration

Exploratory data analysis of the Australian soil dataset (2,625 samples).

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to all figures created below
sns.set_theme(style="whitegrid")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook
%matplotlib inline

In [None]:
# Read the raw soil export, then report its row count and column names
df = pd.read_csv("../data/raw/soil_data_export.csv")
n_rows = df.shape[0]
column_names = df.columns.tolist()
print(f"Samples: {n_rows}")
print(f"Columns: {column_names}")
# Trailing expression: the notebook renders a preview of the first rows
df.head()

In [None]:
# Target variable summary statistics
# `targets` lists the columns of interest; `available` keeps only those
# actually present in the loaded frame so later cells can guard on it.
targets = ["ph", "cec", "esp", "soc", "ca", "mg", "na"]
available = [t for t in targets if t in df.columns]
if not available:
    print("No target columns found in the dataset.")
# describe() raises ValueError on a frame with no columns, so only call it
# when at least one target exists. Kept as the trailing expression so the
# notebook still renders the summary table.
df[available].describe() if available else None

In [None]:
# Count null entries in each available target column
if available:
    missing = df[available].isna().sum()
    # One print call; sep="\n" puts the header and the counts on separate lines
    print("Missing values:", missing, sep="\n")

In [None]:
# Histogram of every available target on a fixed 2x4 grid of axes
if available:
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()
    # Pair each target column with its own axes
    for ax, col in zip(axes, available):
        values = df[col].dropna()
        ax.hist(values, bins=50, edgecolor="black", alpha=0.7)
        ax.set_title(col.upper())
        ax.set_xlabel(col)
    # Hide the grid cells that received no histogram
    for ax in axes[len(available):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Heatmap of pairwise correlations; needs at least two target columns
if len(available) >= 2:
    corr = df[available].corr()
    corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
    # Diverging palette centred on zero, with each cell annotated to 2 decimals
    sns.heatmap(
        corr,
        annot=True,
        fmt=".2f",
        cmap="RdBu_r",
        center=0,
        ax=corr_ax,
    )
    corr_ax.set_title("Target Variable Correlations")
    corr_fig.tight_layout()
    plt.show()

In [None]:
# Scatter plot of sample coordinates, drawn only when both columns exist
has_coords = {"latitude", "longitude"}.issubset(df.columns)
if has_coords:
    lon = df["longitude"]
    lat = df["latitude"]
    n_samples = len(df)
    plt.figure(figsize=(10, 8))
    # Small, semi-transparent markers
    plt.scatter(lon, lat, s=5, alpha=0.5)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title(f"Sample Locations (n={n_samples})")
    plt.tight_layout()
    plt.show()