In [39]:
'''
Docstring for models.ipynb
using spectral_feature_data.csv to train ML models and check their performance
the training and testing split is 80% - 20%
training is done using cross validation with 5 folds
models going to be used:
- Random Forest regressor
- XGBoost regressor
- CNN

'''
%pip install tensorflow scikit-learn pandas numpy tensorflow
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
from tensorflow.keras import models, layers, Input

raw_features = pd.read_csv('spectral_feature_data.csv')

Note: you may need to restart the kernel to use updated packages.


In [40]:
#DEFINING EASY, MEDIUM, HARD FEATURES BASED ON OCCURANCE
total_rows = raw_features.shape[0]
df = pd.DataFrame()
print(raw_features.shape)
df_p_cols = [col for col in raw_features.columns if col.startswith('p')]
easy_features = []
print("Total rows in most occurance columns (>40k):")
for col in df_p_cols:
    if raw_features[col].isna().sum() < 10000:
        print(col,"\t\t", total_rows - raw_features[col].isna().sum())
        easy_features.append(col)

print("\n")
medium_features = [col for col in df_p_cols if col not in easy_features and raw_features[col].notna().sum() > 20000]
print("Total Rows in medium occurance columns (20k-40k):")
print(raw_features[medium_features].notna().sum())
hard_features = [col for col in df_p_cols if col not in easy_features and col not in medium_features]
print("\n")
print("Total Rows in hard occurance columns (<20k):") 
print(raw_features[hard_features].notna().sum())
spectral_cols = [col for col in raw_features.columns if col[0] != 'p']
print(f"\nSpectral Wavelengths: (No. of readings: {len(spectral_cols)})\n\n{spectral_cols}\n")

(44736, 36)
Total rows in most occurance columns (>40k):
p1.pH.index 		 44590
p2.N.wt_pct 		 40931
p2.OC.wt_pct 		 44624
p3.K.mg_kg 		 44488
p3.P.mg_kg 		 40764


Total Rows in medium occurance columns (20k-40k):
p1.EC.ds_m         21833
p1.Sand.wt_pct     27139
p4.CEC.cmolc_kg    22685
p4.CF.wt_pct       23237
dtype: int64


Total Rows in hard occurance columns (<20k):
p1.Clay.wt_pct          3915
p1.Silt.wt_pct          3901
p2.Zn.mg_kg                0
p3.Fe.mg_kg                0
p3.S.wt_pct              167
p4.BD.g_cm3             1162
p4.WR_10kPa.wt_pct       924
p4.WR_1500kPa.wt_pct     967
p4.WR_33kPa.wt_pct       919
dtype: int64

Spectral Wavelengths: (No. of readings: 18)

['410', '435', '460', '485', '510', '535', '560', '585', '610', '645', '680', '705', '730', '760', '810', '860', '900', '940']



In [41]:
print(medium_features)

['p1.EC.ds_m', 'p1.Sand.wt_pct', 'p4.CEC.cmolc_kg', 'p4.CF.wt_pct']


In [42]:
# ROW-WISE STANDARDIZATION (Standard Normal Variate - SNV) FOR SPECTRA
# ======================================================
def apply_snv(input_data):
    """
    Standard Normal Variate (SNV):
    Subtracts the Row Mean and divides by Row Std Dev.
    """
    # axis=1 means "calculate across the row"
    row_mean = input_data.mean(axis=1)
    row_std = input_data.std(axis=1)
    
    # We use numpy broadcasting to subtract/divide
    # (data - mean) / std
    snv_data = (input_data.sub(row_mean, axis=0)).div(row_std, axis=0)
    return snv_data

# Apply only to spectral columns
df_spectra_snv = apply_snv(raw_features[spectral_cols])


In [43]:
#this is to synthesize data by adding noise to existing data points
#helps to increase dataset size for better training of models especially deep learning models like CNNs
def augment_data(X_train, y_train, noise_level=0.01):
    """
    Doubles the dataset by adding a noisy version of every row.
    """
    # Generate Gaussian noise
    noise = np.random.normal(0, noise_level, X_train.shape)
    
    # Create new noisy samples
    X_noisy = X_train + noise
    
    # Combine original + noisy
    X_augmented = np.concatenate([X_train, X_noisy], axis=0)
    y_augmented = np.concatenate([y_train, y_train], axis=0) # Labels stay the same
    
    return X_augmented, y_augmented

For training the data, we will use a cnn on the easy features directly getting a 1D vector as output.\n
For the medium and hard data we will train individual models per feature getting a 0D output. \n
We will try using xgboost, and CNNs to see which model gives the best. \n
Final output will be a vector concatenating all of the outputs 

In [44]:

def build_flexible_model(input_shape, use_ph=False, use_ec=False):
    """
    Builds a model with optional pH and EC inputs.
    """
    inputs_list = []
    features_to_concat = []

    # --- 1. Spectral Branch (Always Present) ---
    input_spec = Input(shape=input_shape, name='spectral_in')
    inputs_list.append(input_spec)
    
    # Feature extraction logic
    x = layers.Conv1D(32, 3, activation='relu')(input_spec)
    x = layers.Conv1D(64, 3, activation='relu')(x)
    x = layers.GlobalAveragePooling1D()(x)
    features_to_concat.append(x)

    # --- 2. Conditional pH Branch ---
    if use_ph:
        input_ph = Input(shape=(1,), name='ph_in')
        inputs_list.append(input_ph)
        features_to_concat.append(input_ph)

    # --- 3. Conditional EC Branch ---
    if use_ec:
        input_ec = Input(shape=(1,), name='ec_in')
        inputs_list.append(input_ec)
        features_to_concat.append(input_ec)

    # --- 4. Merge & Output ---
    # If we have more than one feature source, we concatenate
    if len(features_to_concat) > 1:
        x = layers.Concatenate()(features_to_concat)
    else:
        x = features_to_concat[0] # Just the spectral features

    # Regression Head
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation='relu')(x)
    output = layers.Dense(1, name='output')(x)

    model = models.Model(inputs=inputs_list, outputs=output)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    return model

In [45]:
#Defining dataset, Scaling the values
# dataset contains: 
# 1. scaled spectral columns
# 2. ph and ec columns
# 3. other target features

features = easy_features + medium_features + hard_features
df_final = pd.concat([df_spectra_snv, raw_features[easy_features + medium_features + hard_features]], axis=1) #has snv spectral with non scaled other features
print(df.shape)

#these are the scaled versions of ph and ec to be used as inputs when required
scaler = StandardScaler()
ec_scaled = scaler.fit_transform(raw_features[["p1.EC.ds_m"]])
ph_scaled = scaler.fit_transform(raw_features[["p1.pH.index"]])




(0, 0)


In [46]:

def run_spectral_experiment(X_spectral, X_ph, X_ec, y, feature_name="Target"):
    """
    Runs 5-Fold CV on 4 configurations for a specific target feature.
    """
    
    print(f"\n==========================================")
    print(f" EXPERIMENT: Predicting {feature_name}")
    print(f"==========================================\n")

    # Define Configurations
    configs = [
        {"name": "Spectral Only",      "ph": False, "ec": False},
        {"name": "Spectral + pH",      "ph": True,  "ec": False},
        {"name": "Spectral + EC",      "ph": False, "ec": True},
        {"name": "Spectral + pH + EC", "ph": True,  "ec": True}
    ]

    # Initialize K-Fold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Store aggregated results
    summary_results = []

    for config in configs:
        print(f">> Configuration: {config['name']}")
        
        fold_maes = []
        fold_r2s = []
        all_preds = [] # To track range across all folds
        
        # Cross-Validation Loop
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_spectral)):
            
            # --- Data Splitting ---
            # 1. Spectral
            x_train_s, x_val_s = X_spectral[train_idx], X_spectral[val_idx]
            
            # 2. Target
            y_train, y_val = y[train_idx], y[val_idx]
            
            # 3. Dynamic Inputs
            train_inputs = [x_train_s]
            val_inputs   = [x_val_s]
            
            # Add pH if config requires
            if config['ph']:
                x_train_p, x_val_p = X_ph[train_idx], X_ph[val_idx]
                train_inputs.append(x_train_p)
                val_inputs.append(x_val_p)

            # Add EC if config requires
            if config['ec']:
                x_train_e, x_val_e = X_ec[train_idx], X_ec[val_idx]
                train_inputs.append(x_train_e)
                val_inputs.append(x_val_e)

            # --- Build & Train Model ---
            # (Assuming build_flexible_model is defined elsewhere in your code)
            model = build_flexible_model(
                input_shape=(X_spectral.shape[1], 1),
                use_ph=config['ph'],
                use_ec=config['ec']
            )
            
            model.fit(
                x=train_inputs,
                y=y_train,
                validation_data=(val_inputs, y_val),
                epochs=30,
                batch_size=32,
                verbose=0
            )
            
            # --- Evaluation ---
            preds = model.predict(val_inputs, verbose=0).flatten()
            
            # Calculate Metrics
            mae = mean_absolute_error(y_val, preds)
            r2  = r2_score(y_val, preds)
            
            fold_maes.append(mae)
            fold_r2s.append(r2)
            all_preds.extend(preds)

        # --- Aggregate Config Results ---
        avg_mae = np.mean(fold_maes)
        avg_r2  = np.mean(fold_r2s)
        min_pred = np.min(all_preds)
        max_pred = np.max(all_preds)
        
        print(f"   Avg MAE: {avg_mae:.4f} | Avg RÂ²: {avg_r2:.4f}")
        print(f"   Output Range: [{min_pred:.2f}, {max_pred:.2f}]")
        print("-" * 30)

        summary_results.append({
            "Config": config['name'],
            "MAE": avg_mae,
            "R2": avg_r2,
            "Range": f"{min_pred:.2f} - {max_pred:.2f}"
        })

    return pd.DataFrame(summary_results)

# --- Usage Example ---
# Assuming you have your full arrays ready: X_all_spectral, X_all_ph, X_all_ec, y_all_nitrogen

# results_df = run_spectral_experiment(
#     X_spectral=X_all_spectral, 
#     X_ph=X_all_ph, 
#     X_ec=X_all_ec, 
#     y=y_all_nitrogen, 
#     feature_name="Nitrogen Content"
# )

# print("\nFinal Summary Table:")
# print(results_df)

In [47]:
'''# Define your 4 configurations

configs = [
    {"name": "Spectral Only",      "ph": False, "ec": False},
    {"name": "Spectral + pH",      "ph": True,  "ec": False},
    {"name": "Spectral + EC",      "ph": False, "ec": True},
    {"name": "Spectral + pH + EC", "ph": True,  "ec": True}
]

# Dictionary to store results
results = {}
print("Starting Experiment Loop...\n")

for config in configs:
    print(f"--- Training Model: {config['name']} ---")
    
    # 1. Build Model
    model = build_flexible_model(
        input_shape=(df_spectra_snv.shape[1], 1),
        use_ph=config['ph'],
        use_ec=config['ec']
    )
    
    # 2. Prepare Dynamic Input Lists
    # Always start with spectral
    train_inputs = [X_train_spectral]
    val_inputs   = [X_val_spectral]
    
    #---if ph and/or ec is as an input, it will be scaled otherwise untouched 
    if config['ph']:
    
        train_inputs.append(X_train_ph)
        val_inputs.append(X_val_ph)
        
    if config['ec']:
        train_inputs.append(X_train_ec)
        val_inputs.append(X_val_ec)
        
    # 3. Train
    history = model.fit(
        x=train_inputs,
        y=y_train,
        validation_data=(val_inputs, y_val),
        epochs=30,     # Adjust as needed
        batch_size=32,
        verbose=0      # Silent mode to keep output clean
    )
    
    # 4. Store final validation MAE
    final_mae = history.history['val_mae'][-1]
    results[config['name']] = final_mae
    print(f"Final Validation MAE: {final_mae:.4f}\n")

# --- Summary ---
print("--- Experiment Results ---")
for name, mae in results.items():
    print(f"{name}: {mae:.4f}")'''

'# Define your 4 configurations\n\nconfigs = [\n    {"name": "Spectral Only",      "ph": False, "ec": False},\n    {"name": "Spectral + pH",      "ph": True,  "ec": False},\n    {"name": "Spectral + EC",      "ph": False, "ec": True},\n    {"name": "Spectral + pH + EC", "ph": True,  "ec": True}\n]\n\n# Dictionary to store results\nresults = {}\nprint("Starting Experiment Loop...\n")\n\nfor config in configs:\n    print(f"--- Training Model: {config[\'name\']} ---")\n    \n    # 1. Build Model\n    model = build_flexible_model(\n        input_shape=(df_spectra_snv.shape[1], 1),\n        use_ph=config[\'ph\'],\n        use_ec=config[\'ec\']\n    )\n    \n    # 2. Prepare Dynamic Input Lists\n    # Always start with spectral\n    train_inputs = [X_train_spectral]\n    val_inputs   = [X_val_spectral]\n    \n    #---if ph and/or ec is as an input, it will be scaled otherwise untouched \n    if config[\'ph\']:\n    \n        train_inputs.append(X_train_ph)\n        val_inputs.append(X_val_ph

In [48]:
'''import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def build_and_train_cnn(X_train, y_train, X_val, y_val, feature_name):
    """
    Builds and trains a 1D-CNN regressor for a specific soil feature.
    """
    # Reshape input for 1D CNN: (samples, steps, channels)
    # Spectral data usually has 1 channel
    X_train_reshaped = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_val_reshaped = X_val.values.reshape(X_val.shape[0], X_val.shape[1], 1)

    model = models.Sequential([
        # First Layer: Detects local spectral patterns
        layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        #layers.MaxPooling1D(pool_size=2),
        
        # Second Layer: Higher-level feature extraction
        layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
        layers.GlobalAveragePooling1D(), # Reduces dimensionality to prevent overfitting
        
        # Dense layers for regression
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2), # Regularization
        layers.Dense(32, activation='relu'),
        layers.Dense(1) # Final regression output
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    print(f"\n--- Training Model for Target: {feature_name} ---")
    history = model.fit(
        X_train_reshaped, y_train,
        validation_data=(X_val_reshaped, y_val),
        epochs=50,
        batch_size=16,
        verbose=1
    )
    
    return model, history

# --- Iterative Execution Logic ---

# Assuming 'df' contains: spectral_cols, 'ph', 'ec', and targets ['N', 'P', 'K', 'OC']
spectral_columns = [col for col in df.columns if col.startswith('wave_')] # adjust name
target_features = ['N', 'P', 'K', 'Organic_Carbon'] 

# Standardize spectral data
scaler = StandardScaler()
X = scaler.fit_transform(df[spectral_columns])
X = pd.DataFrame(X)

trained_models = {}

for target in target_features:
    y = df[target]
    
    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train
    model, history = build_and_train_cnn(X_train, y_train, X_val, y_val, target)
    
    # Store the model for later prediction
    trained_models[target] = model'''

'import pandas as pd\nimport numpy as np\nimport tensorflow as tf\nfrom tensorflow.keras import layers, models\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\ndef build_and_train_cnn(X_train, y_train, X_val, y_val, feature_name):\n    """\n    Builds and trains a 1D-CNN regressor for a specific soil feature.\n    """\n    # Reshape input for 1D CNN: (samples, steps, channels)\n    # Spectral data usually has 1 channel\n    X_train_reshaped = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)\n    X_val_reshaped = X_val.values.reshape(X_val.shape[0], X_val.shape[1], 1)\n\n    model = models.Sequential([\n        # First Layer: Detects local spectral patterns\n        layers.Conv1D(filters=32, kernel_size=3, activation=\'relu\', input_shape=(X_train.shape[1], 1)),\n        #layers.MaxPooling1D(pool_size=2),\n        \n        # Second Layer: Higher-level feature extraction\n        layers.Conv1D(filters=64, kernel_size