Preprocessing

In [11]:
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter


# Load the dataset (update the filename as needed)
file_path = "/content/drive/MyDrive/ImagoAI_Internship_Assignment/TASK-ML-INTERN.csv"  # Replace with actual file path
df = pd.read_csv(file_path)

# Extract hyperspectral features (excluding ID and target variable)
X = df.iloc[:, 1:-1].values  # Features (wavelength bands)
y = df['vomitoxin_ppb'].values  # Target variable

# ✅ Standard Normal Variate (SNV) Normalization
def snv(input_data):
    """Apply Standard Normal Variate (SNV) transformation."""
    return (input_data - np.mean(input_data, axis=1, keepdims=True)) / np.std(input_data, axis=1, keepdims=True)

X_snv = snv(X)

# ✅ Savitzky-Golay Filtering for Smoothing Spectral Data
X_sg = savgol_filter(X_snv, window_length=5, polyorder=2, axis=1)

# Convert to DataFrame for easier handling
df_preprocessed = pd.DataFrame(X_sg, columns=df.columns[1:-1])
df_preprocessed['vomitoxin_ppb'] = y

# Save preprocessed data for the next steps
preprocessed_file = "/content/drive/MyDrive/ImagoAI_Internship_Assignment/preprocessed_hyperspectral_data.csv"
df_preprocessed.to_csv(preprocessed_file, index=False)

# Return file path for verification
preprocessed_file

'/content/drive/MyDrive/ImagoAI_Internship_Assignment/preprocessed_hyperspectral_data.csv'

In [12]:
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import cdist

def successive_projections_algorithm(X, num_features):
    """
    Implements the Successive Projections Algorithm (SPA) for feature selection.

    Parameters:
        X (numpy.ndarray): Preprocessed hyperspectral data (samples x features).
        num_features (int): Number of relevant features to select.

    Returns:
        selected_features (list): Indices of selected wavelengths.
    """
    num_samples, num_wavelengths = X.shape
    selected_features = []
    remaining_features = list(range(num_wavelengths))

    # Start with the most representative feature (max variance)
    first_feature = np.argmax(np.var(X, axis=0))
    selected_features.append(first_feature)
    remaining_features.remove(first_feature)

    for _ in range(num_features - 1):
        # Compute distances between selected and remaining features
        distances = cdist(X[:, selected_features].T, X[:, remaining_features].T, metric='euclidean')

        # Select the feature that maximizes projection distance
        next_feature = remaining_features[np.argmax(np.min(distances, axis=0))]
        selected_features.append(next_feature)
        remaining_features.remove(next_feature)

    return selected_features

# Define number of wavelengths to retain (e.g., 20 most relevant)
num_selected_features = 20

# Apply SPA to preprocessed data
selected_feature_indices = successive_projections_algorithm(df_preprocessed.iloc[:, :-1].values, num_selected_features)

# Extract selected wavelengths
df_spa_selected = df_preprocessed.iloc[:, selected_feature_indices]
df_spa_selected['vomitoxin_ppb'] = df_preprocessed['vomitoxin_ppb']

# Save SPA-selected data for model training
spa_file = "/content/drive/MyDrive/ImagoAI_Internship_Assignment/spa_selected_hyperspectral_data.csv"
df_spa_selected.to_csv(spa_file, index=False)

# Return selected feature indices & file path for verification
selected_feature_indices, spa_file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_spa_selected['vomitoxin_ppb'] = df_preprocessed['vomitoxin_ppb']


([np.int64(0),
  339,
  120,
  43,
  98,
  180,
  79,
  91,
  422,
  144,
  106,
  84,
  23,
  6,
  250,
  203,
  88,
  447,
  94,
  112],
 '/content/drive/MyDrive/ImagoAI_Internship_Assignment/spa_selected_hyperspectral_data.csv')

In [18]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



# Load SPA-selected dataset (assuming it's saved from previous step)
spa_file_path = "/content/drive/MyDrive/ImagoAI_Internship_Assignment/spa_selected_hyperspectral_data.csv"
df_spa_selected = pd.read_csv(spa_file_path)

# Extract features and target
X = df_spa_selected.iloc[:, :-1].values  # Selected features
y = df_spa_selected['vomitoxin_ppb'].values  # Target variable

# Split into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# ✅ Train XGBoost Model
xgb_model = xgb.XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# ✅ Compute Evaluation Metrics
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return f"{model_name} → MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}"


xgb_results = evaluate_model(y_test, y_pred_xgb, "XGBoost")

# Return results
xgb_results


'XGBoost → MAE: 2558.92, RMSE: 8382.82, R²: 0.75'

In [19]:
best_xgb_model = xgb_model

In [20]:
import xgboost as xgb

# Save using XGBoost's recommended method
best_xgb_model.save_model("/content/drive/MyDrive/ImagoAI_Internship_Assignment/xgboost_model.json")