# SHAP Feature Selection for Yield Prediction
This notebook demonstrates how to apply SHAP (SHapley Additive exPlanations) to interpret a machine learning model for crop yield prediction. Each code cell is explained in detail to help you understand what it does, how to use it, and how to adapt it to your own data.

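# Cell 1: This cell imports the required Python packages and then runs a complete first-pass pipeline: it loads the GeoPackage, standardizes the features, ranks them with SHAP using a Random Forest fitted on a 10% sample, keeps the features whose importance is above the median, and trains and evaluates an XGBoost model on them. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.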

In [None]:
# Cell 1
# Description: Import the libraries, then run SHAP-based feature selection and train an XGBoost yield model.
# Input: the GeoPackage at file_path. Outputs: prediction_plot.png and feature_importance_plot.png.
# Requirements: Install pandas, numpy, scikit-learn (sklearn), shap, matplotlib, geopandas, dask and dask-ml.

# Import necessary libraries
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt
from dask import dataframe as dd
from dask.distributed import Client
from dask_ml.xgboost import XGBRegressor
import dask.array as da
import dask.dataframe as dd

# Cell 1 pipeline: load the GeoPackage, standardize the features, rank them
# with SHAP on a Random Forest fitted to a 10% sample, keep the features whose
# mean |SHAP| is above the median, then train and evaluate an XGBoost model.

# Configure Matplotlib for batch job processing (file-only 'Agg' backend)
import matplotlib
matplotlib.use('Agg')

# Start a Dask cluster for parallel processing
client = Client()  # On Setonix, let Dask manage resources across CPUs

# Load the dataset. gpd.read_file returns an in-memory GeoDataFrame, so the
# preprocessing below runs in pandas before the result is handed to Dask.
file_path = "/scratch/pawsey0988/mibrahim/updated_boundaries_with_NDVI_for_2022.gpkg"
data = gpd.read_file(file_path)

# Define target variable and features
target_variable = 'yield'
non_feature_columns = [target_variable, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob',
                       'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
feature_frame = data.drop(columns=non_feature_columns)
target_frame = data[target_variable]

# Standardize the features. sklearn's StandardScaler returns a NumPy array,
# which dd.from_dask_array cannot wrap; rebuild a DataFrame instead and keep
# the original row index so features and target stay aligned.
scaler = StandardScaler()
feature_frame = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)

# Handle missing values (same order as before: scale first, then impute with
# the median of each scaled column / of the target)
feature_frame = feature_frame.fillna(feature_frame.median())
target_frame = target_frame.fillna(target_frame.median())

# Expose the cleaned data as Dask collections; from_pandas preserves the index
features = dd.from_pandas(feature_frame, npartitions=8)
target = dd.from_pandas(target_frame, npartitions=8)

# Sample for SHAP feature importance calculation. The seed makes the selected
# feature set reproducible; target rows are looked up by the sampled index.
sample = features.sample(frac=0.1, random_state=42).compute()  # Smaller subset for SHAP
target_sample = target_frame.loc[sample.index]

# Train a preliminary model on CPU for SHAP
from sklearn.ensemble import RandomForestRegressor
prelim_model = RandomForestRegressor(n_estimators=100, random_state=42)
prelim_model.fit(sample, target_sample)

# Use SHAP for feature importance
explainer = shap.Explainer(prelim_model)
shap_values = explainer(sample)

# Select important features (mean |SHAP| strictly above the 50th percentile)
shap_importance = np.abs(shap_values.values).mean(axis=0)
important_features = sample.columns[shap_importance > np.percentile(shap_importance, 50)]

# Filter important features
features = features[list(important_features)]

# Split data for training and testing (sklearn split, so inputs are pandas)
X_train, X_test, y_train, y_test = train_test_split(features.compute(), target.compute(), test_size=0.2, random_state=42)

# Train the final model using Dask with XGBoost on CPU (use ROCm-compatible ML tools if required)
# NOTE(review): the Dask XGBoost estimator is assumed to need Dask collections
# for fit(), so the pandas training split is wrapped again. X_train and
# y_train share one index, so from_pandas gives them matching partitions.
# Confirm against the installed dask-ml version.
model = XGBRegressor(n_estimators=200, random_state=42)
model.fit(dd.from_pandas(X_train, npartitions=8), dd.from_pandas(y_train, npartitions=8))

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Save prediction plot
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual Yield")
plt.ylabel("Predicted Yield")
plt.title(f"XGBoost Prediction (R2={r2:.2f}, MSE={mse:.2f})")
plt.savefig("prediction_plot.png", bbox_inches='tight')

# Feature importance plot, bars sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 8))
plt.title("Feature Importances")
plt.bar(range(len(important_features)), importances[indices], align="center")
plt.xticks(range(len(important_features)), important_features[indices], rotation=90)
plt.savefig("feature_importance_plot.png", bbox_inches='tight')

print("Processing complete.")

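# Cell 2: This cell imports the required Python packages and then trains a Dask XGBoost model on all features (missing values filled with 0, features standardized), computes SHAP values for the test split, and saves the model, the SHAP values, and the diagnostic plots. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.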

In [None]:
# Cell 2
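# Description: Import the libraries, then train a Dask XGBoost model on all features and explain it with SHAP.
# Input: the GeoPackage at file_path. Outputs: pickled SHAP values and model, plus prediction, SHAP summary and feature-importance plots.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, joblib, geopandas, dask and dask-ml.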

# Import necessary libraries
import dask.dataframe as dd
import geopandas as gpd
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import shap
from dask.distributed import Client
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import StandardScaler
from dask_ml.xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Cell 2 pipeline: train a Dask XGBoost model on every feature, explain the
# test split with SHAP, and save the model, SHAP values and diagnostic plots.

# Configure Matplotlib for batch job processing (file-only 'Agg' backend)
matplotlib.use('Agg')

# Setup Dask Client across all available work partitions
client = Client(n_workers=256, threads_per_worker=1, memory_limit='4GB')
print(client)

# Load data as Dask DataFrame
file_path = "/scratch/pawsey0988/mibrahim/updated_boundaries_with_NDVI_for_2022.gpkg"
data = gpd.read_file(file_path)
data = dd.from_pandas(data, npartitions=256)

# Define target and features
target_column = 'yield'
drop_columns = ['ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
features = data.drop(columns=[target_column] + drop_columns)
target = data[target_column]

# Remember the feature names now: they label the importance plot below.
# No feature selection happens in this cell, so every column is a feature.
feature_names = features.columns

# Scale features
scaler = StandardScaler()
features = scaler.fit_transform(features.fillna(0))  # Handling NaNs by filling with 0

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train model on full dataset using Dask-optimized XGBoost
xgb_model = XGBRegressor(n_estimators=150, n_jobs=256, tree_method='hist')
xgb_model.fit(X_train, y_train)

# SHAP and the sklearn metrics work on in-memory data, so materialise the
# test split once and reuse it everywhere below.
X_test_local = X_test.compute()
y_test_local = y_test.compute()

# SHAP analysis
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_local)

# Save SHAP values to a file for later combination if needed
joblib.dump(shap_values, "/scratch/pawsey0988/mibrahim/shap_values_partition.pkl")

# Save model to disk
joblib.dump(xgb_model, "/scratch/pawsey0988/mibrahim/final_yield_model.pkl")

# Evaluate and plot results
y_pred = xgb_model.predict(X_test)
y_pred_local = y_pred.compute()
mse = mean_squared_error(y_test_local, y_pred_local)
r2 = r2_score(y_test_local, y_pred_local)

plt.figure(figsize=(10, 6))
plt.scatter(y_test_local, y_pred_local, alpha=0.5)
plt.xlabel("Actual Yield")
plt.ylabel("Predicted Yield")
plt.title(f"XGBoost Prediction (R2={r2:.2f}, MSE={mse:.2f})")
plt.savefig("/scratch/pawsey0988/mibrahim/prediction_plot.png", bbox_inches='tight')

plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test_local, show=False)
plt.savefig("/scratch/pawsey0988/mibrahim/shap_summary_plot.png", bbox_inches='tight')

# Feature importance plot, bars sorted from most to least important. The
# ticks use this cell's own feature names so they match the importances.
importances = xgb_model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 8))
plt.title("Feature Importances")
plt.bar(range(len(feature_names)), importances[indices], align="center")
plt.xticks(range(len(feature_names)), feature_names[indices], rotation=90)
plt.savefig("feature_importance_plot.png", bbox_inches='tight')


print("Distributed SHAP processing complete.")

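# Cell 3: This cell imports the required Python packages and wraps the workflow in a `main()` function: a preliminary Random Forest plus SHAP selects the top 30% of features, a second Random Forest is trained on that subset, and the split datasets, the model, and the SHAP plots are saved under the scratch directory. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.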

In [None]:
# Cell 3
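# Description: Import the libraries, then define and run main(): SHAP-based selection of the top 30% of features followed by a Random Forest.
# Input: the GeoPackage at file_path. Outputs: pickled train/test splits and model, plus prediction and SHAP plots under the scratch directory.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, joblib, dask, dask-ml and dask-geopandas.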

# Import necessary libraries
import dask.dataframe as dd
import dask_geopandas as dgpd
import shap
import joblib
import numpy as np
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Configure Matplotlib for batch job processing (file-only 'Agg' backend)
matplotlib.use('Agg')

def main():
    """Run the SHAP-guided Random Forest yield pipeline on the full dataset.

    Steps: load the GeoPackage with dask-geopandas, fill NaNs with 0 and
    standardize the features, fit a preliminary Random Forest, rank features
    by mean absolute SHAP value, keep the top 30%, then train, evaluate and
    explain a second Random Forest on that subset.

    Side effects: writes the train/test splits, the trained model and four
    PNG plots under /scratch/pawsey0988/mibrahim/, and prints progress.
    """
    # The context manager closes the client even if a later step fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "/scratch/pawsey0988/mibrahim/updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data)

        # Define target and features
        target_column = 'yield'
        drop_columns = ['ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=[target_column] + drop_columns)
        target = data[target_column]

        # Scale features
        scaler = StandardScaler()
        features = scaler.fit_transform(features.fillna(0))  # Handling NaNs by filling with 0
        print(features)

        # Preliminary SHAP analysis to select top 30% features. The data is
        # materialised once and reused by the fit, SHAP and the summary plot.
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            features_df = features.compute()
            target_values = target.compute()
            preliminary_rf_model.fit(features_df, target_values)
        feature_names = features_df.columns

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(features_df)
        shap_importance = np.abs(shap_values).mean(axis=0)

        # Summary plot for SHAP values before feature selection
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, features_df, feature_names=feature_names, show=False)
        plt.savefig("/scratch/pawsey0988/mibrahim/shap_summary_plot_before_training_rf.png", bbox_inches='tight')

        # Select top 30% features based on SHAP importance. Keep at least one:
        # a count of 0 would make the [-0:] slice return every feature.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        top_feature_indices = np.argsort(shap_importance)[-num_top_features:]
        selected_feature_names = feature_names[top_feature_indices]
        # Select by column name; a DataFrame does not accept [:, idx] indexing.
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Materialise each split once so the fit, metrics, SHAP and plots all
        # see the same in-memory data.
        X_train_df = X_train.compute()
        X_test_df = X_test.compute()
        y_train_values = y_train.compute()
        y_test_values = y_test.compute()

        # Saving the datasets (the computed frames, not the lazy task graphs)
        joblib.dump(X_train_df, "/scratch/pawsey0988/mibrahim/X_train_selected.pkl")
        joblib.dump(X_test_df, "/scratch/pawsey0988/mibrahim/X_test_selected.pkl")
        joblib.dump(y_train_values, "/scratch/pawsey0988/mibrahim/y_train.pkl")
        joblib.dump(y_test_values, "/scratch/pawsey0988/mibrahim/y_test.pkl")

        # Train model using RandomForestRegressor with selected features
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            rf_model.fit(X_train_df, y_train_values)

        # Save the trained Random Forest model
        model_path = "/scratch/pawsey0988/mibrahim/rf_model.pkl"
        joblib.dump(rf_model, model_path)
        print("Random Forest model trained and saved.")

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_df)
        mse = mean_squared_error(y_test_values, y_pred)
        r2 = r2_score(y_test_values, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_values, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("/scratch/pawsey0988/mibrahim/prediction_plot_rf.png", bbox_inches='tight')

        # Final SHAP analysis with the trained model
        explainer_final = shap.TreeExplainer(rf_model)
        shap_values_final = explainer_final.shap_values(X_test_df)

        # Summary plot for SHAP values after training
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values_final, X_test_df, feature_names=selected_feature_names, show=False)
        plt.savefig("/scratch/pawsey0988/mibrahim/shap_summary_plot_after_training_rf.png", bbox_inches='tight')

        # Force plot for a single prediction (first test instance);
        # show=False leaves the figure open for the savefig call below.
        plt.figure(figsize=(10, 6))
        shap.force_plot(explainer_final.expected_value, shap_values_final[0], X_test_df.iloc[0], matplotlib=True, show=False)
        plt.savefig("/scratch/pawsey0988/mibrahim/shap_force_plot_rf.png", bbox_inches='tight')

        print("SHAP analysis completed and plots saved.")
        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

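# Cell 4: This cell imports the required Python packages and runs a local variant of Cell 3: it reads the GeoPackage from a Windows path, restricts the analysis to the first 2000 rows, leaves dataset and model saving commented out, and writes the plots to the working directory. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.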

In [None]:
# Cell 4
# Description: Import the libraries, then define and run main(): the Cell 3 workflow on the first 2000 rows of a local copy of the data.
# Input: the GeoPackage at file_path. Outputs: prediction and SHAP plots in the working directory.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, joblib, dask, dask-ml and dask-geopandas.

# Import necessary libraries
import dask_geopandas as dgpd
import shap
import joblib
import numpy as np
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Configure Matplotlib for batch job processing (file-only 'Agg' backend)
matplotlib.use('Agg')

def main():
    """Run the SHAP-guided Random Forest pipeline on the first 2000 rows.

    Local variant of the scratch-directory workflow: load the GeoPackage from
    a Windows path, keep the first 2000 rows, fill NaNs with 0 and
    standardize, rank features by mean absolute SHAP value of a preliminary
    Random Forest, keep the top 30%, then train, evaluate and explain a
    second Random Forest on that subset.

    Side effects: writes four PNG plots to the current working directory and
    prints progress. The datasets and the model are not saved in this variant.
    """
    # The context manager closes the client even if a later step fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions. The path
        # uses the same separators throughout; the previous literal had a lone
        # backslash before "Desktop", which is the invalid escape "\D".
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Keep the first 2000 rows. Tuple slicing is not valid on a Dask
        # DataFrame. head(compute=False) stays lazy, and npartitions=-1 reads
        # as many partitions as needed to collect 2000 rows.
        data = data.head(2000, npartitions=-1, compute=False)
        print(data)

        # Define target and features
        target_column = 'yield'
        drop_columns = ['ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=[target_column] + drop_columns)
        target = data[target_column]

        # Scale features
        scaler = StandardScaler()
        features = scaler.fit_transform(features.fillna(0))  # Handling NaNs by filling with 0
        print(features)

        # Preliminary SHAP analysis to select top 30% features. The data is
        # materialised once and reused by the fit, SHAP and the summary plot.
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            features_df = features.compute()
            target_values = target.compute()
            preliminary_rf_model.fit(features_df, target_values)
        feature_names = features_df.columns

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(features_df)
        shap_importance = np.abs(shap_values).mean(axis=0)

        # Summary plot for SHAP values before feature selection
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, features_df, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')

        # Select top 30% features based on SHAP importance. Keep at least one:
        # a count of 0 would make the [-0:] slice return every feature.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        top_feature_indices = np.argsort(shap_importance)[-num_top_features:]
        selected_feature_names = feature_names[top_feature_indices]
        # Select by column name; a DataFrame does not accept [:, idx] indexing.
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Materialise each split once so the fit, metrics, SHAP and plots all
        # see the same in-memory data.
        X_train_df = X_train.compute()
        X_test_df = X_test.compute()
        y_train_values = y_train.compute()
        y_test_values = y_test.compute()

        # Train model using RandomForestRegressor with selected features
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            rf_model.fit(X_train_df, y_train_values)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_df)
        mse = mean_squared_error(y_test_values, y_pred)
        r2 = r2_score(y_test_values, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_values, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')

        # Final SHAP analysis with the trained model
        explainer_final = shap.TreeExplainer(rf_model)
        shap_values_final = explainer_final.shap_values(X_test_df)

        # Summary plot for SHAP values after training
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values_final, X_test_df, feature_names=selected_feature_names, show=False)
        plt.savefig("shap_summary_plot_after_training_rf.png", bbox_inches='tight')

        # Force plot for a single prediction (first test instance);
        # show=False leaves the figure open for the savefig call below.
        plt.figure(figsize=(10, 6))
        shap.force_plot(explainer_final.expected_value, shap_values_final[0], X_test_df.iloc[0], matplotlib=True, show=False)
        plt.savefig("shap_force_plot_rf.png", bbox_inches='tight')

        print("SHAP analysis completed and plots saved.")
        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

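# Cell 5: This cell imports the required Python packages and runs the Random Forest + SHAP workflow as a flat script (no `main()` function) on the local copy of the data, without the scaling step. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.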

In [None]:
# Cell 5
# Description: Import the libraries, then run SHAP-based selection of the top 30% of features and a Random Forest as a flat script.
# Input: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: prediction and SHAP plots in the working directory.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, joblib, dask, dask-ml and dask-geopandas.

# Import necessary libraries
import dask_geopandas as dgpd
import shap
import joblib
import numpy as np
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Cell 5 pipeline (flat script): rank features with SHAP on a preliminary
# Random Forest, keep the top 30%, then train, evaluate and explain a second
# Random Forest. Plots are written to the current working directory.

# Setup Dask Client with adjusted resources
client = Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786')
print(client)

# Load data as Dask GeoDataFrame with specified partitions. The path uses the
# same separators throughout; the previous literal had a lone backslash
# before "Desktop", which is the invalid escape "\D".
file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
data = dgpd.read_file(file_path, npartitions=256)

# Define target and features. 'geometry' is dropped as in the other cells,
# because it is not a numeric feature, and NaNs are filled with 0 as they are
# elsewhere in this notebook before a Random Forest is fitted.
target_column = 'yield'
drop_columns = [target_column,'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
features = data.drop(columns=drop_columns).fillna(0)
target = data[target_column]
print(features)
print(target)

# Preliminary model for SHAP. The data is materialised once and reused by the
# fit, SHAP and the summary plot.
preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
with ProgressBar():
    features_df = features.compute()
    target_values = target.compute()
    preliminary_rf_model.fit(features_df, target_values)
feature_names = features_df.columns

explainer = shap.TreeExplainer(preliminary_rf_model)
shap_values = explainer.shap_values(features_df)
shap_importance = np.abs(shap_values).mean(axis=0)

# Summary plot for SHAP values before feature selection
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, features_df, feature_names=feature_names, show=False)
plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')

# Select top 30% features based on SHAP importance. Keep at least one: a
# count of 0 would make the [-0:] slice return every feature.
num_top_features = max(1, int(0.3 * len(feature_names)))
top_feature_indices = np.argsort(shap_importance)[-num_top_features:]
selected_feature_names = feature_names[top_feature_indices]
# Select by column name; a DataFrame does not accept [:, idx] indexing.
selected_features = features[list(selected_feature_names)]
print(f"Selected top 30% features: {selected_feature_names}")

# Train-test split with selected features
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

# Materialise each split once so the fit, metrics, SHAP and plots all see the
# same in-memory data.
X_train_df = X_train.compute()
X_test_df = X_test.compute()
y_train_values = y_train.compute()
y_test_values = y_test.compute()

rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
with ProgressBar():
    rf_model.fit(X_train_df, y_train_values)

# Evaluate and plot results
y_pred = rf_model.predict(X_test_df)
mse = mean_squared_error(y_test_values, y_pred)
r2 = r2_score(y_test_values, y_pred)

plt.figure(figsize=(10, 6))
plt.scatter(y_test_values, y_pred, alpha=0.5)
plt.xlabel("Actual Yield")
plt.ylabel("Predicted Yield")
plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
plt.savefig("prediction_plot_rf.png", bbox_inches='tight')

# Final SHAP analysis with the trained model
explainer_final = shap.TreeExplainer(rf_model)
shap_values_final = explainer_final.shap_values(X_test_df)

# Summary plot for SHAP values after training
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_final, X_test_df, feature_names=selected_feature_names, show=False)
plt.savefig("shap_summary_plot_after_training_rf.png", bbox_inches='tight')

# Force plot for a single prediction (first test instance); show=False leaves
# the figure open for the savefig call below.
plt.figure(figsize=(10, 6))
shap.force_plot(explainer_final.expected_value, shap_values_final[0], X_test_df.iloc[0], matplotlib=True, show=False)
plt.savefig("shap_force_plot_rf.png", bbox_inches='tight')

print("SHAP analysis completed and plots saved.")
print("Distributed Random Forest processing complete.")

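# Cell 6: This cell imports the required Python packages and runs a variant of Cell 4 that takes the first 2000 rows with `head(2000)` before the SHAP-based feature selection and Random Forest training. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.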

In [None]:
# Cell 6
# Description: Import the libraries, then define and run main(): SHAP-based feature selection and a Random Forest on the first 2000 rows.
# Input: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: prediction and SHAP plots in the working directory.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, joblib, dask, dask-ml and dask-geopandas.

# Import necessary libraries
import dask_geopandas as dgpd
import shap
import joblib
import numpy as np
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Configure Matplotlib for batch job processing (file-only 'Agg' backend)
matplotlib.use('Agg')

def main():
    """Run the SHAP-guided Random Forest pipeline on the first 2000 rows.

    Steps: load the GeoPackage, keep the first 2000 rows, fill NaNs with 0
    and standardize, rank features by mean absolute SHAP value of a
    preliminary Random Forest, keep the top 30%, then train, evaluate and
    explain a second Random Forest on that subset.

    Side effects: writes four PNG plots to the current working directory and
    prints progress.
    """
    # The context manager closes the client even if a later step fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development/Desktop/Food Agility project-homework/WA-Rainfall-Zone-Analysis/Complete dataset/updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Select the initial 2000 rows for analysis. A plain head(2000)
        # returns an eager pandas frame, which breaks every later .compute().
        # compute=False keeps it lazy, and npartitions=-1 reads as many
        # partitions as needed to collect 2000 rows.
        data = data.head(2000, npartitions=-1, compute=False)
        print(data)

        # Define target and features
        target_column = 'yield'
        drop_columns = ['ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=[target_column] + drop_columns)
        target = data[target_column]

        # Scale features
        scaler = StandardScaler()
        features = scaler.fit_transform(features.fillna(0))  # Handling NaNs by filling with 0
        print(features)

        # Preliminary SHAP analysis to select top 30% features. The data is
        # materialised once and reused by the fit, SHAP and the summary plot.
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            features_df = features.compute()
            target_values = target.compute()
            preliminary_rf_model.fit(features_df, target_values)
        feature_names = features_df.columns

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(features_df)
        shap_importance = np.abs(shap_values).mean(axis=0)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, features_df, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')

        # Select top 30% features based on SHAP importance. Keep at least one:
        # a count of 0 would make the [-0:] slice return every feature.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        top_feature_indices = np.argsort(shap_importance)[-num_top_features:]
        selected_feature_names = feature_names[top_feature_indices]
        # Select by column name; a DataFrame does not accept [:, idx] indexing.
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Materialise each split once so the fit, metrics, SHAP and plots all
        # see the same in-memory data.
        X_train_df = X_train.compute()
        X_test_df = X_test.compute()
        y_train_values = y_train.compute()
        y_test_values = y_test.compute()

        # Train model using RandomForestRegressor with selected features
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            rf_model.fit(X_train_df, y_train_values)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_df)
        mse = mean_squared_error(y_test_values, y_pred)
        r2 = r2_score(y_test_values, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_values, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')

        # Final SHAP analysis with the trained model
        explainer_final = shap.TreeExplainer(rf_model)
        shap_values_final = explainer_final.shap_values(X_test_df)

        # Summary plot for SHAP values after training
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values_final, X_test_df, feature_names=selected_feature_names, show=False)
        plt.savefig("shap_summary_plot_after_training_rf.png", bbox_inches='tight')

        # Force plot for a single prediction (first test instance);
        # show=False leaves the figure open for the savefig call below.
        plt.figure(figsize=(10, 6))
        shap.force_plot(explainer_final.expected_value, shap_values_final[0], X_test_df.iloc[0], matplotlib=True, show=False)
        plt.savefig("shap_force_plot_rf.png", bbox_inches='tight')

        print("SHAP analysis completed and plots saved.")
        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

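# Cell 7: This cell imports the required Python packages and runs a variant of Cell 6 that keeps the scaled features in a separate variable and persists them before the SHAP-based feature selection and Random Forest training. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.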

In [None]:
# Cell 7
# Description: Import the libraries, then define and run main(): SHAP-based feature selection and a Random Forest on persisted, scaled features.
# Input: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: prediction and SHAP plots in the working directory.
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, dask, dask-ml and dask-geopandas.

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar

def main():
    """Run the SHAP-guided Random Forest pipeline on persisted, scaled data.

    Steps: load the GeoPackage, keep the first 2000 rows, fill NaNs with 0,
    standardize and persist the features, rank them by mean absolute SHAP
    value of a preliminary Random Forest, keep the top 30%, then train,
    evaluate and explain a second Random Forest on that subset.

    Side effects: writes four PNG plots to the current working directory and
    prints progress.
    """
    # The context manager closes the client even if a later step fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Use the initial 2000 rows for analysis. A plain head(2000) returns
        # an eager pandas frame, which breaks persist() and every later
        # .compute(). compute=False keeps it lazy, and npartitions=-1 reads
        # as many partitions as needed to collect 2000 rows.
        data = data.head(2000, npartitions=-1, compute=False)
        print(data)

        # Define target and features
        target_column = 'yield'
        drop_columns = ['ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=[target_column] + drop_columns)
        target = data[target_column]

        # Scale features using Dask
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features.fillna(0))  # Handling NaNs by filling with 0

        # Persist the scaled features in memory
        features_scaled = features_scaled.persist()

        # Preliminary SHAP analysis to select top 30% features. The data is
        # materialised once and reused by the fit, SHAP and the summary plot.
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            features_df = features_scaled.compute()
            target_values = target.compute()
            preliminary_rf_model.fit(features_df, target_values)
        feature_names = features_df.columns

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(features_df)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, features_df, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')

        # Select top 30% features based on SHAP importance. Average |SHAP|
        # per feature first, then argsort that vector. Sorting before
        # averaging produced float "indices". Keep at least one feature: a
        # count of 0 would make the [-0:] slice return every feature.
        shap_importance = np.abs(shap_values).mean(axis=0)
        num_top_features = max(1, int(0.3 * len(feature_names)))
        top_feature_indices = np.argsort(shap_importance)[-num_top_features:]
        selected_feature_names = feature_names[top_feature_indices]
        # Select from the scaled, NaN-filled frame that the ranking was
        # computed on, not from the raw features.
        selected_features = features_scaled[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Materialise each split once so the fit, metrics, SHAP and plots all
        # see the same in-memory data.
        X_train_df = X_train.compute()
        X_test_df = X_test.compute()
        y_train_values = y_train.compute()
        y_test_values = y_test.compute()

        # Train model using RandomForestRegressor with selected features
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            rf_model.fit(X_train_df, y_train_values)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_df)
        mse = mean_squared_error(y_test_values, y_pred)
        r2 = r2_score(y_test_values, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_values, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')

        # Final SHAP analysis with the trained model
        explainer_final = shap.TreeExplainer(rf_model)
        shap_values_final = explainer_final.shap_values(X_test_df)

        # Summary plot for SHAP values after training
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values_final, X_test_df, feature_names=selected_feature_names, show=False)
        plt.savefig("shap_summary_plot_after_training_rf.png", bbox_inches='tight')

        # Force plot for a single prediction (first test instance);
        # show=False leaves the figure open for the savefig call below.
        plt.figure(figsize=(10, 6))
        shap.force_plot(explainer_final.expected_value, shap_values_final[0], X_test_df.iloc[0], matplotlib=True, show=False)
        plt.savefig("shap_force_plot_rf.png", bbox_inches='tight')

        print("SHAP analysis completed and plots saved.")
        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

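# Cell 8: This cell imports the required Python packages and starts a further variant of the workflow that fills missing feature values with the column median, standardizes the features, and draws a 5% sample for the preliminary SHAP model. Ensure packages such as `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`), and `shap` are installed. Use `pip install package_name` if needed.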

In [None]:
# Cell 8
# Description: Import the libraries, then define main(): median imputation, scaling, and a preliminary Random Forest for SHAP on a 5% sample.
# Input: "updated_boundaries_with_NDVI_for_2022.gpkg".
# Requirements: Install numpy, scikit-learn (sklearn), shap, matplotlib, dask, dask-ml and dask-geopandas.

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar

def main():
    """Run SHAP-based feature selection and Random Forest yield prediction.

    Steps: start a local Dask cluster, load the GeoPackage, fill missing
    feature values with column medians, standardise the features, fit a
    preliminary Random Forest on a 5% sample, rank features by mean absolute
    SHAP value, keep the top 30%, then train and evaluate a final Random
    Forest on the selected (unscaled) features.

    Side effects: prints progress messages and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client with adjusted resources. The context manager closes
    # the client when the run finishes or fails, so re-running the cell does
    # not leave a previous cluster behind.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        # Fill missing values with each column's median.
        # NOTE(review): Dask may not implement an exact median across several
        # partitions; if this raises, `median_approximate()` is the likely
        # replacement - confirm against the installed Dask version.
        features = features.fillna(features.median())

        # Scale features using Dask
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)

        # Sample features and target together so that every sampled row keeps
        # its own yield value; only this 5% subset is brought into memory.
        labelled = features_scaled.assign(**{target_column: target})
        sampled = labelled.sample(frac=0.05).compute()  # Smaller subset for SHAP
        sample = sampled.drop(columns=[target_column])
        target_sample = sampled[target_column]

        # Preliminary SHAP analysis to select top 30% features. The model is
        # fitted on the same sample that SHAP explains, so X and y have the
        # same number of rows.
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values (plotted against the rows they explain)
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        # plt.savefig("/scratch/pawsey0988/mibrahim/shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance: average |SHAP| per
        # feature first, then sort, so the indices refer to feature columns.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor with selected features.
        # Each split is computed once and reused for training, metrics and
        # the plot instead of repeating the Dask computation.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        # Save the trained Random Forest model
        # model_path = "/scratch/pawsey0988/mibrahim/rf_model.pkl"
        # joblib.dump(rf_model, model_path)
        # print("Random Forest model trained and saved.")

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        # plt.savefig("/scratch/pawsey0988/mibrahim/prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

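# Cell 9: This cell is a variant of the Cell 8 pipeline that fills missing values with 0 instead of the column median and draws the 5% SHAP sample from randomly chosen row positions. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 9
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection followed by Random Forest training and evaluation.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: "shap_summary_plot_before_training_rf.png" and "prediction_plot_rf.png".
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.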

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
import dask.array as da  # Import dask.array
def main():
    """Run SHAP-based feature selection and Random Forest yield prediction.

    Steps: start a local Dask cluster, load the GeoPackage, fill missing
    feature values with 0, standardise the features, fit a preliminary Random
    Forest on a 5% sample, rank features by mean absolute SHAP value, keep
    the top 30%, then train and evaluate a final Random Forest on the
    selected (unscaled) features.

    Side effects: prints progress messages and the sampled rows, and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client with adjusted resources. The context manager closes
    # the client when the run finishes or fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        features = features.fillna(0)

        # Scale features using Dask
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)

        # Draw the 5% SHAP sample from features and target in one go, so each
        # sampled row keeps its own yield value. A NumPy boolean mask is not
        # used here because Dask `.loc` selects by label, not by position.
        labelled = features_scaled.assign(**{target_column: target})
        sampled = labelled.sample(frac=0.05).compute()
        sample = sampled.drop(columns=[target_column])
        target_sample = sampled[target_column]

        print(sample)

        # Preliminary SHAP analysis to select top 30% features
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values (plotted against the rows they explain)
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        # plt.savefig("/scratch/pawsey0988/mibrahim/shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance: average |SHAP| per
        # feature first, then sort, so the indices refer to feature columns.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor with selected features.
        # Each split is computed once and reused for training, metrics and
        # the plot instead of repeating the Dask computation.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        # Save the trained Random Forest model
        # model_path = "/scratch/pawsey0988/mibrahim/rf_model.pkl"
        # joblib.dump(rf_model, model_path)
        # print("Random Forest model trained and saved.")

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        # plt.savefig("/scratch/pawsey0988/mibrahim/prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

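# Cell 10: This cell is a variant of the Cell 9 pipeline that computes the scaled features into memory before drawing the 5% SHAP sample. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 10
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection followed by Random Forest training and evaluation.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: "shap_summary_plot_before_training_rf.png" and "prediction_plot_rf.png".
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.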

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
import dask.array as da  # Import dask.array

def main():
    """Run SHAP-based feature selection and Random Forest yield prediction.

    Steps: start a local Dask cluster, load the GeoPackage, fill missing
    feature values with 0, standardise the features and bring them into
    memory, fit a preliminary Random Forest on a random 5% of the rows, rank
    features by mean absolute SHAP value, keep the top 30%, then train and
    evaluate a final Random Forest on the selected (unscaled) features.

    Side effects: prints progress messages and the sampled rows, and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client with adjusted resources. The context manager closes
    # the client when the run finishes or fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame with specified partitions
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        features = features.fillna(0)

        # Scale features using Dask and bring the result into memory
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features).compute()
        # In-memory copy of the target, used only for sampling. `target`
        # itself stays lazy because the train/test split further down is
        # given Dask inputs and .compute() is called on its results.
        target_values = target.compute()

        # Compute the number of samples
        num_samples = int(0.05 * len(features_scaled))

        # Get random row positions for sampling
        sample_indices = np.random.choice(len(features_scaled), num_samples, replace=False)

        # Select rows by position. A pandas DataFrame needs .iloc for this
        # (plain [] would look the integers up as column labels); a NumPy
        # array can be indexed directly.
        row_indexer = features_scaled.iloc if hasattr(features_scaled, "iloc") else features_scaled
        sample = row_indexer[sample_indices]
        target_sample = target_values.iloc[sample_indices]

        print(sample)

        # Preliminary SHAP analysis to select top 30% features
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance: average |SHAP| per
        # feature first, then sort, so the indices refer to feature columns.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features (both inputs are lazy Dask
        # collections)
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor with selected features.
        # Each split is computed once and reused for training, metrics and
        # the plot instead of repeating the Dask computation.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

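# Cell 11: This cell is a variant of the Cell 10 pipeline that wraps the scaled features in a pandas DataFrame (keeping the column names) before drawing the 5% SHAP sample. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 11
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection followed by Random Forest training and evaluation.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: "shap_summary_plot_before_training_rf.png" and "prediction_plot_rf.png".
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.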

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
import dask.dataframe as dd

def main():
    """Run SHAP-based feature selection and Random Forest yield prediction.

    Steps: start a local Dask cluster, load the GeoPackage, fill missing
    feature values with 0, standardise the features into an in-memory pandas
    DataFrame, fit a preliminary Random Forest on a random 5% of the rows,
    rank features by mean absolute SHAP value, keep the top 30%, then train
    and evaluate a final Random Forest on the selected (unscaled) features.

    Side effects: prints progress messages and the sampled rows, and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client. The context manager closes the client when the run
    # finishes or fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        # Fill NaN values
        features = features.fillna(0)

        # Scale features and hold the result as an in-memory pandas DataFrame
        scaler = StandardScaler()
        features_scaled_np = scaler.fit_transform(features).compute()
        features_scaled_df = pd.DataFrame(features_scaled_np, columns=features.columns)
        # In-memory copy of the target, used only for sampling; `target`
        # itself stays lazy for the train/test split further down.
        target_values = target.compute()

        # Compute the number of samples
        num_samples = int(0.05 * len(features_scaled_df))

        # Sample row positions
        sample_indices = np.random.choice(len(features_scaled_df), num_samples, replace=False)

        # Select rows by position on the pandas objects (Dask DataFrames only
        # support `.iloc` for column selection), using the same positions for
        # features and target so rows stay paired.
        sample = features_scaled_df.iloc[sample_indices]
        target_sample = target_values.iloc[sample_indices]

        print(sample)

        # Preliminary SHAP analysis
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance: average |SHAP| per
        # feature first, then sort, so the indices refer to feature columns.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor. Each split is computed
        # once and reused for training, metrics and the plot instead of
        # repeating the Dask computation.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

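# Cell 12: This cell is a variant of the Cell 11 pipeline in which the 5% SHAP sample is drawn with a fixed seed (42) so that runs are reproducible. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 12
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection followed by Random Forest training and evaluation.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: "shap_summary_plot_before_training_rf.png" and "prediction_plot_rf.png".
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.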

# Import necessary libraries
import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
import dask.dataframe as dd

def main():
    """Run SHAP-based feature selection and Random Forest yield prediction.

    Steps: start a local Dask cluster, load the GeoPackage, fill missing
    feature values with 0, standardise the features into an in-memory pandas
    DataFrame, fit a preliminary Random Forest on a seeded random 5% of the
    rows, rank features by mean absolute SHAP value, keep the top 30%, then
    train and evaluate a final Random Forest on the selected (unscaled)
    features.

    Side effects: prints progress messages and the sampled rows, and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client. The context manager closes the client when the run
    # finishes or fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        # Fill NaN values
        features = features.fillna(0)

        # Scale features and hold the result as an in-memory pandas DataFrame
        scaler = StandardScaler()
        features_scaled_np = scaler.fit_transform(features).compute()
        features_scaled_df = pd.DataFrame(features_scaled_np, columns=features.columns)
        # In-memory copy of the target, used only for sampling; `target`
        # itself stays lazy for the train/test split further down.
        target_values = target.compute()

        # Sample 5% of the rows with a fixed seed. One set of row positions
        # is applied to both features and target: sampling the two
        # collections separately would not pick the same rows, leaving X and
        # y unpaired.
        num_rows = len(features_scaled_df)
        rng = np.random.default_rng(42)
        sample_positions = rng.choice(num_rows, size=int(0.05 * num_rows), replace=False)
        sample = features_scaled_df.iloc[sample_positions]
        target_sample = target_values.iloc[sample_positions]

        print(sample)

        # Preliminary SHAP analysis
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance: average |SHAP| per
        # feature first, then sort, so the indices refer to feature columns.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor. Each split is computed
        # once and reused for training, metrics and the plot instead of
        # repeating the Dask computation.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()

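# Cell 13: This cell repeats the Cell 12 pipeline. It is followed by a commented-out variant that samples features and target together from one combined DataFrame, and by the captured output of a run. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 13
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection followed by Random Forest training and evaluation.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg". Outputs: "shap_summary_plot_before_training_rf.png" and "prediction_plot_rf.png".
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.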




import dask_geopandas as dgpd
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
import dask.dataframe as dd

def main():
    """Select features with SHAP, then train and score a Random Forest.

    Pipeline: start a local Dask cluster; read the GeoPackage; replace
    missing feature values with 0; standardise the features and keep them as
    a pandas DataFrame; fit a preliminary Random Forest on a seeded random 5%
    of the rows; rank features by mean absolute SHAP value and keep the top
    30%; train a final Random Forest on those (unscaled) features and report
    R2 and MSE on a 20% test split.

    Side effects: prints progress messages and the sampled rows, and writes
    ``shap_summary_plot_before_training_rf.png`` and ``prediction_plot_rf.png``
    to the current working directory.
    """
    # Setup Dask Client. Using it as a context manager closes the client when
    # the run finishes or fails.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786') as client:
        print(client)

        # Load data as Dask GeoDataFrame
        file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
        data = dgpd.read_file(file_path, npartitions=256)
        print(data.shape)

        # Define target and features
        target_column = 'yield'
        drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        # Fill NaN values
        features = features.fillna(0)

        # Scale features and hold the result as an in-memory pandas DataFrame
        scaler = StandardScaler()
        features_scaled_np = scaler.fit_transform(features).compute()
        features_scaled_df = pd.DataFrame(features_scaled_np, columns=features.columns)
        # In-memory copy of the target, used only for sampling; `target`
        # itself stays lazy for the train/test split further down.
        target_values = target.compute()

        # Sample 5% of the rows with a fixed seed. The same row positions are
        # used for features and target so that each sampled feature row is
        # matched with its own yield value; sampling the two collections
        # independently would not guarantee that pairing.
        num_rows = len(features_scaled_df)
        rng = np.random.default_rng(42)
        sample_positions = rng.choice(num_rows, size=int(0.05 * num_rows), replace=False)
        sample = features_scaled_df.iloc[sample_positions]
        target_sample = target_values.iloc[sample_positions]

        print(sample)

        # Preliminary SHAP analysis
        preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            preliminary_rf_model.fit(sample, target_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(sample)

        # Summary plot for SHAP values
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, sample, feature_names=feature_names, show=False)
        plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close()

        print("SHAP analysis completed and plots saved.")

        # Select top 30% features based on SHAP importance. The mean of
        # |SHAP| is taken per feature before sorting, so the resulting
        # indices are feature column positions.
        num_top_features = max(1, int(0.3 * len(feature_names)))
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        top_feature_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        selected_feature_names = features.columns[top_feature_indices]
        selected_features = features[list(selected_feature_names)]
        print(f"Selected top 30% features: {selected_feature_names}")

        # Train-test split with selected features
        X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

        # Train model using RandomForestRegressor. Every split is computed a
        # single time and reused below rather than recomputed per metric.
        rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
        with ProgressBar():
            X_train_local = X_train.compute()
            y_train_local = y_train.compute()
            X_test_local = X_test.compute()
            y_test_local = y_test.compute()
            rf_model.fit(X_train_local, y_train_local)

        # Evaluate and plot results
        y_pred = rf_model.predict(X_test_local)
        mse = mean_squared_error(y_test_local, y_pred)
        r2 = r2_score(y_test_local, y_pred)

        plt.figure(figsize=(10, 6))
        plt.scatter(y_test_local, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
        plt.close()

        print("Distributed Random Forest processing complete.")

if __name__ == '__main__':
    main()


# # Import necessary libraries
# import dask_geopandas as dgpd
# from dask.distributed import Client
# from dask_ml.preprocessing import StandardScaler
# import numpy as np
# import pandas as pd
# import shap
# import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# from dask_ml.model_selection import train_test_split
# from dask.diagnostics import ProgressBar
# import dask.dataframe as dd

# def main():
#     # Setup Dask Client
#     client = Client(n_workers=16, threads_per_worker=2, memory_limit='8GB', dashboard_address=':8786')
#     print(client)

#     # Load data as Dask GeoDataFrame
#     file_path = "C://Users//M.Ibrah//OneDrive - Department of Primary Industries And Regional Development//Desktop//Food Agility project-homework//WA-Rainfall-Zone-Analysis//Complete dataset//updated_boundaries_with_NDVI_for_2022.gpkg"
#     data = dgpd.read_file(file_path, npartitions=256)
#     print(data.shape)

#     # Define target and features
#     target_column = 'yield'
#     drop_columns = [target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha', 'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry']
#     features = data.drop(columns=drop_columns)
#     target = data[target_column]
    
#     # Fill NaN values
#     features = features.fillna(0)
    
#     # Scale features and convert to Dask DataFrame
#     scaler = StandardScaler()
#     features_scaled_np = scaler.fit_transform(features).compute()  # Compute to NumPy array
#     features_scaled_df = pd.DataFrame(features_scaled_np, columns=features.columns)  # Convert to Pandas DataFrame
#     features_scaled_dask_df = dd.from_pandas(features_scaled_df, npartitions=16)  # Convert to Dask DataFrame
    
#     # Combine features and target into one DataFrame
#     combined_dask_df = dd.concat([features_scaled_dask_df, target], axis=1)
#     combined_dask_df.columns = list(features.columns) + [target_column]

#     # Sample 5% of the rows randomly from combined DataFrame
#     sampled_combined_df = combined_dask_df.sample(frac=0.05, random_state=42).compute()

#     # Separate features and target after sampling
#     sample = sampled_combined_df.drop(columns=[target_column])
#     target_sample = sampled_combined_df[target_column]
    
#     print(sample)

#     # Preliminary SHAP analysis
#     preliminary_rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
#     with ProgressBar():
#         preliminary_rf_model.fit(sample, target_sample)

#     explainer = shap.TreeExplainer(preliminary_rf_model)
#     shap_values = explainer.shap_values(sample)

#     # Summary plot for SHAP values
#     plt.figure(figsize=(10, 8))
#     shap.summary_plot(shap_values, sample, feature_names=features.columns, show=False)
#     plt.savefig("shap_summary_plot_before_training_rf.png", bbox_inches='tight')
    
#     print("SHAP analysis completed and plots saved.")
    
#     # Select top 30% features based on SHAP importance
#     num_top_features = int(0.3 * features.shape[1])
#     top_feature_indices = np.argsort(np.abs(shap_values)).mean(axis=0)[-num_top_features:]
#     selected_features = features.iloc[:, top_feature_indices]
#     selected_feature_names = features.columns[top_feature_indices]
#     print(f"Selected top 30% features: {selected_feature_names}")

#     # Train-test split with selected features
#     X_train, X_test, y_train, y_test = train_test_split(selected_features, target, test_size=0.2, random_state=42, shuffle=True)

#     # Train model using RandomForestRegressor
#     rf_model = RandomForestRegressor(n_estimators=150, n_jobs=16)
#     with ProgressBar():
#         rf_model.fit(X_train.compute(), y_train.compute())

#     # Evaluate and plot results
#     y_pred = rf_model.predict(X_test.compute())
#     mse = mean_squared_error(y_test.compute(), y_pred)
#     r2 = r2_score(y_test.compute(), y_pred)
    
#     plt.figure(figsize=(10, 6))
#     plt.scatter(y_test.compute(), y_pred, alpha=0.5)
#     plt.xlabel("Actual Yield")
#     plt.ylabel("Predicted Yield")
#     plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
#     plt.savefig("prediction_plot_rf.png", bbox_inches='tight')
    
#     print("Distributed Random Forest processing complete.")

# if __name__ == '__main__':
#     main()

<Client: 'tcp://127.0.0.1:64081' processes=16 threads=32, memory=119.21 GiB>
(<dask_expr.expr.Scalar: expr=FromGraph(afe1101).size() // 56, dtype=int64>, 56)


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


        et_morton_actual  et_morton_potential  et_morton_wet  et_tall_crop  \
457            -1.606658             1.397905       0.884088      1.567229   
1239            2.315870            -0.213628       0.777707     -0.074257   
816            -0.819731            -0.543766      -0.986967     -0.520637   
862            -0.300157            -0.917375      -1.177381     -0.831200   
441             1.127409            -0.236733       0.234180     -0.171146   
...                  ...                  ...            ...           ...   
436788         -0.926672             0.350311      -0.011659      0.513132   
435862          1.238386            -0.799105      -0.367454     -0.678514   
436869          0.796496             0.292351       0.677583      0.007493   
437071         -0.128647             0.871563       0.935043      0.816774   
437041         -1.626835             1.665776       1.184458      1.655035   

        et_short_crop  evap_morton_lake  evap_pan  evap_syn  ma

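# Cell 14: This cell runs the SHAP feature-selection pipeline against the copy of the dataset under `/scratch`, keeping the scaled features as a lazy Dask DataFrame until the 5% sample is computed. Ensure packages such as `pandas`, `numpy`, `sklearn`, `shap`, `dask`, `dask_ml`, and `dask_geopandas` are installed. Use `pip install package_name` if needed (the pip name for `sklearn` is `scikit-learn`).

In [None]:
# Cell 14
# Description: Import the required libraries and define `main()`, which runs SHAP-based feature selection with a Random Forest.
# Inputs: "updated_boundaries_with_NDVI_for_2022.gpkg" (read from the /scratch path set in `main()`).
# Requirements: Install pandas, numpy, sklearn, shap, matplotlib, dask, dask_ml, and dask_geopandas.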

# Import necessary libraries
import dask_geopandas as dgpd
import dask.dataframe as dd
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
import numpy as np
import matplotlib.pyplot as plt

def _top_feature_names(shap_values, feature_names, fraction):
    """Return the names of the top `fraction` of features by mean |SHAP| value.

    Args:
        shap_values: Array of SHAP values with one row per sample and one
            column per feature.
        feature_names: Sequence of feature names, in column order.
        fraction: Share of features to keep (e.g. 0.3 for the top 30%).

    Returns:
        List of feature names ordered from least to most important within
        the selected set (the order produced by ``np.argsort``).
    """
    feature_names = list(feature_names)
    importance = np.abs(np.asarray(shap_values)).mean(axis=0)
    # Keep at least one feature: a count of 0 would turn the slice below into
    # `[-0:]`, which silently selects *every* feature.
    count = max(1, int(fraction * len(feature_names)))
    top_indices = np.argsort(importance)[-count:]
    return [feature_names[i] for i in top_indices]


def main(
    file_path="/scratch/pawsey0988/mibrahim/updated_boundaries_with_NDVI_for_2022.gpkg",
    output_dir="/scratch/pawsey0988/mibrahim",
    sample_frac=0.05,
    top_fraction=0.3,
    random_state=42,
):
    """Run SHAP-based feature selection and train a Random Forest on the result.

    Steps: load the GeoPackage as a Dask GeoDataFrame, scale the features,
    fit a preliminary Random Forest on a row sample, rank features by mean
    absolute SHAP value, keep the top share of features, then train and
    evaluate a final Random Forest on a train/test split of the full data.

    Args:
        file_path: Path of the GeoPackage to load.
        output_dir: Directory in which the two PNG plots are written.
        sample_frac: Fraction of rows used for the preliminary SHAP analysis.
        top_fraction: Fraction of features kept after SHAP ranking.
        random_state: Seed used for sampling, splitting and both forests.

    Raises:
        ValueError: If `sample_frac` or `top_fraction` is not in (0, 1].
    """
    # Local imports keep this block self-contained; neither name is imported
    # at the top of the cell.
    from pathlib import Path

    import dask

    if not 0 < sample_frac <= 1:
        raise ValueError(f"sample_frac must be in (0, 1], got {sample_frac!r}")
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction!r}")

    output_dir = Path(output_dir)

    # The context manager shuts the client (and the local cluster it starts)
    # down even if a later step raises.
    # NOTE(review): 8786 is conventionally the scheduler port and 8787 the
    # dashboard port - confirm that ':8786' is intended for the dashboard.
    with Client(n_workers=16, threads_per_worker=2, memory_limit='8GB',
                dashboard_address=':8786') as client:
        print(client)

        # Load data directly as Dask GeoDataFrame for distributed processing
        data = dgpd.read_file(file_path, npartitions=256)

        # Define target and feature columns
        target_column = 'yield'
        drop_columns = [
            target_column, 'ID', 'OBJECTID', 'year', 'crop_name', 'pred_prob', 'area_ha',
            'lga_name', 'production', 'Shape__Length', 'Shape__Area', 'geometry'
        ]

        # scikit-learn rejects NaN targets, so drop those rows up front.
        data = data.dropna(subset=[target_column])

        # `data.shape` on a Dask frame holds a lazy row count; compute the
        # length explicitly so an actual number is printed.
        print("Data Shape:", (len(data), len(data.columns)))

        features = data.drop(columns=drop_columns)
        target = data[target_column]
        feature_names = list(features.columns)

        # Fill NaN values in features to avoid missing data issues
        features = features.fillna(0)

        # Scale features using Dask StandardScaler without converting to a dense array
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)  # Stay as Dask DataFrame

        # Sample features and target together so that every sampled feature
        # row keeps its own target value; two independent `.sample()` calls
        # do not guarantee that the same rows are drawn. A single compute
        # also avoids re-running the scaling graph several times.
        combined = features_scaled.assign(**{target_column: target})
        sample_df = combined.sample(frac=sample_frac, random_state=random_state).compute()
        y_sample = sample_df[target_column]
        X_sample = sample_df.drop(columns=[target_column])

        # Preliminary SHAP analysis on the sampled data.
        # dask.diagnostics.ProgressBar is not used here: it only reports on
        # the local schedulers, not on work submitted to a distributed Client.
        print("Starting preliminary SHAP analysis...")
        preliminary_rf_model = RandomForestRegressor(
            n_estimators=150, n_jobs=-1, random_state=random_state
        )
        preliminary_rf_model.fit(X_sample, y_sample)

        explainer = shap.TreeExplainer(preliminary_rf_model)
        shap_values = explainer.shap_values(X_sample)

        # Plot SHAP summary
        shap_fig = plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_sample, feature_names=feature_names, show=False)
        plt.savefig(output_dir / "shap_summary_plot_before_training_rf.png", bbox_inches='tight')
        plt.close(shap_fig)  # release the figure in long-running batch jobs
        print("SHAP summary plot saved.")

        # Select the top share of features based on SHAP importance. Columns
        # are selected by name, which Dask DataFrames support directly.
        selected_feature_names = _top_feature_names(shap_values, feature_names, top_fraction)
        selected_features = features_scaled[selected_feature_names]
        print(f"Selected top {top_fraction:.0%} features: {selected_feature_names}")

        # Split data into train-test sets using the selected features
        print("Starting train-test split...")
        X_train, X_test, y_train, y_test = train_test_split(
            selected_features, target, test_size=0.2, random_state=random_state, shuffle=True
        )

        # Materialise all four pieces in one pass so the shared upstream
        # graph (read, fill, scale, split) is evaluated once, not per piece.
        X_train_pd, X_test_pd, y_train_pd, y_test_pd = dask.compute(
            X_train, X_test, y_train, y_test
        )

        # Train the final RandomForest model using selected features
        print("Training RandomForestRegressor on selected features...")
        rf_model = RandomForestRegressor(
            n_estimators=150, n_jobs=-1, random_state=random_state
        )
        rf_model.fit(X_train_pd, y_train_pd)

        # Make predictions and evaluate the model
        y_pred = rf_model.predict(X_test_pd)
        mse = mean_squared_error(y_test_pd, y_pred)
        r2 = r2_score(y_test_pd, y_pred)
        print(f"Model evaluation - MSE: {mse:.2f}, R2: {r2:.2f}")

        # Scatter plot of actual vs predicted values
        pred_fig = plt.figure(figsize=(10, 6))
        plt.scatter(y_test_pd, y_pred, alpha=0.5)
        plt.xlabel("Actual Yield")
        plt.ylabel("Predicted Yield")
        plt.title(f"Random Forest Prediction (R2={r2:.2f}, MSE={mse:.2f})")
        plt.savefig(output_dir / "prediction_plot_rfnew.png", bbox_inches='tight')
        plt.close(pred_fig)
        print("Prediction plot saved. Model training and evaluation complete.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()