# PACF Dataset Analysis
This notebook includes preprocessing, model training, and SHAP analysis.

In [None]:
# Install required libraries (run only once)
# pip install pycaret shap matplotlib seaborn pandas scikit-learn

import pandas as pd
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.regression import *

# Load the dataset
df = pd.read_csv("EGC_Lightweight_Data.csv")

# Define target variables
targets = ['Stress', 'Split', 'Flexure', 'Sorptivity']

# For each target: fit and compare regressors with PyCaret, then produce the
# diagnostic plots, a SHAP summary, and a preview of hold-out predictions.
for target in targets:
    print(f"\n\n=========== MODEL TRAINING FOR: {target} ===========\n")

    # The remaining entries of `targets` are outcomes modelled in their own
    # iterations. Leaving them in the feature matrix would let the model
    # predict one outcome from the others (target leakage), so they are
    # excluded from this experiment.
    other_targets = [name for name in targets if name != target]

    # Initialize PyCaret setup (fixed session_id keeps the split and model
    # comparison reproducible between runs).
    reg = setup(
        data=df,
        target=target,
        categorical_features=['Fibre'],
        numeric_features=['OPS_Replacement(%)'],
        ignore_features=other_targets,
        session_id=123,
        normalize=True,
        silent=True,
        verbose=False
    )

    # Compare models and select the best one
    best_model = compare_models()

    # Evaluate model visually
    evaluate_model(best_model)

    # Plot feature importance
    print(f"\nFeature importance for {target}:")
    plot_model(best_model, plot='feature')

    print(f"\nGenerating SHAP analysis for {target}...")

    # SHAP is best-effort: a failure here is reported and the remaining
    # diagnostics for this target still run.
    try:
        # Explain the model on the matrix it was actually fitted on.
        # `best_model` is the bare estimator, so it expects PyCaret's
        # preprocessed columns (normalised, encoded), not a hand-made
        # `pd.get_dummies` view of the raw frame.
        # NOTE(review): assumes PyCaret 2.x (consistent with `silent=True`
        # and the 'Label' column below), where get_config('X_train') returns
        # the transformed training features - confirm if upgrading.
        X_sample = get_config('X_train')
        explainer = shap.Explainer(best_model.predict, X_sample)
        shap_values = explainer(X_sample)
        shap.summary_plot(shap_values, X_sample, show=True)
    except Exception as e:
        print(f"SHAP analysis failed for {target}. Error: {e}")

    # Plot residuals
    print(f"\nResidual plot for {target}:")
    plot_model(best_model, plot='residuals')

    # Learning curve
    print(f"\nLearning curve for {target}:")
    plot_model(best_model, plot='learning')

    # Model predictions (no data passed, so PyCaret scores its hold-out set)
    predictions = predict_model(best_model)
    print(f"\nActual vs Predicted for {target}:")
    print(predictions[[target, 'Label']].head())