# Actual vs Predicted

In [None]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# Make sure the export folder exists before any workbook is written
os.makedirs("Actual_predicted", exist_ok=True)

# Read the database and replace spaces in the column names with underscores
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")

# Predictor columns passed to every model
features = [
    'P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'Tester',
    'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
X = data[features]

# Target column -> path of the pickled model that predicts it
target_model_info = {
    "Voc": "../1. Training/(Voc)best_model_GBR.pkl",
    "Isc": "../1. Training/(Isc)best_model_XGB.pkl",
    "FF": "../1. Training/(FF)best_model_XGB.pkl",
    "Efficiency": "../1. Training/(Efficiency)best_model_ET.pkl"
}

for target, model_path in target_model_info.items():
    y = data[target]

    # Two-stage split: hold out 40% first, then halve the hold-out,
    # giving 60% training / 20% validation / 20% test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    # Training and validation rows are reported together on one sheet
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    # Load the already-fitted model and predict both subsets
    model = joblib.load(model_path)
    pred_train_val = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    # Sheet name -> actual/predicted table, in the order the sheets are written
    export_frames = {
        "Training_and_Validation": pd.DataFrame({
            "Actual": y_train_val.values,
            "Predicted": pred_train_val
        }),
        "Test": pd.DataFrame({
            "Actual": y_test.values,
            "Predicted": pred_test
        }),
    }

    # One workbook per target, one sheet per subset
    with pd.ExcelWriter(f"Actual_predicted/{target}_Actual_vs_Predicted.xlsx") as writer:
        for tab, frame in export_frames.items():
            frame.to_excel(writer, sheet_name=tab, index=False)

In [None]:
# Normalization

import pandas as pd

# File paths
db_path = "multiwafer_database.xlsx"
input_path = "Actual_predicted/Efficiency_Actual_vs_Predicted.xlsx"
output_path = "Actual_predicted/Efficiency_Actual_vs_Predicted_Normalized.xlsx"

# Load Efficiency range from database
eff_data = pd.read_excel(db_path)
eff_min = eff_data["Efficiency"].min()
eff_max = eff_data["Efficiency"].max()
eff_range = eff_max - eff_min

# Min-max scaling divides by the range: fail loudly instead of silently
# writing inf/NaN when the column is constant or has no valid values.
if pd.isna(eff_range) or eff_range == 0:
    raise ValueError(
        f"Efficiency range in {db_path} is empty or zero "
        f"(min={eff_min}, max={eff_max}); cannot min-max normalize."
    )

# Target sheet names
sheets_to_normalize = ["Training_and_Validation", "Test"]

# Passing a list of sheet names returns a {sheet name: DataFrame} dict,
# so the workbook is opened once rather than once per sheet.
raw_sheets = pd.read_excel(input_path, sheet_name=sheets_to_normalize)

# Normalize each sheet separately, always with the database-wide range
normalized_sheets = {}
for sheet in sheets_to_normalize:
    df = raw_sheets[sheet]
    df_norm = df.copy()
    for col in df.columns:
        # Non-numeric columns are copied through unchanged
        if pd.api.types.is_numeric_dtype(df[col]):
            df_norm[col] = (df[col] - eff_min) / eff_range
    normalized_sheets[sheet] = df_norm

# Save normalized sheets to one Excel file
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, df_norm in normalized_sheets.items():
        df_norm.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Normalized file saved to: {output_path}")

Normalized file saved to: Actual_predicted/Efficiency_Actual_vs_Predicted_Normalized.xlsx


# SHAP analysis

In [None]:
import os
import joblib
import shap
import pandas as pd
from sklearn.model_selection import train_test_split

# Folder that receives the SHAP exports
os.makedirs("SHAP", exist_ok=True)

# Read the database and replace spaces in the column names with underscores
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")

# Predictor columns and the target explained in this cell
features = [
    'P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'Tester',
    'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
X = data[features]
y = data["Efficiency"]

# Two-stage split: 40% held out, then halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Already-fitted Efficiency model
model = joblib.load("../1. Training/(Efficiency)best_model_ET.pkl")

# Explain the model's predictions on the test rows
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Long-format table: one row per (feature, test sample) pair holding the
# feature's value and the SHAP value in the matching column position.
records = [
    pd.DataFrame({
        "Feature": feature,
        "Feature_Value": X_test.iloc[:, col_idx].values,
        "SHAP_Value": shap_values.values[:, col_idx]
    })
    for col_idx, feature in enumerate(X_test.columns)
]

dependence_table = pd.concat(records, ignore_index=True)
dependence_table.to_csv("SHAP/SHAP_dependence_Efficiency.csv", index=False)

# Feature Importance

In [None]:
import os
import joblib
import shap
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Output directory
output_dir = "SHAP"
os.makedirs(output_dir, exist_ok=True)

# Load dataset and replace spaces in the column names with underscores
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")

# Input features
input_features = [
    'P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'Tester',
    'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
X = data[input_features]

# Target names and corresponding model paths
targets = ["Voc", "Isc", "FF", "Efficiency"]
model_paths = {
    "Voc": "../1. Training/(Voc)best_model_GBR.pkl",
    "Isc": "../1. Training/(Isc)best_model_XGB.pkl",
    "FF": "../1. Training/(FF)best_model_XGB.pkl",
    "Efficiency": "../1. Training/(Efficiency)best_model_ET.pkl"
}

# Split: 60% training, 20% validation, 20% test.
# Only X_test is used below (for the SHAP summary). X, the split sizes and
# random_state are the same for every target, so the split is done once here
# instead of being recomputed on each loop iteration.
X_train, X_temp = train_test_split(X, test_size=0.4, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

importance_dfs = []
shap_summary_df = None

for target in targets:
    print(f"Processing target: {target}")

    # Load the already-fitted model for this target
    model = joblib.load(model_paths[target])

    # Feature importance as reported by the model itself, one row per feature
    importance = model.feature_importances_
    df_importance = pd.DataFrame({
        "Target": target,
        "Feature": X.columns,
        "Importance": importance
    })
    importance_dfs.append(df_importance)

    # SHAP analysis (Efficiency only)
    if target == "Efficiency":
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        # "Mean_SHAP_Value" holds the mean of the *absolute* SHAP values over
        # the test rows, sorted from largest to smallest.
        shap_summary_df = pd.DataFrame({
            "Feature": X.columns,
            "Mean_SHAP_Value": np.abs(shap_values).mean(axis=0)
        }).sort_values(by="Mean_SHAP_Value", ascending=False)

# Save feature importance for all targets in a single sheet
all_importance_df = pd.concat(importance_dfs, ignore_index=True)
all_importance_df.to_excel(os.path.join(output_dir, "Feature_Importance_All_Models.xlsx"), index=False)

# Save SHAP results (skipped if "Efficiency" was not among the targets)
if shap_summary_df is not None:
    shap_summary_df.to_excel(os.path.join(output_dir, "Efficiency_Average_SHAP_Values.xlsx"), index=False)

print("Feature importance and SHAP summary saved.")

Processing target: Voc
Processing target: Isc
Processing target: FF
Processing target: Efficiency
Feature importance and SHAP summary saved.
