# Top 1% Path Extraction

In [None]:
import os
import pandas as pd
import numpy as np

# Folder that receives every artifact of this analysis (created if missing).
output_dir = "additional_analysis"
os.makedirs(output_dir, exist_ok=True)

# Read the wafer table and replace spaces in the column headers with
# underscores so columns can be addressed with uniform names.
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")

# Columns describing the route a wafer took (process steps + tester).
path_features = ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'Tester']

# Rows whose efficiency is at or above the 99th percentile.
efficiency = data["Efficiency"]
threshold_top = np.percentile(efficiency, 99)
is_top = efficiency >= threshold_top
top_samples = data.loc[is_top]

# Keep only the path columns and write them without the index column.
top_paths = top_samples.loc[:, path_features]
top_paths_file = os.path.join(output_dir, "Top_1pct_Paths.xlsx")
top_paths.to_excel(top_paths_file, index=False)

# Bottom 1% Path Extraction

In [None]:
import os
import pandas as pd
import numpy as np

# Folder that receives every artifact of this analysis (created if missing).
output_dir = "additional_analysis"
os.makedirs(output_dir, exist_ok=True)

# Read the wafer table and replace spaces in the column headers with
# underscores so columns can be addressed with uniform names.
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")

# Columns describing the route a wafer took (process steps + tester).
path_features = ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'Tester']

# Rows whose efficiency is at or below the 1st percentile.
efficiency = data["Efficiency"]
threshold_bottom = np.percentile(efficiency, 1)
is_bottom = efficiency <= threshold_bottom
bottom_samples = data.loc[is_bottom]

# Keep only the path columns and write them without the index column.
bottom_paths = bottom_samples.loc[:, path_features]
bottom_paths_file = os.path.join(output_dir, "Bottom_1pct_Paths.xlsx")
bottom_paths.to_excel(bottom_paths_file, index=False)

# Cluster-Based Optimization

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import optuna
from datetime import timedelta

# Per-sample equipment-path optimization for low-efficiency clusters.
#
# For every sample of each selected cluster, the quality features are held
# fixed and Optuna searches the equipment settings that maximize the
# efficiency predicted by the trained model. One Excel sheet is written per
# cluster.
optuna.logging.set_verbosity(optuna.logging.WARNING)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

output_dir = "additional_analysis"
os.makedirs(output_dir, exist_ok=True)

# Load dataset and trained model
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")
model = joblib.load("../1. Training/(Efficiency)best_model_ET.pkl")

# Feature sets: equipment features are searched, quality features stay fixed
# at the values recorded for the sample.
equipment_features = ['P02', 'P03', 'P04', 'P05', 'P06', 'Tester']
quality_features = [
    'P01', 'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
# Column order the model expects at prediction time.
trained_features = model.feature_names_in_

# Search space: inclusive (low, high) integer range per equipment feature.
param_ranges = {
    'P02': (1, 6), 'P03': (1, 15),
    'P04': (1, 6), 'P05': (1, 15),
    'P06': (1, 27), 'Tester': (1, 20)
}

# Candidate pool = the (up to) 100 clusters with the lowest mean efficiency;
# 10 of them are then drawn at random (seeded above), without replacement.
cluster_means = data.groupby("Cluster")["Efficiency"].mean().sort_values()
low_clusters = cluster_means.head(100).index
if len(low_clusters) == 0:
    raise ValueError("No clusters found in the 'Cluster' column; nothing to optimize.")
# Clamp the draw size so datasets with fewer than 10 clusters do not make
# np.random.choice(replace=False) raise.
n_selected = min(10, len(low_clusters))
selected_clusters = np.random.choice(low_clusters, size=n_selected, replace=False)


def make_objective(fixed_inputs):
    """Build an Optuna objective for one sample.

    fixed_inputs: dict of the sample's quality-feature values, merged
    unchanged into every trial's model input.
    Returns a callable that suggests an equipment path for the trial and
    returns the model's predicted efficiency for it.
    """
    def objective(trial):
        path = {key: trial.suggest_int(key, *param_ranges[key]) for key in equipment_features}
        input_df = pd.DataFrame([{**path, **fixed_inputs}])[trained_features]
        return model.predict(input_df)[0]

    return objective


# Optimize samples from each selected cluster and save results per sheet
output_file = os.path.join(output_dir, "Cluster_Optimization_Result_low.xlsx")
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for selected_cluster in selected_clusters:
        cluster_data = data[data["Cluster"] == selected_cluster]
        cluster_results = []

        for idx, row in cluster_data.iterrows():
            # Converted once per sample instead of once per trial.
            fixed_inputs = row[quality_features].to_dict()
            initial_eff = row["Efficiency"]
            initial_path = {key: row[key] for key in equipment_features}

            start = time.time()
            # A fresh study with the same sampler seed is created for every sample.
            study = optuna.create_study(
                direction="maximize",
                sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=RANDOM_STATE)
            )
            study.optimize(make_objective(fixed_inputs), n_trials=300)
            elapsed = time.time() - start

            best_path = study.best_params
            # NOTE: "Initial Efficiency" is the value recorded in the dataset,
            # while "Optimized Efficiency" is a model prediction.
            result_row = {
                "Selected Cluster": selected_cluster,
                "Sample Index": idx,
                "Initial Efficiency": initial_eff,
                "Optimized Efficiency": study.best_value,
                "Elapsed Time": str(timedelta(seconds=int(elapsed)))
            }
            for key in equipment_features:
                result_row[f"Initial_{key}"] = initial_path[key]
                result_row[f"Best_{key}"] = best_path[key]

            cluster_results.append(result_row)

        df_cluster = pd.DataFrame(cluster_results)
        # Excel limits sheet names to 31 characters.
        sheet_name = f"Cluster_{selected_cluster}"[:31]
        df_cluster.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Saved cluster {selected_cluster} to sheet '{sheet_name}'")

print(f"\nAll results saved to: {output_file}")

Saved cluster 498 to sheet 'Cluster_498'
Saved cluster 422 to sheet 'Cluster_422'
Saved cluster 465 to sheet 'Cluster_465'
Saved cluster 319 to sheet 'Cluster_319'
Saved cluster 29 to sheet 'Cluster_29'
Saved cluster 218 to sheet 'Cluster_218'
Saved cluster 651 to sheet 'Cluster_651'
Saved cluster 540 to sheet 'Cluster_540'
Saved cluster 917 to sheet 'Cluster_917'
Saved cluster 321 to sheet 'Cluster_321'

All results saved to: additional_analysis/Cluster_Optimization_Result_low.xlsx


In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import optuna
from datetime import timedelta

# Per-sample equipment-path optimization for randomly chosen clusters.
#
# For every sample of each selected cluster, the quality features are held
# fixed and Optuna searches the equipment settings that maximize the
# efficiency predicted by the trained model. One Excel sheet is written per
# cluster.
optuna.logging.set_verbosity(optuna.logging.WARNING)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

output_dir = "additional_analysis"
os.makedirs(output_dir, exist_ok=True)

# Load dataset and trained model
data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")
model = joblib.load("../1. Training/(Efficiency)best_model_ET.pkl")

# Feature sets: equipment features are searched, quality features stay fixed
# at the values recorded for the sample.
equipment_features = ['P02', 'P03', 'P04', 'P05', 'P06', 'Tester']
quality_features = [
    'P01', 'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
# Column order the model expects at prediction time.
trained_features = model.feature_names_in_

# Search space: inclusive (low, high) integer range per equipment feature.
param_ranges = {
    'P02': (1, 6), 'P03': (1, 15),
    'P04': (1, 6), 'P05': (1, 15),
    'P06': (1, 27), 'Tester': (1, 20)
}

# Randomly select (up to) 10 clusters from all available clusters, without
# replacement; the draw is seeded above.
all_clusters = data["Cluster"].unique()
if len(all_clusters) == 0:
    raise ValueError("No clusters found in the 'Cluster' column; nothing to optimize.")
# Clamp the draw size so datasets with fewer than 10 clusters do not make
# np.random.choice(replace=False) raise.
n_selected = min(10, len(all_clusters))
selected_clusters = np.random.choice(all_clusters, size=n_selected, replace=False)


def make_objective(fixed_inputs):
    """Build an Optuna objective for one sample.

    fixed_inputs: dict of the sample's quality-feature values, merged
    unchanged into every trial's model input.
    Returns a callable that suggests an equipment path for the trial and
    returns the model's predicted efficiency for it.
    """
    def objective(trial):
        path = {key: trial.suggest_int(key, *param_ranges[key]) for key in equipment_features}
        input_df = pd.DataFrame([{**path, **fixed_inputs}])[trained_features]
        return model.predict(input_df)[0]

    return objective


# Optimize samples from each selected cluster and save results per sheet
output_file = os.path.join(output_dir, "Cluster_Optimization_Result.xlsx")
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for selected_cluster in selected_clusters:
        cluster_data = data[data["Cluster"] == selected_cluster]
        cluster_results = []

        for idx, row in cluster_data.iterrows():
            # Converted once per sample instead of once per trial.
            fixed_inputs = row[quality_features].to_dict()
            initial_eff = row["Efficiency"]
            initial_path = {key: row[key] for key in equipment_features}

            start = time.time()
            # A fresh study with the same sampler seed is created for every sample.
            study = optuna.create_study(
                direction="maximize",
                sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=RANDOM_STATE)
            )
            study.optimize(make_objective(fixed_inputs), n_trials=300)
            elapsed = time.time() - start

            best_path = study.best_params
            # NOTE: "Initial Efficiency" is the value recorded in the dataset,
            # while "Optimized Efficiency" is a model prediction.
            result_row = {
                "Selected Cluster": selected_cluster,
                "Sample Index": idx,
                "Initial Efficiency": initial_eff,
                "Optimized Efficiency": study.best_value,
                "Elapsed Time": str(timedelta(seconds=int(elapsed)))
            }
            for key in equipment_features:
                result_row[f"Initial_{key}"] = initial_path[key]
                result_row[f"Best_{key}"] = best_path[key]

            cluster_results.append(result_row)

        df_cluster = pd.DataFrame(cluster_results)
        # Excel limits sheet names to 31 characters.
        sheet_name = f"Cluster_{selected_cluster}"[:31]
        df_cluster.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Saved cluster {selected_cluster} to sheet '{sheet_name}'")

print(f"\nAll results saved to: {output_file}")

Saved cluster 15 to sheet 'Cluster_15'
Saved cluster 118 to sheet 'Cluster_118'
Saved cluster 143 to sheet 'Cluster_143'
Saved cluster 206 to sheet 'Cluster_206'
Saved cluster 89 to sheet 'Cluster_89'
Saved cluster 264 to sheet 'Cluster_264'
Saved cluster 983 to sheet 'Cluster_983'
Saved cluster 5 to sheet 'Cluster_5'
Saved cluster 311 to sheet 'Cluster_311'
Saved cluster 956 to sheet 'Cluster_956'

All results saved to: additional_analysis/Cluster_Optimization_Result.xlsx


# Pathwise Efficiency Contribution Analysis

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import optuna
from datetime import timedelta

# Pathwise contribution analysis (bottom 0.25% of samples by efficiency).
#
# For each sampled wafer the equipment path is optimized with Optuna, then
# each equipment feature is swapped individually from its initial value to
# its optimized value to see how the predicted efficiency responds.
optuna.logging.set_verbosity(optuna.logging.WARNING)

data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")
model = joblib.load("../1. Training/(Efficiency)best_model_ET.pkl")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Equipment features are searched; quality features stay fixed at the
# values recorded for the sample.
equipment_features = ['P02', 'P03', 'P04', 'P05', 'P06', 'Tester']
quality_features = [
    'P01', 'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
# Column order the model expects at prediction time.
trained_features = model.feature_names_in_

# Search space: inclusive (low, high) integer range per equipment feature.
param_ranges = {
    'P02': (1, 6), 'P03': (1, 15),
    'P04': (1, 6), 'P05': (1, 15), 'P06': (1, 27),
    'Tester': (1, 20)
}

# Select bottom 0.25% samples, then draw up to 100 of them (seeded).
threshold = np.percentile(data["Efficiency"], 0.25)
candidates = data[data["Efficiency"] <= threshold]
# Clamp the sample size: DataFrame.sample raises when n exceeds the number
# of rows and replace=False.
samples = candidates.sample(n=min(100, len(candidates)), random_state=RANDOM_STATE)


def predict_efficiency(path, fixed_inputs):
    """Return the model's predicted efficiency for one input row.

    path: dict mapping each equipment feature to its setting.
    fixed_inputs: dict of the sample's quality-feature values.
    """
    input_df = pd.DataFrame([{**path, **fixed_inputs}])[trained_features]
    return model.predict(input_df)[0]


results = []

for idx, sample in samples.iterrows():
    # Converted once per sample; reused by every trial and partial-path check.
    fixed_inputs = sample[quality_features].to_dict()
    initial_eff = sample["Efficiency"]
    initial_path = {key: sample[key] for key in equipment_features}

    def objective(trial):
        path = {key: trial.suggest_int(key, *param_ranges[key]) for key in equipment_features}
        return predict_efficiency(path, fixed_inputs)

    # A fresh study with the same sampler seed is created for every sample.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=RANDOM_STATE)
    )
    start = time.time()
    study.optimize(objective, n_trials=300)
    elapsed = time.time() - start

    best_path = study.best_params
    E_optimized = predict_efficiency(best_path, fixed_inputs)

    row = {
        "Data Point": idx,
        "Initial Efficiency": initial_eff,
        "Optimized Efficiency": E_optimized,
        "Elapsed Time": str(timedelta(seconds=int(elapsed)))
    }

    for key in equipment_features:
        row[f"Initial_{key}"] = initial_path[key]
        row[f"Best_{key}"] = best_path[key]

    # One-at-a-time substitution: start from the initial path and replace a
    # single feature with its optimized value.
    # NOTE: the delta is taken against the recorded Efficiency, not against
    # the model's prediction for the initial path, so it also contains any
    # gap between the model and the recorded value for this sample.
    for key in equipment_features:
        partial_path = initial_path.copy()
        partial_path[key] = best_path[key]
        E_partial = predict_efficiency(partial_path, fixed_inputs)
        row[f"ΔEff_{key}"] = E_partial - initial_eff

    results.append(row)

# Save
df = pd.DataFrame(results)
os.makedirs("additional_analysis", exist_ok=True)
df.to_excel("additional_analysis/Pathwise_Contribution_Trial1.xlsx", index=False)

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import optuna
from datetime import timedelta

# Pathwise contribution analysis (bottom 1% of samples by efficiency).
#
# For each sampled wafer the equipment path is optimized with Optuna, then
# each equipment feature is swapped individually from its initial value to
# its optimized value to see how the predicted efficiency responds.
optuna.logging.set_verbosity(optuna.logging.WARNING)

data = pd.read_excel("multiwafer_database.xlsx")
data.columns = data.columns.str.replace(" ", "_")
model = joblib.load("../1. Training/(Efficiency)best_model_ET.pkl")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Equipment features are searched; quality features stay fixed at the
# values recorded for the sample.
equipment_features = ['P02', 'P03', 'P04', 'P05', 'P06', 'Tester']
quality_features = [
    'P01', 'Dark_Area_%', 'Defect_Area_%', 'Grain_Defect_Area_%',
    'Average_Life_Time', 'Sigma_Life_Time',
    'Resistivity', 'Wafer_Area', 'Vendor_name'
]
# Column order the model expects at prediction time.
trained_features = model.feature_names_in_

# Search space: inclusive (low, high) integer range per equipment feature.
param_ranges = {
    'P02': (1, 6), 'P03': (1, 15),
    'P04': (1, 6), 'P05': (1, 15), 'P06': (1, 27),
    'Tester': (1, 20)
}

# Select bottom 1% samples (1st percentile of efficiency), then draw up to
# 100 of them (seeded).
threshold = np.percentile(data["Efficiency"], 1)
candidates = data[data["Efficiency"] <= threshold]
# Clamp the sample size: DataFrame.sample raises when n exceeds the number
# of rows and replace=False.
samples = candidates.sample(n=min(100, len(candidates)), random_state=RANDOM_STATE)


def predict_efficiency(path, fixed_inputs):
    """Return the model's predicted efficiency for one input row.

    path: dict mapping each equipment feature to its setting.
    fixed_inputs: dict of the sample's quality-feature values.
    """
    input_df = pd.DataFrame([{**path, **fixed_inputs}])[trained_features]
    return model.predict(input_df)[0]


results = []

for idx, sample in samples.iterrows():
    # Converted once per sample; reused by every trial and partial-path check.
    fixed_inputs = sample[quality_features].to_dict()
    initial_eff = sample["Efficiency"]
    initial_path = {key: sample[key] for key in equipment_features}

    def objective(trial):
        path = {key: trial.suggest_int(key, *param_ranges[key]) for key in equipment_features}
        return predict_efficiency(path, fixed_inputs)

    # A fresh study with the same sampler seed is created for every sample.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=RANDOM_STATE)
    )
    start = time.time()
    study.optimize(objective, n_trials=300)
    elapsed = time.time() - start

    best_path = study.best_params
    E_optimized = predict_efficiency(best_path, fixed_inputs)

    row = {
        "Data Point": idx,
        "Initial Efficiency": initial_eff,
        "Optimized Efficiency": E_optimized,
        "Elapsed Time": str(timedelta(seconds=int(elapsed)))
    }

    for key in equipment_features:
        row[f"Initial_{key}"] = initial_path[key]
        row[f"Best_{key}"] = best_path[key]

    # One-at-a-time substitution: start from the initial path and replace a
    # single feature with its optimized value.
    # NOTE: the delta is taken against the recorded Efficiency, not against
    # the model's prediction for the initial path, so it also contains any
    # gap between the model and the recorded value for this sample.
    for key in equipment_features:
        partial_path = initial_path.copy()
        partial_path[key] = best_path[key]
        E_partial = predict_efficiency(partial_path, fixed_inputs)
        row[f"ΔEff_{key}"] = E_partial - initial_eff

    results.append(row)

# Save
df = pd.DataFrame(results)
os.makedirs("additional_analysis", exist_ok=True)
df.to_excel("additional_analysis/Pathwise_Contribution_Trial3.xlsx", index=False)