In [3]:
import os
import warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# --- Constants ---------------------------------------------------------------
# Seed used for NumPy's global RNG and for the K-means runs further down.
RANDOM_STATE = 42

# Column on which the per-group percentile trimming is applied.
TARGET_COL = "Efficiency"

# Columns fed to the clustering step, spelled as in the raw workbook.
selected_features_raw = [
    "Dark Area %",
    "Defect Area %",
    "Grain Defect Area %",
    "Average Life Time",
    "Sigma Life Time",
    "Resistivity",
    "Wafer Area",
    "Vendor name",
]

# Directory that receives the workbook written by each filtering iteration.
output_dir = "Multiwafer_Database"

# --- Environment -------------------------------------------------------------
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")
np.random.seed(RANDOM_STATE)

# --- Data --------------------------------------------------------------------
# The raw workbook is expected in the current working directory.
file_path = os.path.join(os.getcwd(), "multiwafer_database_raw.xlsx")
data = pd.read_excel(file_path)

# Create the output directory up front; an existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

# Utilities
def remove_groupwise_outliers(
    df: pd.DataFrame,
    group_col: str,
    target_col: str,
    lower_q: float = 0.01,
    upper_q: float = 0.99,
    min_group_size: int = 30,
) -> pd.DataFrame:
    """Drop rows whose target lies outside its group's quantile band.

    For every group of ``group_col`` with at least ``min_group_size`` rows,
    only rows with ``lower_q``-quantile <= target <= ``upper_q``-quantile
    (both bounds inclusive, quantiles computed within the group) are kept.
    Groups with fewer rows are passed through untouched.

    The filter is computed with per-group lookups and a single boolean mask
    instead of ``groupby(...).apply``. That avoids depending on ``apply``
    receiving the grouping column (deprecated in pandas 2.2), keeps all
    columns including ``group_col``, and always preserves the input row
    order and index labels.

    Args:
        df: Input frame; it is not modified.
        group_col: Name of the column that defines the groups.
        target_col: Name of the numeric column to trim on.
        lower_q: Lower quantile in [0, 1].
        upper_q: Upper quantile in [0, 1].
        min_group_size: Groups with fewer rows than this are not trimmed.

    Returns:
        A new DataFrame containing the surviving rows. Rows whose group key
        is missing are dropped, matching the default ``groupby`` behaviour.
    """
    if df.empty:
        # Nothing to group; hand back an independent empty frame.
        return df.copy()

    keys = df[group_col]
    target = df[target_col]
    per_group = df.groupby(group_col)[target_col]

    # Broadcast each group's statistics back onto its rows. Rows with a
    # missing key get NaN everywhere and fail both tests below.
    group_size = keys.map(per_group.size())
    lo = keys.map(per_group.quantile(lower_q))
    hi = keys.map(per_group.quantile(upper_q))

    too_small_to_trim = group_size < min_group_size
    within_band = (target >= lo) & (target <= hi)

    return df[too_small_to_trim | within_band]


# Iterative filtering (K-means + percentile trimming)
#
# Each pass: drop incomplete rows, standardise the selected features, cluster
# the rows with K-means, trim the top/bottom 1% of the target inside every
# sufficiently large cluster, and save the surviving rows to a new workbook.
N_ITER = 100
N_GROUPS = 1000

# Columns that must be non-null for a row to take part in a pass.
required_cols = selected_features_raw + [TARGET_COL]

df_iter = data.copy()

for i in range(1, N_ITER + 1):
    # dropna + reset_index both return new frames, so df_iter never aliases
    # `data` and no extra defensive copy is needed before adding a column.
    df_iter = df_iter.dropna(subset=required_cols).reset_index(drop=True)

    if df_iter.empty:
        # StandardScaler / KMeans cannot be fitted on zero rows; stop cleanly
        # instead of failing inside scikit-learn.
        print(f"[Iteration {i:03d}] no rows left -> stopping")
        break

    # Only the raw values are passed on, so column names are irrelevant here.
    # NOTE(review): "Vendor name" is scaled together with the numeric
    # features - this assumes it is numerically encoded in the workbook;
    # confirm.
    X_scaled = StandardScaler().fit_transform(
        df_iter[selected_features_raw].values
    )

    # Distance-based grouping via K-means clustering. The cluster count is
    # capped at the number of rows so K-means stays valid on small frames.
    n_clusters_eff = min(N_GROUPS, X_scaled.shape[0])
    kmeans = KMeans(
        n_clusters=n_clusters_eff,
        random_state=RANDOM_STATE,
        n_init=10,
    )
    # Overwrites the labels left over from the previous pass, if any.
    df_iter["KMeans_Group"] = kmeans.fit_predict(X_scaled)

    # Remove top/bottom 1% within each group
    df_iter = remove_groupwise_outliers(
        df=df_iter,
        group_col="KMeans_Group",
        target_col=TARGET_COL,
        lower_q=0.01,
        upper_q=0.99,
        min_group_size=30,
    ).reset_index(drop=True)

    save_path = os.path.join(output_dir, f"multiwafer_database_{i}.xlsx")
    df_iter.to_excel(save_path, index=False)

    print(f"[Iteration {i:03d}] saved -> {save_path}")

[Iteration 001] saved -> Multiwafer_Database/multiwafer_database_1.xlsx
[Iteration 002] saved -> Multiwafer_Database/multiwafer_database_2.xlsx
[Iteration 003] saved -> Multiwafer_Database/multiwafer_database_3.xlsx
[Iteration 004] saved -> Multiwafer_Database/multiwafer_database_4.xlsx
[Iteration 005] saved -> Multiwafer_Database/multiwafer_database_5.xlsx
[Iteration 006] saved -> Multiwafer_Database/multiwafer_database_6.xlsx
[Iteration 007] saved -> Multiwafer_Database/multiwafer_database_7.xlsx
[Iteration 008] saved -> Multiwafer_Database/multiwafer_database_8.xlsx
[Iteration 009] saved -> Multiwafer_Database/multiwafer_database_9.xlsx
[Iteration 010] saved -> Multiwafer_Database/multiwafer_database_10.xlsx
[Iteration 011] saved -> Multiwafer_Database/multiwafer_database_11.xlsx
[Iteration 012] saved -> Multiwafer_Database/multiwafer_database_12.xlsx
[Iteration 013] saved -> Multiwafer_Database/multiwafer_database_13.xlsx
[Iteration 014] saved -> Multiwafer_Database/multiwafer_data