### Best Model
- Autoencoder 

In [None]:
!pip install nbformat nbclient

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from model.AE_anbormaly_detection import Autoencoder
from model.Deep_SVDD import DeepSVDD
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import random
import wandb as wb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
wb.login(key='df322611202ac3afbc7fa10af4de4f06ebc84103') 

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\hyr69\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mglasshan914[0m ([33mglasshan9140[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
random.seed(42)  # For reproducibility
def sampling(df,normal_df,abnormal_df,features):
    #test에 사용할 normal data sampling
    test_lot_index=random.sample(normal_df.Lot.unique().tolist(),9)
    test_lot_index+=abnormal_df.Lot.unique().tolist()

    #train test split, test set에 normal Lot 9개, abnormal Lot 9개(전체)
    train_df=normal_df[~normal_df['Lot'].isin(test_lot_index)]
    test_df=df[df['Lot'].isin(test_lot_index)]

    try:
        features.remove('Lot')  
        features.remove('class')
    except:
        pass

    #scaled data
    scaler=StandardScaler()
    scaled_x_train=scaler.fit_transform(train_df[features])
    scaled_x_test=scaler.transform(test_df[features])
    return scaled_x_train,scaled_x_test,test_df,test_lot_index,features

In [4]:
df=pd.read_csv('E:/glass_git/ML-DL/Signal&Table/data/cromate/preped/df_merged.csv')
poly_features=['pH','Temp','Voltage','run_time']
features=['pH','Temp','Voltage','run_time','month','day','hour','minutes','seconds']
target=['class']

#polynominal feature 생성
poly=PolynomialFeatures(degree=(2,3),interaction_only=True,include_bias=True)
new_feature=poly.fit_transform(df[poly_features])

#TSNE로 차원 축소 변수 2개 생성
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(df[poly_features])

# 데이터프레임에 새로운 feature와 TSNE 변수 추가
ddf=pd.concat([df[features+target], pd.DataFrame(new_feature, columns=poly.get_feature_names_out(poly_features))], axis=1)
ddf=pd.concat([ddf, pd.DataFrame(X_embedded, columns=['tnse1','tsne2'])], axis=1)
ddf.drop(columns=['1'],inplace=True)

ddf['class']=df['class']
ddf['Lot']=df['Lot']
normal_df=ddf[ddf['class']==0]
abnormal_df=ddf[ddf['class']==1]

train_features=ddf.columns.tolist()


In [None]:
import wandb as wb

def autoencoder_train(scaled_x_train, scaled_x_test, test_df, test_lot_index, features):
    with wb.init(project='Signal_abnormal_detection') as run:
        cfg = run.config

        # 안전하게 기본값 부여
        lr       = getattr(cfg, "learning_rate", 1e-3)
        epochs   = int(getattr(cfg, "epochs", 50))
        opt_name = getattr(cfg, "optimizer", "adam")
        thr_pct  = float(getattr(cfg, "threshold", 95))

        wb.log({"test_lot_index": test_lot_index, "features": features})

        scaled_x_train = torch.tensor(scaled_x_train, dtype=torch.float32)
        scaled_x_test  = torch.tensor(scaled_x_test,  dtype=torch.float32)

        train_loader = DataLoader(TensorDataset(scaled_x_train), batch_size=256, shuffle=False)

        # AE는 입력/출력 차원 동일해야 함!
        model = Autoencoder(input_dim=scaled_x_train.shape[1], output_dim=1)

        criterion = nn.MSELoss()
        if opt_name == "adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        elif opt_name == "sgd":
            optimizer = optim.SGD(model.parameters(), lr=lr)
        elif opt_name == "adamw":
            optimizer = optim.AdamW(model.parameters(), lr=lr)
        else:
            raise ValueError(f"Unknown optimizer: {opt_name}")

        # 디버그 출력
        print("lr:", lr, "optimizer:", opt_name, "epochs:", epochs)

        # train
        for _ in range(epochs):
            epoch_loss = 0.0
            for (xb,) in train_loader:
                optimizer.zero_grad()
                out  = model(xb)
                loss = criterion(out, xb)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            wb.log({"training_loss": epoch_loss / max(1, len(train_loader))})

        # reconstruction error (테스트셋)
        with torch.no_grad():
            recon  = model(scaled_x_test)
            errors = torch.mean((scaled_x_test - recon) ** 2, dim=1).numpy()
            wb.use_artifact({"reconstruction_error": epoch_loss / max(1, len(train_loader))})

        # 임계값: 퍼센타일(예: 95) — 보통은 "정상 데이터"의 error 분포에서 뽑는 걸 권장
        threshold = np.percentile(errors, thr_pct)
        AE_pred = (errors > threshold).astype(int)

        acc = accuracy_score(test_df[target], AE_pred)  # target 변수가 전역/상위에서 정의되어 있어야 함
        wb.log({"accuracy": acc})

# --- 스윕 설정 ---
sweep_configuration = {
    "method": "random",
    "metric": {"goal": "maximize", "name": "accuracy"},
    "parameters": {
        "optimizer": {"values": ["adam", "sgd", "adamw"]},
        "learning_rate": {"min": 1e-4, "max": 1e-1},
        "epochs": {"values": [50, 100, 150]},
        "threshold": {"values": [95]},
    },
}

# 샘플링/데이터 준비
scaled_x_train, scaled_x_test, test_df, test_lot_index, features = sampling(ddf, normal_df, abnormal_df, features)

# 0-인자 함수(클로저)로 감싸서 전달!
trainer_fn = lambda: autoencoder_train(scaled_x_train, scaled_x_test, test_df, test_lot_index, features)

sweep_id = wb.sweep(sweep=sweep_configuration, project="Signal_abnormal_detection")
wb.agent(sweep_id, function=trainer_fn, count=10)


Create sweep with ID: 3hyg1h1d
Sweep URL: https://wandb.ai/glasshan9140/Signal_abnormal_detection/sweeps/3hyg1h1d


[34m[1mwandb[0m: Agent Starting Run: kfl6ka0c with config:
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.016289809875122623
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	threshold: 95


lr: 0.016289809875122623 optimizer: adamw epochs: 50


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▆▃▂▂▂▂▃▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁

0,1
accuracy,0.5278
training_loss,0.75058


[34m[1mwandb[0m: Agent Starting Run: 2v8udwid with config:
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.016017748606744257
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	threshold: 95


lr: 0.016017748606744257 optimizer: sgd epochs: 50


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▇▇▆▅▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.53444
training_loss,0.75073


[34m[1mwandb[0m: Agent Starting Run: fqabpstq with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.03374522233952644
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 95


lr: 0.03374522233952644 optimizer: adam epochs: 100


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,▃▁▁▃▂▂▂▂▂▂▂▃▃▂▂█▆▅▅▅▅█▅▅▅▆▅▅▅▅▅▆▅▃▂▂▂▃▂▂

0,1
accuracy,0.53942
training_loss,0.76768


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nsv3bdu6 with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.0913796375035852
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	threshold: 95


lr: 0.0913796375035852 optimizer: sgd epochs: 100


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▃▂▂▂▂▂▂▂▂▂▃▃

0,1
accuracy,0.54606
training_loss,0.69927


[34m[1mwandb[0m: Agent Starting Run: oe53rj0v with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.029545404316146145
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	threshold: 95


lr: 0.029545404316146145 optimizer: sgd epochs: 100


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▆▅▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁

0,1
accuracy,0.52282
training_loss,0.56714


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uhonlqlm with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.051364742771434053
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	threshold: 95


lr: 0.051364742771434053 optimizer: adamw epochs: 100


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,▁▅▅▄▄▆▇██▅██▇▇██▅█▇█▇▅▇▅▇▄▇▄▄▄▅▅▄▄▄▄▆▄▄▅

0,1
accuracy,0.54938
training_loss,0.76137


[34m[1mwandb[0m: Agent Starting Run: 3adztosc with config:
[34m[1mwandb[0m: 	epochs: 150
[34m[1mwandb[0m: 	learning_rate: 0.022553010539216232
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	threshold: 95


lr: 0.022553010539216232 optimizer: sgd epochs: 150


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▆▅▄▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.53278
training_loss,0.72364


[34m[1mwandb[0m: Agent Starting Run: hiroli22 with config:
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.09430516802285444
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 95


lr: 0.09430516802285444 optimizer: adam epochs: 50


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,▂▁▁▁▁▂▂▂▂▂▂█████████████████████████████

0,1
accuracy,0.53278
training_loss,0.9499


[34m[1mwandb[0m: Agent Starting Run: xc2p3m8k with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.01510605821143317
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 95


lr: 0.01510605821143317 optimizer: adam epochs: 100


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,▃▄▄▂█▁▁▁▂▂▂▂▇▅▅▅▆▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅

0,1
accuracy,0.53942
training_loss,0.79352


[34m[1mwandb[0m: Agent Starting Run: 3906vya6 with config:
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.08624930681494536
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	threshold: 95


lr: 0.08624930681494536 optimizer: sgd epochs: 50


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
training_loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.52282
training_loss,0.66667


In [None]:


def autoencoder_train(scaled_x_train, scaled_x_test,test_df):
    #Autoencoder
    scaled_x_train = torch.tensor(scaled_x_train, dtype=torch.float32)  # 변환
    train_loader = DataLoader(TensorDataset(scaled_x_train), batch_size=256, shuffle=False)
    model = Autoencoder()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # train
    for epoch in range(50):
        for i,data in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(data[0])
            loss = criterion(outputs, data[0])
            loss.backward()
            optimizer.step()

    #reconstruciont error
    X_test = torch.tensor(scaled_x_test, dtype=torch.float32)

    with torch.no_grad():
        recon = model(X_test)
        errors = torch.mean((X_test - recon) ** 2, dim=1)  # MSE per sample
        wb.log({"AE_loss": errors})

    # threhold (정상 데이터 기준 99% 분위수)
    threshold = np.percentile(errors[:len(scaled_x_train)], 85)
    AE_pred = (errors > threshold).int()

    acc=accuracy_score(test_df[target], AE_pred)
    return acc*100

def deepsvdd_train(scaled_x_train, scaled_x_test, test_df):
    scaled_x_train = torch.tensor(scaled_x_train, dtype=torch.float32)  # 변환
    train_loader = DataLoader(TensorDataset(scaled_x_train), batch_size=256, shuffle=False)
    X_test = torch.tensor(scaled_x_test, dtype=torch.float32)
    # ---- Choose objective ----
    deepsvdd = DeepSVDD(in_dim=9, rep_dim=16, objective="one-class")             # hard-boundary
    # deepsvdd = DeepSVDD(in_dim=9, rep_dim=16, objective="soft-boundary", nu=0.05)  # soft-boundary

    deepsvdd.fit(train_loader, lr=1e-3, weight_decay=1e-6, epochs=50, R_update_freq=5)
    if deepsvdd.objective == "one-class":
        # 학습 분포의 95% 분위수를 임계값으로
        with torch.no_grad():
                s_train = deepsvdd.score(scaled_x_train)
        thr = torch.quantile(s_train, 0.97).item()
        y_pred, _ = deepsvdd.predict(X_test, threshold=thr)
        deepsvdd_pred= y_pred

    deepsvdd_thr_pred = (deepsvdd_pred > thr).int()
    acc=accuracy_score(test_df[target], deepsvdd_thr_pred)
    return acc*100





In [None]:
# X=ddf.drop(columns=['class'])
# y=ddf['class']
# clf = ExtraTreesClassifier(n_estimators=5000, random_state=42, n_jobs=-1)
# clf = clf.fit(X, y)
# clf.feature_importances_  

# tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
# best_df=ddf[:][tree_importance_sorted_idx[:100]]
# best_df.head()

# feature importance 시각화
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import matplotlib
# from sklearn.inspection import permutation_importance
# from sklearn.utils.fixes import parse_version

# def plot_permutation_importance(clf, X, y, ax):
#     result = permutation_importance(clf, X, y, n_repeats=10, random_state=42, n_jobs=-1)
#     perm_sorted_idx = result.importances_mean.argsort()

#     # `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been
#     # renamed to `tick_labels`. The following code handles this, but as a
#     # scikit-learn user you probably can write simpler code by using `labels=...`
#     # (matplotlib < 3.9) or `tick_labels=...` (matplotlib >= 3.9).
#     tick_labels_parameter_name = (
#         "tick_labels"
#         if parse_version(matplotlib.__version__) >= parse_version("3.9")
#         else "labels"
#     )
#     tick_labels_dict = {tick_labels_parameter_name: X.columns[perm_sorted_idx]}
#     ax.boxplot(result.importances[perm_sorted_idx].T, vert=False, **tick_labels_dict)
#     ax.axvline(x=0, color="k", linestyle="--")
#     return ax

# mdi_importances = pd.Series(clf.feature_importances_, index=X.columns)
# tree_importance_sorted_idx = np.argsort(clf.feature_importances_)

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 50))
# mdi_importances.sort_values().plot.barh(ax=ax1)
# ax1.set_xlabel("Gini importance")
# plot_permutation_importance(clf, X, y, ax2)
# ax2.set_xlabel("Decrease in accuracy score")
# fig.suptitle(
#     "Impurity-based vs. permutation importances on multicollinear features (train set)"
# )
# _ = fig.tight_layout()

In [15]:
reuslts={'deepsvdd':[], 'autoencoder':[]}
features=ddf.columns.tolist()
features.remove('class')  
print(features)

for i in range(10): 
    scaled_x_train,scaled_x_test,test_df=sampling(ddf,normal_df,abnormal_df,features)
    # deepsvdd_acc=deepsvdd_train(scaled_x_train, scaled_x_test, test_df)
    ae_acc=autoencoder_train(scaled_x_train, scaled_x_test,test_df)
    # reuslts['deepsvdd'].append(deepsvdd_acc)
    reuslts['autoencoder'].append(ae_acc)
    wb.log({"Accuracy": ae_acc})
reuslts

['pH', 'Temp', 'Voltage', 'run_time', 'month', 'day', 'hour', 'minutes', 'seconds', 'pH Temp', 'pH Voltage', 'pH run_time', 'Temp Voltage', 'Temp run_time', 'Voltage run_time', 'pH Temp Voltage', 'pH Temp run_time', 'pH Voltage run_time', 'Temp Voltage run_time', 'tnse1', 'tsne2', 'Lot']


{'deepsvdd': [],
 'autoencoder': [56.79933665008292,
  57.759336099585056,
  56.666666666666664,
  49.12427022518766,
  47.9933110367893,
  55.730897009966775,
  50.91362126245848,
  56.390977443609025,
  48.53801169590643,
  53.79482902418682]}