# **Project**: Semiconductor Wafer Defect Detection using Deep Learning (CNN)

## 1️⃣ Import Libraries

In [1]:
# Basic setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.io as sio
# NOTE(review): `idx` is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
idx = pd.RangeIndex(start=0, stop=10)
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


**Explanation:**

We’re using standard libraries for data loading (scipy.io), preprocessing (numpy, pandas), visualization (matplotlib, seaborn), and model building (TensorFlow/Keras).

## 2️⃣ Load Dataset

**Download the dataset**

Use the Kaggle API to ensure you get the full, intact file:

In [2]:
!pip install kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"YOUR_KAGGLE_USERNAME","key":"YOUR_KAGGLE_KEY"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d qingyi/wm811k-wafer-map
!unzip -o wm811k-wafer-map.zip -d wm811k_dataset


Dataset URL: https://www.kaggle.com/datasets/qingyi/wm811k-wafer-map
License(s): CC0-1.0
Downloading wm811k-wafer-map.zip to /content
 98% 146M/149M [00:00<00:00, 1.52GB/s]
100% 149M/149M [00:00<00:00, 1.41GB/s]
Archive:  wm811k-wafer-map.zip
  inflating: wm811k_dataset/LSWMD.pkl  


In [3]:
import os

# Show what the archive extraction produced.
dataset_dir = 'wm811k_dataset'
os.listdir(dataset_dir)


['LSWMD.pkl']

**Step-by-Step: Safely Loading LSWMD.pkl**
 1. Check the File Size First





In [4]:
import os

# Report the pickle's size in MiB (bytes / 1024**2) before loading it.
pickle_size_bytes = os.path.getsize('wm811k_dataset/LSWMD.pkl')
print(pickle_size_bytes / 1024**2, "MB")


1998.430230140686 MB


2. Load the Dataset

In [5]:
# Load the pickled dataset into `data`.
import sys, pickle, pandas as pd
import pandas.core.indexes as new_indexes
# Register `pandas.core.indexes` under the module name `pandas.indexes` so that
# imports of that name triggered during unpickling resolve. Presumably the file
# was written by an old pandas release that still used that path — TODO confirm.
# This must run before pickle.load below.
sys.modules['pandas.indexes'] = new_indexes  # redirect old pandas path

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles obtained from a source you trust.
# NOTE(review): encoding='latin1' is presumably required because the pickle was
# produced under Python 2 — confirm.
with open('wm811k_dataset/LSWMD.pkl', 'rb') as f:
    data = pickle.load(f, encoding='latin1')


3. Inspect the Loaded Data

In [6]:
# Print the container type and its column index, one per line.
print(type(data), data.keys(), sep="\n")


<class 'pandas.core.frame.DataFrame'>
Index(['waferMap', 'dieSize', 'lotName', 'waferIndex', 'trianTestLabel',
       'failureType'],
      dtype='object')


In [7]:
# Pull out the columns used in the rest of the notebook.
wafer_maps = data['waferMap']
labels = data['failureType']
# The column name is misspelled ('trianTestLabel') in the dataset itself.
# It is stored as `train_test_labels` so that it does not shadow
# sklearn's `train_test_split` function imported in the first cell.
train_test_labels = data['trianTestLabel']

print(f"Number of wafer maps: {len(wafer_maps)}")
# Each failureType entry is a nested array; [0][0] unwraps the label string.
print(f"Example label: {labels[0][0][0]}")
print(f"Example wafer map shape: {wafer_maps[0].shape}")

Number of wafer maps: 811457
Example label: none
Example wafer map shape: (45, 48)


In [8]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import cv2

# Keep only the rows whose failureType entry is non-empty, then unwrap each
# nested label entry into a plain string.
has_label = data['failureType'].map(len).gt(0)
filtered_data = data[has_label]
labels = np.array([entry[0][0] for entry in filtered_data['failureType']])

# Bring every (filtered) wafer map onto a common grid. Nearest-neighbour
# interpolation is used so no new in-between pixel values are introduced.
RESIZE_SHAPE = (48, 48)
X = np.array([
    cv2.resize(wafer, RESIZE_SHAPE, interpolation=cv2.INTER_NEAREST)
    for wafer in filtered_data['waferMap']
])

# Scale pixel values by the global maximum.
X = X / X.max()

# Map string labels to integer class ids.
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Shape summary
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Classes:", encoder.classes_)

X shape: (172950, 48, 48)
y shape: (172950,)
Classes: ['Center' 'Donut' 'Edge-Loc' 'Edge-Ring' 'Loc' 'Near-full' 'Random'
 'Scratch' 'none']


In [9]:
import numpy as np

# Fold the 'Loc' class into 'Edge-Loc'; every other label is kept unchanged.
# This reduces the number of distinct classes by one.
y_labels = np.array(['Edge-Loc' if name == 'Loc' else name for name in labels])

# Re-fit the encoder on the merged label set so class ids are contiguous.
encoder = LabelEncoder()
y = encoder.fit_transform(y_labels)
print("Updated classes:", encoder.classes_)


Updated classes: ['Center' 'Donut' 'Edge-Loc' 'Edge-Ring' 'Near-full' 'Random' 'Scratch'
 'none']


# Baseline Models (before building the CNN)

Build and evaluate two baseline models for wafer defect classification:

1. Logistic Regression

2. Naive Bayes

…and handle class imbalance using:

1. Class weights or

2. Resampling (random oversampling of the minority classes)

**Step 1: Prepare Data for Baselines**

Since both Logistic Regression and Naive Bayes require 1D feature vectors, we’ll flatten the wafer images and optionally sample a manageable subset (since 170K+ samples may be heavy).

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# --- flatten each wafer map to a 1D vector and make a stratified split ---
# These arrays were previously expected to exist from an earlier (now missing)
# cell, which made the notebook fail on Restart & Run All.
# reshape(len(X), -1) works whether or not X already has a channel axis.
X_flat = X.reshape(len(X), -1)
X_train_flat, X_test_flat, y_train_flat, y_test_flat = train_test_split(
    X_flat, y, test_size=0.2, random_state=42, stratify=y
)

# --- sample a smaller subset for quick baselines ---
# NOTE(review): sklearn's resample() defaults to replace=True, so these
# subsets are drawn with replacement and may contain duplicate rows.
X_train_small, y_train_small = resample(X_train_flat, y_train_flat,
                                        n_samples=20000, random_state=42, stratify=y_train_flat)
X_test_small, y_test_small = resample(X_test_flat, y_test_flat,
                                      n_samples=5000, random_state=42, stratify=y_test_flat)

print("Train subset:", X_train_small.shape)
print("Test subset:", X_test_small.shape)



Train subset: (20000, 2304)
Test subset: (5000, 2304)


**Step 2: Dimensionality Reduction with PCA**



In [31]:
# Compress the flattened 2304-pixel vectors down to 100 principal components
# so the baseline classifiers train quickly.
N_PCA_COMPONENTS = 100
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)

# The projection is fitted on the training subset only and then reused,
# unchanged, for the test subset.
X_train_pca = pca.fit_transform(X_train_small)
X_test_pca = pca.transform(X_test_small)

print("After PCA:", X_train_pca.shape)



After PCA: (20000, 100)


**Step 3: Logistic Regression Baseline**

In [22]:
from sklearn.utils.class_weight import compute_class_weight

# Compute balanced weights for the subset and key them by the actual class id.
# (dict(enumerate(...)) would silently mis-assign weights whenever the class
# ids present in y_train_small are not exactly 0..k-1.)
classes = np.unique(y_train_small)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train_small)
cw_dict = dict(zip(classes, class_weights))

# Train Logistic Regression. The 'sag' solver is stochastic, so random_state
# is fixed to make the reported metrics reproducible across runs.
logreg = LogisticRegression(max_iter=100, solver='sag',
                            class_weight=cw_dict, n_jobs=-1, verbose=0,
                            random_state=42)
logreg.fit(X_train_pca, y_train_small)

# Predict on the PCA-projected test subset
y_pred_lr = logreg.predict(X_test_pca)

print("=== Logistic Regression (PCA + subset) ===")
print("Accuracy:", round(accuracy_score(y_test_small, y_pred_lr), 4))
print(classification_report(y_test_small, y_pred_lr, target_names=encoder.classes_))


=== Logistic Regression (PCA + subset) ===
Accuracy: 0.6936
              precision    recall  f1-score   support

      Center       0.57      0.94      0.71       124
       Donut       0.30      0.94      0.45        16
    Edge-Loc       0.20      0.48      0.28       254
   Edge-Ring       0.90      0.96      0.93       280
   Near-full       0.33      1.00      0.50         4
      Random       0.26      0.24      0.25        25
     Scratch       0.02      0.34      0.03        35
        none       0.98      0.69      0.81      4262

    accuracy                           0.69      5000
   macro avg       0.44      0.70      0.49      5000
weighted avg       0.91      0.69      0.77      5000





**Naive Bayes Baseline (with Oversampling)**

In [32]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the PCA-projected training subset so the classifier is fitted
# on a class-balanced set; the test subset is left untouched.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train_pca, y_train_small)

# Fit Gaussian Naive Bayes on the balanced data and score the test subset.
nb = GaussianNB().fit(X_res, y_res)
y_pred_nb = nb.predict(X_test_pca)

nb_accuracy_rounded = round(accuracy_score(y_test_small, y_pred_nb), 4)
nb_report = classification_report(y_test_small, y_pred_nb, target_names=encoder.classes_)

print("\n=== Naive Bayes (PCA + oversampled subset) ===")
print("Accuracy:", nb_accuracy_rounded)
print(nb_report)



=== Naive Bayes (PCA + oversampled subset) ===
Accuracy: 0.6626
              precision    recall  f1-score   support

      Center       0.85      0.63      0.72       124
       Donut       0.46      0.81      0.59        16
    Edge-Loc       0.27      0.48      0.34       254
   Edge-Ring       0.86      0.86      0.86       280
   Near-full       0.75      0.75      0.75         4
      Random       0.49      0.84      0.62        25
     Scratch       0.01      0.49      0.03        35
        none       0.97      0.66      0.79      4262

    accuracy                           0.66      5000
   macro avg       0.58      0.69      0.59      5000
weighted avg       0.91      0.66      0.76      5000



**Quick Comparison Summary**

In [23]:
# Side-by-side accuracy of the two baselines on the same test subset.
acc_lr = accuracy_score(y_test_small, y_pred_lr)
acc_nb = accuracy_score(y_test_small, y_pred_nb)

print("\nSummary of Baselines:")
print(f"Logistic Regression Accuracy: {acc_lr:.3f}")
print(f"Naive Bayes Accuracy:         {acc_nb:.3f}")



Summary of Baselines:
Logistic Regression Accuracy: 0.694
Naive Bayes Accuracy:         0.663


In [33]:
import tensorflow as tf

# List the GPUs TensorFlow can see; an empty list means none were detected.
visible_gpus = tf.config.list_physical_devices('GPU')
print(visible_gpus)


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## **Advanced Model: Convolutional Neural Network (CNN)**

In [34]:
# Import Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt


**Prepare Your Data**

Ensure:

Your X contains wafer maps (48×48)

Your y contains encoded defect classes

In [None]:
from sklearn.model_selection import train_test_split

# Add the channel dimension for the CNN: (N, H, W) -> (N, H, W, 1).
# Guarded on ndim so that re-running this cell does not keep appending axes.
if X.ndim == 3:
    X = np.expand_dims(X, axis=-1)
X = X / 255.0 if X.max() > 1 else X  # normalize if not already scaled

# Stratified train-test split so both sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)
