#### Generate manually labeled dataset for model inference

Take json files from team manual labeling exercise and append labels to unlabeled WM-811K dataset.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle5 as pickle
import json
import ast

from helpers import *
from IPython.display import clear_output

In [2]:
# load unlabeled data
df = pd.read_pickle('WM-unlabeled-ldc.pkl')

# # remove corrupt imgs
# df['zero_count'] = df.WM.apply(lambda x: np.count_nonzero(x==0))
# df = df[df['zero_count'] < 730].copy()

# # add index column to identify specific wafers 
# df.reset_index(inplace=True)
# df = df.rename(columns={'index':'ID'})

# keep only needed columns
# df = df[['ID', 'waferMap', 'dieSize', 'lotName', 'waferIndex']]

print(df.shape)
df.head()

(638507, 5)


Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex
0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,11
1,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,21
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot3,20
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot5,21
4,4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2,...",533,lot7,9


In [3]:
users = ['daniel', 'erik', 'lea', 'naga', 'scott']

label_decoder = {0: 'none', 1: 'Edge-Ring', 2: 'Edge-Loc', 3: 'Center', 
                 4: 'Loc', 5: 'Scratch', 6: 'Random', 7: 'Donut', 8: 'Near-full', 
                 9: 'CORRUPT'}

In [4]:
df_labeled = pd.DataFrame()

for user in users:
    
    # load labels generated by user
    with open(f'WM_labels_{user}.json', 'r') as file:
        labels=ast.literal_eval(file.read())
        
    # get test lots labeled by user
    test_lots = list(labels.keys())
    
    for lot in test_lots:
        
        # select lot from df
        lot_df = df[df.lotName == lot]
        
        #for i in range(min(25, lot_df.shape[0])):
        for i in range(min(25, len(labels[lot]))):
  
            # loop over wafers per lot and add label to ith row from lot
            df_labeled = df_labeled.append({'ID': lot_df.ID.iloc[i],
                                            'waferMap': lot_df.waferMap.iloc[i],
                                            'dieSize': lot_df.dieSize.iloc[i],
                                            'lotName': lot_df.lotName.iloc[i],
                                            'waferIndex': lot_df.waferIndex.iloc[i],
                                            'failureType': label_decoder[labels[lot][i]],
                                            'user': user}, ignore_index=True)

df_labeled['ID'] = df_labeled['ID'].astype('int')

print(df_labeled.shape)
df_labeled.head()

(3097, 7)


Unnamed: 0,ID,dieSize,failureType,lotName,user,waferIndex,waferMap
0,4863,2770.0,Edge-Loc,lot2006,daniel,1.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,4864,2785.0,none,lot2006,daniel,2.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,4865,2801.0,Loc,lot2006,daniel,3.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,4866,2973.0,Edge-Loc,lot2006,daniel,4.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,4867,3003.0,Edge-Ring,lot2006,daniel,5.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [5]:
df_labeled.groupby('user')['user'].count()

user
daniel    622
erik      600
lea       625
naga      625
scott     625
Name: user, dtype: int64

In [6]:
# mark off waferMaps with max value = 1
# to check that they are not mistakenly labeled as near-full
df_labeled['marker'] = df_labeled.waferMap.apply(lambda x: 1 if np.max(x)==1 else 0)
df_labeled.head()

Unnamed: 0,ID,dieSize,failureType,lotName,user,waferIndex,waferMap,marker
0,4863,2770.0,Edge-Loc,lot2006,daniel,1.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
1,4864,2785.0,none,lot2006,daniel,2.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
2,4865,2801.0,Loc,lot2006,daniel,3.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
3,4866,2973.0,Edge-Loc,lot2006,daniel,4.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
4,4867,3003.0,Edge-Ring,lot2006,daniel,5.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0


In [7]:
check = df_labeled[df_labeled.marker == 1]
print(len(check))
check.groupby('failureType').count()

322


Unnamed: 0_level_0,ID,dieSize,lotName,user,waferIndex,waferMap,marker
failureType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Loc,5,5,5,5,5,5,5
Near-full,113,113,113,113,113,113,113
Scratch,3,3,3,3,3,3,3
none,201,201,201,201,201,201,201


In [8]:
check[check.failureType == 'Near-full'].groupby('user').count()

Unnamed: 0_level_0,ID,dieSize,failureType,lotName,waferIndex,waferMap,marker
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
naga,83,83,83,83,83,83,83
scott,30,30,30,30,30,30,30


In [9]:
check[check.failureType == 'Loc'].groupby('user').get_group('erik')

Unnamed: 0,ID,dieSize,failureType,lotName,user,waferIndex,waferMap,marker
766,33697,515.0,Loc,lot3675,erik,20.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,...",1
786,35636,939.0,Loc,lot3820,erik,15.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",1
820,35767,939.0,Loc,lot3825,erik,24.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",1
1188,53512,776.0,Loc,lot4857,erik,17.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1
1195,53519,776.0,Loc,lot4857,erik,24.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1


In [10]:
check[check.failureType == 'Scratch'].groupby('user').get_group('erik')

Unnamed: 0,ID,dieSize,failureType,lotName,user,waferIndex,waferMap,marker
764,33695,515.0,Scratch,lot3675,erik,18.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,...",1
789,35639,939.0,Scratch,lot3820,erik,18.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",1
1189,53513,776.0,Scratch,lot4857,erik,18.0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1


In [11]:
# correct mislabeled nones
for i in range(len(df_labeled)):
    if df_labeled.marker.loc[i] == 1 and df_labeled.failureType.loc[i] != 'none':
        df_labeled.at[i, 'failureType'] = 'none'

df_labeled[df_labeled.marker == 1].groupby('failureType').count()

Unnamed: 0_level_0,ID,dieSize,lotName,user,waferIndex,waferMap,marker
failureType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
none,322,322,322,322,322,322,322


In [12]:
# add classification model labels
fail_dict = {'none': 8, 'Loc': 0, 'Edge-Loc': 1, 'Center': 2, 'Edge-Ring': 3, 
             'Scratch': 4, 'Random': 5, 'Near-full': 6, 'Donut': 7}
df_labeled['classifyLabels'] = df_labeled['failureType'].apply(lambda x: fail_dict[x])

In [13]:
# # save manually labeled dataset
# with open('WM-manually-labeled-all.pkl', "wb") as f:
#     pickle.dump(df_labeled, f)

In [None]:
# create list of lots labeled by each user
user_lots = [daniel_lot:=[], erik_lot:=[], lea_lot:=[], naga_lot:=[], scott_lot:=[]]

for i in range(len(users)):
    x = df_labeled[df_labeled.user == users[i]].groupby('lotName')['lotName'].count().keys().tolist()
    user_lots[i].extend(x)

In [None]:
def show_lots(user):
    test_lots = user_lots[users.index(user)]
    for lot in test_lots: 
        plot_lot(df_labeled, lot, fig_size=(8,8), col='waferMap', cmap='gray_r')
        input('Press enter to see next lot')
        clear_output(wait=True)

In [None]:
show_lots('lea')