In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import sys
import glob

In [3]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [4]:
import warnings
# Suppress every FutureWarning for all code executed after this cell.
# NOTE(review): this is a blanket filter, so deprecation notices raised as
# FutureWarning by the libraries used below are hidden as well — confirm that
# is intended, or narrow the filter to the specific warning being silenced.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# All data folders live under '../data', resolved to an absolute path against
# the current working directory at the time this cell runs.
DATA_ROOT_DIR = os.path.abspath('../data')
DATABASE_DATA_DIR, TEST_DATA_DIR, OUTPUT_DATA_DIR = (
    os.path.join(DATA_ROOT_DIR, folder) for folder in ('Database', 'Test', 'Output')
)

# glob.glob returns a list of matching paths; it is empty when the workbook
# does not exist.
files_DATABASE = glob.glob(
    os.path.join(DATABASE_DATA_DIR, 'Database.xlsx')
)

In [7]:
# Basic preprocessing applied to the data that is going to be tested.
def data_preprocess(df, features):
    """Return a cleaned copy of ``df``; the caller's frame is left untouched.

    Steps, in order:
      1. Fill missing values with a per-column default.
      2. Normalise the text of the first four ``features`` columns with
         ``string_preprocess``.
      3. Rewrite known PROCESS aliases to a single spelling.
      4. Cast DEV_STYLE_NUMBER to ``str`` and renumber the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain the columns named in ``features[:4]`` as well
        as 'PROCESS' and 'DEV_STYLE_NUMBER'; the remaining columns handled in
        step 1 are optional.
    features : sequence of str
        Ordered feature column names. Only the first four entries are used
        here (they are the columns passed through ``string_preprocess``).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a column from ``features[:4]``, 'PROCESS' or 'DEV_STYLE_NUMBER'
        is missing from ``df``.
    """
    # Work on a copy: previously the fills/normalisation were written into the
    # caller's frame while the returned frame was a different object.
    df = df.copy()

    # Handle NULL values differently depending on the column.
    # Each column is filled on its own, so one absent optional column no longer
    # disables the fill for every other text column (which let NaN reach
    # string_preprocess and turn into literal text through str()).
    fill_values = {
        # (Char) free-text columns
        'MAT_NAME': 'NO_DATA',
        'PRT_NAME': 'NO_DATA',
        'PROCESS': 'NO_DATA',
        'PRT_PART_NAME': 'NO_DATA',
        'CHEMICALS': 'NO_DATA',
        'DEV_STYLE_NUMBER': 'NO_DATA',
        # (Char) colour
        'PRT_COLOR': 'NO_COLOR',
        # (int) 'GRAD', 'TIMES'
        'TIMES': 0,
        'GRAD': 0,
        # (int) 'ROUNDS'
        'ROUNDS': 1,
    }
    for column, default in fill_values.items():
        if column in df.columns:
            df[column] = df[column].fillna(default)

    # Upper-case everything and collapse whitespace (string_preprocess).
    # Series.map is applied column by column instead of the deprecated
    # DataFrame.applymap; the element-wise result is the same.
    # NOTE(review): string_preprocess also drops tokens made only of digits,
    # so an all-digit value in one of these columns becomes '' — confirm this
    # is intended for DEV_STYLE_NUMBER when it is among the first four features.
    input_features = features[:4]
    for column in input_features:
        df[column] = df[column].map(string_preprocess)

    # Remove PROCESS values that are no longer used (currently disabled).
    #df=df[df.PROCESS!='POP CLEAR SCREEN PRINT']
    #df=df[df.PROCESS!='PRIMER']

    # Replace PROCESS names that differ from the RULE spelling.
    process_aliases = {
        'A-BOND': 'A-BOND SCREEN PRINT',
        'U-BOND': 'U-BOND SCREEN PRINT',
        '3D CLEAR(WG-100 12%) SCREEN PRINT': '3D CLEAR SCREEN PRINT',
        'TOP GLOSSY SCREEN PRINT': 'TOP GLOSSY CLEAR SCREEN PRINT',
        'HD CLEAR SCREEN PRINT': 'HD.CLEAR SCREEN PRINT',
        '3D PUFF SCREEN PRINT': '3D CLEAR SCREEN PRINT',
    }
    for alias, canonical in process_aliases.items():
        df.loc[df['PROCESS'] == alias, 'PROCESS'] = canonical

    # When str and int values are mixed, unify them as str.
    df = df.astype({'DEV_STYLE_NUMBER': 'str'})

    # Renumber the index sequentially.
    df = df.reset_index(drop=True)

    return df

In [9]:
def string_preprocess(s):
    """Normalise one cell value to upper-case text.

    The value is converted with ``str()``, upper-cased and split on
    whitespace; tokens consisting only of digits are discarded and the
    remaining tokens are joined with single spaces.
    """
    tokens = str(s).upper().split()
    return ' '.join(token for token in tokens if not token.isdigit())

In [10]:
# Load the test data.
# Choose the workbook deterministically: os.listdir() returns entries in
# arbitrary order, and this folder can also contain 'check.xlsx' (exported
# into TEST_DATA_DIR further down in this notebook) or Excel lock files
# ('~$...'), none of which are test input.
excel_names = sorted(
    name for name in os.listdir(TEST_DATA_DIR)
    if name.lower().endswith(('.xlsx', '.xlsm', '.xls'))
    and not name.startswith('~$')
    and name != 'check.xlsx'
)
if not excel_names:
    raise FileNotFoundError('No Excel test file found in ' + TEST_DATA_DIR)
filename = excel_names[0]

# The context manager closes the workbook handle once the sheet is parsed.
with pd.ExcelFile(os.path.join(TEST_DATA_DIR, filename)) as xls:
    df = xls.parse('Sheet1', header=0)

# Preprocess the test data.
# The order matters: data_preprocess text-normalises the first four entries.
features=['DEV_STYLE_NUMBER','PROCESS','PRT_PART_NAME','MAT_NAME','PRT_NAME','PRT_COLOR','CHEMICALS','GRAD','TIMES','ROUNDS']
df=data_preprocess(df, features)

In [None]:
# Features: (note left unfinished in the original — TODO: describe the feature columns used below)

In [24]:
# Lift the row limit so the frames displayed below are shown in full
# (this is a global pandas display option).
pd.set_option('display.max_rows', None)

# Rows that share every listed column value with at least one other row;
# keep=False flags all members of each group. TIMES and ROUNDS are not part
# of the key.
dup_df = df.loc[
    df.duplicated(
        subset=['DEV_STYLE_NUMBER','PROCESS','PRT_PART_NAME','MAT_NAME','PRT_NAME','PRT_COLOR','CHEMICALS','GRAD'],
        keep=False,
    )
]
dup_df[features].sort_values(by=features)

Unnamed: 0,DEV_STYLE_NUMBER,PROCESS,PRT_PART_NAME,MAT_NAME,PRT_NAME,PRT_COLOR,CHEMICALS,GRAD,TIMES,ROUNDS
2254,,CLEAR SCREEN PRINT,HEEL TOP,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2313,,CLEAR SCREEN PRINT,HEEL TOP,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2276,,CLEAR SCREEN PRINT,STRAP,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2324,,CLEAR SCREEN PRINT,STRAP,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
5101,,CLEAR SCREEN PRINT,TONGUE LABEL,"VEIL SKIN LITE, PU, 0.45MM, S-MATTE",밀론/신세틱,NO_COLOR,MSP# 60 Series,0,4,2
5104,,CLEAR SCREEN PRINT,TONGUE LABEL,"VEIL SKIN LITE, PU, 0.45MM, S-MATTE",밀론/신세틱,NO_COLOR,MSP# 60 Series,0,4,2
2266,,CLEAR SCREEN PRINT,TONGUE OLAY,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2273,,CLEAR SCREEN PRINT,TONGUE OLAY,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2316,,CLEAR SCREEN PRINT,TONGUE OLAY,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1
2321,,CLEAR SCREEN PRINT,TONGUE OLAY,"HM MILLON 3S, PU, 0.25MM, UM-T, NASA T, 0.10MM",제너럴 밀론/신세틱,NO_COLOR,HAP-60 NC 1C,0,2,1


In [23]:
# Of the rows in dup_df, keep only those whose full feature vector is unique
# (keep=False drops every copy of an exact duplicate), sorted for display.
(
    dup_df[features]
    .drop_duplicates(subset=features, keep=False)
    .sort_values(by=features)
)

Unnamed: 0,DEV_STYLE_NUMBER,PROCESS,PRT_PART_NAME,MAT_NAME,PRT_NAME,PRT_COLOR,CHEMICALS,GRAD,TIMES,ROUNDS
3713,DX7998,COLOR INK SCREEN PRINT,SOCKLINER,"(PM) LINING, KNIT, BRUSHED [ RIO ], REC-RYP",수성전사지 (경성),BLACK,WPL#2010 series,0,4,4
2,DX7999,A-BOND SCREEN PRINT,SOCKLINER,"(PM) LINING, KNIT, BRUSHED [ RIO ], REC-RYP",수성전사지 (경성),NO_COLOR,MSP# 60 Series,0,6,3
0,DX7999,CLEAR SCREEN PRINT,SOCKLINER,"(PM) LINING, KNIT, BRUSHED [ RIO ], REC-RYP",수성전사지 (경성),NO_COLOR,WPL#2010 series,0,4,2
1584,FJ5476,LOW CLEAR SCREEN PRINT,TONGUE LABEL BASE,"AQUA SUEDE, 1.0MM",매시/텍스타일 (경성),NO_COLOR,WPL#2010 series,0,12,6


In [31]:
# Export a hand-picked set of rows (selected by index label) to a workbook
# for manual inspection.
# NOTE(review): the file is written into TEST_DATA_DIR, the same folder the
# test workbook is loaded from via os.listdir(TEST_DATA_DIR)[0] — confirm this
# location is intended.
rows_to_check = [612, 3696, 3713, 471, 3737, 3758, 3779, 2, 469, 3734, 3756, 3777, 0, 2932, 2938, 1584]
df.loc[rows_to_check].to_excel(os.path.join(TEST_DATA_DIR,'check.xlsx'))