# 모듈 가져오기 (IMPORT 단계)

In [1]:
# 데이터 구성: Series, DataFrame
import pandas as pd
import numpy as np


#탐색적 분석을 위한 패키지
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale,minmax_scale,robust_scale
from scipy.stats import shapiro

#ARIMA
from statsmodels.tsa.arima_model import ARIMA



#matplotlib 이용한 background 그래프 실행 및 결과 확인 
%matplotlib inline


#export_graphviz: 나무구조 생성 및 저장
from sklearn.tree import export_graphviz
#graphviz: 나무 구조 시각화(.dot 확장자 파일 불러오기 등)
import graphviz


# 데이터 분할: train,test
from sklearn.model_selection import train_test_split

#데이터 scaling => 거리기반 알고리즘은 필수!!!
from sklearn.preprocessing import StandardScaler
# 데이터 분할: train,test
from sklearn.model_selection import train_test_split
# 분류 DT
from sklearn.tree import DecisionTreeClassifier
#분류  RF
from sklearn.ensemble import RandomForestClassifier
# 분류 GB
from sklearn.ensemble import GradientBoostingClassifier
# 분류 SVM
from sklearn.svm import SVC
# 분류 KNN
from sklearn.neighbors import KNeighborsClassifier
# 분류 NN
from sklearn.neural_network import MLPClassifier

# 최적모델, 파라미터 탐색
from sklearn.model_selection import GridSearchCV
# 모델 성능 평가 
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [2]:
# 한글 폰트 적용
font_path = '/usr/share/fonts/opentype/noto/NotoSerifCJK-ExtraLight.ttc'
font_name = matplotlib.font_manager.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family = font_name)

In [3]:
#결과 창 확대
pd.set_option('display.max_rows', 100000000000) # 최대 줄 수 설정
pd.set_option('display.max_columns', 100000000000) # 최대 열 수 설정 
pd.set_option('display.width', 1000) # 표시할 가로의 길이

## 데이터 받아오기

In [4]:
# 데이터 구성하기(MFG_MACHINE)
MFG_MACHINE = pd.read_csv("/home/pirl/빅데이터 프로젝트/cleansing/outlier/MFG_MACHINE_NULL_COL_OUT.csv", engine="python", encoding='EUC-KR')
#MFG_MACHINE = MFG_MACHINE.drop(MFG_MACHINE.columns[0], axis=1)

In [5]:
# 데이터 구성하기(MFG_PROCESS)
MFG_PROCESS = pd.read_csv("/home/pirl/빅데이터 프로젝트/cleansing/outlier/MFG_PROCESS_NULL_COL_OUT.csv", engine="python", encoding='EUC-KR')
#MFG_PROCESS = MFG_PROCESS.drop(MFG_PROCESS.columns[0], axis=1)

In [6]:
# 데이터 구성하기(MFG_TAT)
MFG_TAT = pd.read_csv("/home/pirl/빅데이터 프로젝트/cleansing/outlier/MFG_TAT_NULL_COL_OUT.csv", engine="python",encoding='EUC-KR')
#MFG_TAT = MFG_TAT.drop(MFG_TAT.columns[0], axis=1)

## MACHINE 

In [7]:
# drop: X변수와 변수 삭제
df_raw_x = MFG_MACHINE.drop(["JUDGE","PANEL_ID","LOT_ID","PANEL_ID","TIME_1BLACK","TIME_2AG","TIME_3BUS","TIME_4DIELEC","TIME_5MGO","TIME_6PHOS","TIME_7PASTE","TIME_8AGING"], axis=1, inplace=False)
df_raw_y = MFG_MACHINE["JUDGE"]
# get_dummies: 데이터의 문자형 변수에 대한 더미변수 생성
df_x_dummy = pd.get_dummies(df_raw_x)
df_raw_y = np.where(df_raw_y == "양품", 0, 1)

In [8]:
# 데이터 분할(train,test 데이터 7:3 비율로 분할)
# train_test_split(x: 설명변수 데이터 )
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_x_dummy,df_raw_y,test_size=0.3,random_state=1234)
print("train data X size:{}".format(df_train_x.shape))
print("train data Y size:{}".format(df_train_y.shape))
print("test data X size:{}".format(df_test_x.shape))
print("test data Y size:{}".format(df_test_y.shape))

train data X size:(2787, 36)
train data Y size:(2787,)
test data X size:(1195, 36)
test data Y size:(1195,)


In [9]:
tree_final = RandomForestClassifier()
tree_final.fit(df_train_x, df_train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
v_feature_name = df_train_x.columns
df_importance = pd.DataFrame()
df_importance["Feature"] = v_feature_name
df_importance["Importance"] = tree_final.feature_importances_

df_importance.sort_values("Importance", ascending=False, inplace = True)
df_importance.round(3)

Unnamed: 0,Feature,Importance
8,MC_2AG_3EXPOSURE_1호기,0.043
1,MC_1BLACK_1PRINT_2호기,0.039
27,MC_6PHOS_3B_1DISPENSER_1호기,0.038
15,MC_4DIELEC_1FIRE_2호기,0.038
21,MC_6PHOS_1R_2DRY_1호기,0.038
33,MC_7PASTE_2호기,0.035
31,MC_6PHOS_4FIRE_3호기,0.034
13,MC_3BUS_2FIRE_2호기,0.034
5,MC_2AG_1PRINT_2호기,0.033
12,MC_3BUS_2FIRE_1호기,0.033


## PROCESS

In [24]:
# drop: X변수와 변수 삭제
df_raw_x = MFG_PROCESS.drop(["JUDGE","PANEL_ID","LOT_ID","PANEL_ID","TIME_1BLACK","TIME_2AG","TIME_3BUS","TIME_4DIELEC","TIME_5MGO","TIME_6PHOS","TIME_7PASTE","TIME_8AGING"], axis=1, inplace=False)
df_raw_y = MFG_PROCESS["JUDGE"]
# get_dummies: 데이터의 문자형 변수에 대한 더미변수 생성
df_x_dummy = pd.get_dummies(df_raw_x)
df_raw_y = np.where(df_raw_y == "양품", 0, 1)

In [25]:
# 데이터 분할(train,test 데이터 7:3 비율로 분할)
# train_test_split(x: 설명변수 데이터 )
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_x_dummy,df_raw_y,test_size=0.3,random_state=1234)
print("train data X size:{}".format(df_train_x.shape))
print("train data Y size:{}".format(df_train_y.shape))
print("test data X size:{}".format(df_test_x.shape))
print("test data Y size:{}".format(df_test_y.shape))

train data X size:(2787, 70)
train data Y size:(2787,)
test data X size:(1195, 70)
test data Y size:(1195,)


In [26]:
tree_final = RandomForestClassifier()
tree_final.fit(df_train_x, df_train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
v_feature_name = df_train_x.columns
df_importance = pd.DataFrame()
df_importance["Feature"] = v_feature_name
df_importance["Importance"] = tree_final.feature_importances_

df_importance.sort_values("Importance", ascending=False, inplace = True)
df_importance.round(3)

Unnamed: 0,Feature,Importance
15,AG_RTD_TEMP_GLASS_OUT,0.293
23,BUS_DEVELOP_TEMP_TANK2,0.073
18,AG_EXPOSURE_TEMP_MASK,0.034
41,DIELEC_FIRE_EXHAUST_HEAT5,0.029
52,PHOS_R_DRY_TEMP_HOOD_MAX,0.029
67,PHOS_FIRE_EXHAUST_HEAT2,0.025
51,PHOS_R_DRY_TEMP_HOOD_START,0.023
11,AG_RTD_TEMP_WALL_START,0.021
68,PHOS_FIRE_EXHAUST_HEAT3,0.021
47,PHOS_R_DRY_TEMP_WALL_MAX,0.02


# TAT

In [15]:
# drop: X변수와 변수 삭제
df_raw_x = MFG_TAT.drop(["JUDGE","PANEL_ID","LOT_ID","PANEL_ID","TIME_1BLACK","TIME_2AG","TIME_3BUS","TIME_4DIELEC","TIME_5MGO","TIME_6PHOS","TIME_7PASTE","TIME_8AGING"], axis=1, inplace=False)
df_raw_y = MFG_TAT["JUDGE"]
# get_dummies: 데이터의 문자형 변수에 대한 더미변수 생성
df_x_dummy = pd.get_dummies(df_raw_x)
df_raw_y = np.where(df_raw_y == "양품", 0, 1)

In [16]:
# 데이터 분할(train,test 데이터 7:3 비율로 분할)
# train_test_split(x: 설명변수 데이터 )
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_x_dummy,df_raw_y,test_size=0.3,random_state=1234)
print("train data X size:{}".format(df_train_x.shape))
print("train data Y size:{}".format(df_train_y.shape))
print("test data X size:{}".format(df_test_x.shape))
print("test data Y size:{}".format(df_test_y.shape))

train data X size:(2787, 16)
train data Y size:(2787,)
test data X size:(1195, 16)
test data Y size:(1195,)


In [17]:
tree_final = RandomForestClassifier()
tree_final.fit(df_train_x, df_train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
v_feature_name = df_train_x.columns
df_importance = pd.DataFrame()
df_importance["Feature"] = v_feature_name
df_importance["Importance"] = tree_final.feature_importances_

df_importance.sort_values("Importance", ascending=False, inplace = True)
df_importance.round(3)

Unnamed: 0,Feature,Importance
7,TAT_4DIELEC_FIRE,0.167
6,TAT_3BUS_2FIRE,0.098
15,TAT_8AGING,0.088
14,TAT_6PHOS_4FIRE,0.085
5,TAT_3BUS_1DEVELOP,0.072
13,TAT_6PHOS_3B_1DISPENSER,0.068
10,TAT_6PHOS_1R_2DRY,0.067
4,TAT_2AG_3EXPOSURE,0.063
2,TAT_2AG_1PRINT,0.048
11,TAT_6PHOS_2G_1DISPENSER,0.042


------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------

# MERGE

In [19]:
# 데이터 구성하기(MFG_MACHINE)
MFG_MERGE = pd.read_csv("/home/pirl/빅데이터 프로젝트/cleansing/merge/MFG_MERGE.csv", engine="python", encoding='EUC-KR')
MFG_MERGE.head()

Unnamed: 0,JUDGE,AG_RTD_TEMP_GLASS_OUT,BUS_DEVELOP_TEMP_TANK2,DIELEC_FIRE_EXHAUST_HEAT2,PHOS_R_DRY_TEMP_WALL_MAX,PHOS_R_DRY_TEMP_HOOD_START,PHOS_FIRE_EXHAUST_HEAT3,BUS_DEVELOP_TEMP_TANK1,AG_EXPOSURE_TEMP_MASK,PHOS_FIRE_EXHAUST_HEAT2,TAT_6PHOS_4FIRE,TAT_3BUS_2FIRE,TAT_4DIELEC_FIRE,TAT_8AGING,TAT_1BLACK_1PRINT,TAT_1BLACK_2RTD,TAT_2AG_2RTD,TAT_6PHOS_1R_1DISPENSER
0,양품,41.3,30.753,2200,167.8,126.0,4432,23.496,23.66,3973.0,7140,6900,6447,10074,86,220,74,76
1,양품,45.6,30.8,2132,167.2,131.0,4499,23.42,23.34,4097.0,6956,6693,6600,10075,86,210,71,80
2,양품,43.6,30.694,2178,168.5,131.0,4460,23.139,23.6,3972.0,6840,6747,6597,10645,78,220,69,80
3,양품,45.1,30.691,2220,166.5,118.0,4466,23.376,23.424,4011.0,6720,7137,6420,10076,86,210,69,78
4,양품,45.7,31.032,2128,168.1,102.0,4482,23.358,23.666,4073.0,7110,7527,6453,10074,86,210,69,76


In [20]:
# drop: X변수와 변수 삭제
df_raw_x = MFG_MERGE.drop(["JUDGE"], axis=1, inplace=False)
df_raw_y = MFG_MERGE["JUDGE"]
# get_dummies: 데이터의 문자형 변수에 대한 더미변수 생성
df_x_dummy = pd.get_dummies(df_raw_x)
df_raw_y = np.where(df_raw_y == "양품", 0, 1)

In [21]:
# 데이터 분할(train,test 데이터 7:3 비율로 분할)
# train_test_split(x: 설명변수 데이터 )
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_x_dummy,df_raw_y,test_size=0.3,random_state=1234)
print("train data X size:{}".format(df_train_x.shape))
print("train data Y size:{}".format(df_train_y.shape))
print("test data X size:{}".format(df_test_x.shape))
print("test data Y size:{}".format(df_test_y.shape))

train data X size:(2787, 17)
train data Y size:(2787,)
test data X size:(1195, 17)
test data Y size:(1195,)


In [22]:
rf_final = RandomForestClassifier()
rf_final.fit(df_train_x, df_train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
v_feature_name = df_train_x.columns
df_importance = pd.DataFrame()
df_importance["Feature"] = v_feature_name
df_importance["Importance"] = rf_final.feature_importances_

df_importance.sort_values("Importance", ascending=False, inplace = True)
df_importance.round(3)

Unnamed: 0,Feature,Importance
0,AG_RTD_TEMP_GLASS_OUT,0.295
1,BUS_DEVELOP_TEMP_TANK2,0.183
3,PHOS_R_DRY_TEMP_WALL_MAX,0.073
11,TAT_4DIELEC_FIRE,0.062
5,PHOS_FIRE_EXHAUST_HEAT3,0.058
6,BUS_DEVELOP_TEMP_TANK1,0.051
7,AG_EXPOSURE_TEMP_MASK,0.049
10,TAT_3BUS_2FIRE,0.037
2,DIELEC_FIRE_EXHAUST_HEAT2,0.035
8,PHOS_FIRE_EXHAUST_HEAT2,0.032
