In [140]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import numpy as np
import math

In [80]:
%matplotlib inline

In [81]:
warnings.filterwarnings("ignore")

In [82]:
path = './data/'

In [83]:
encodings = ['cp949', 'euc-kr', 'utf-8']

def get_df(name):
    for encoding in encodings:
        try:
            df = pd.read_csv(path + f'{name}', encoding=encoding)
            break
        except UnicodeDecodeError:
            continue
    return df

## 데이터 확인

In [84]:
data = get_df('data.csv')

In [85]:
data.head()

Unnamed: 0,TAG_MIN,배정번호,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,...,소입로 CP 값,소입로 CP 모니터 값,소입로 온도 1 Zone,소입로 온도 2 Zone,소입로 온도 3 Zone,소입로 온도 4 Zone,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,솔트조 온도 2 Zone
0,2022-01-03 11:22:07,102410,75.6648,30.0155,,,68.4386,72.8403,59.7862,51.7169,...,0.450421,,,859.854,,,,,,
1,2022-01-03 11:22:08,102410,75.6706,32.2732,,,68.4386,78.4415,61.6286,50.4453,...,0.450356,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,0.450341,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,0.450201,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,0.450235,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808


In [88]:
column_names_1 = {'배정번호': 'AN', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP'}

data.rename(columns=column_names_1, inplace=True)

In [89]:
data.isnull().sum()

TAG_MIN         0
AN              0
DZ1_OP          1
DZ2_OP          1
DZ1_TEMP      116
DZ2_TEMP      148
CLEAN          91
HDZ1_OP      4288
HDZ2_OP         0
HDZ3_OP         2
HDZ4_OP         3
HDZ_CP          1
HDZ_CPM       147
HDZ1_TEMP     130
HDZ2_TEMP     128
HDZ3_TEMP     157
HDZ4_TEMP     170
SCZ1_TEMP     106
SCZ2_TEMP     142
STZ1_TEMP     209
STZ2_TEMP     203
dtype: int64

In [90]:
data.describe()

Unnamed: 0,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
count,2939722.0,2939721.0,2939721.0,2939606.0,2939574.0,2939631.0,2935434.0,2939722.0,2939720.0,2939719.0,2939721.0,2939575.0,2939592.0,2939594.0,2939565.0,2939552.0,2939616.0,2939580.0,2939513.0,2939519.0
mean,128442.2,69.89404,20.44708,100.0061,100.0198,67.71864,75.64373,54.86239,53.86029,71.08925,0.4488618,1.14558e-10,859.2077,860.0021,860.0029,860.0062,283.9963,279.9293,331.8062,332.1773
std,12637.04,4.014802,5.217123,0.4360371,0.3623526,1.630768,25.16083,4.429079,2.664304,2.556959,0.01886477,6.012797e-13,3.647667,0.557848,0.3518205,0.4551778,9.51277,6.611579,0.7827379,0.8732977
min,102410.0,47.2532,0.000118883,97.3421,97.8706,60.6244,0.000850055,8.62001,0.0437045,0.0062442,0.00509637,1.11662e-10,840.298,855.929,858.28,857.992,266.23,266.426,328.161,328.073
25%,119448.0,68.4288,18.9176,99.8144,99.8901,66.5694,64.9627,53.3259,52.3891,69.6781,0.4484415,1.142615e-10,857.949,859.776,859.829,859.843,274.754,273.502,331.867,332.178
50%,129889.0,70.5166,21.2931,100.002,100.019,67.6972,82.2104,55.6654,53.8862,71.0454,0.450062,1.14555e-10,859.575,860.022,860.002,860.0,284.586,280.02,332.017,332.423
75%,139116.0,72.3781,23.3884,100.191,100.161,68.9799,95.3666,57.5733,55.4145,72.4771,0.451707,1.14844e-10,860.258,860.249,860.172,860.158,293.343,286.334,332.141,332.626
max,148069.0,87.2995,47.5395,102.469,101.843,71.4901,100.0,77.2709,66.015,87.3907,0.909111,1.32929e-10,877.228,866.034,870.119,882.148,298.53,291.696,332.717,333.179


In [91]:
quality = pd.read_excel(path+'quality.xlsx')

In [92]:
quality.head()

Unnamed: 0,배정번호,작업일,공정명,설비명,양품수량,불량수량,총수량
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981


In [93]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   배정번호    136 non-null    int64         
 1   작업일     136 non-null    datetime64[ns]
 2   공정명     136 non-null    object        
 3   설비명     136 non-null    object        
 4   양품수량    136 non-null    int64         
 5   불량수량    136 non-null    int64         
 6   총수량     136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [94]:
column_names_2 = {'배정번호': 'AN', '작업일': 'WD', '공정명': 'PN'
                ,'설비명': 'EN', '양품수량': 'GQ', '불량수량': 'BQ'
                ,'총수량': 'TQ'}

quality.rename(columns=column_names_2, inplace=True)

In [95]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   AN      136 non-null    int64         
 1   WD      136 non-null    datetime64[ns]
 2   PN      136 non-null    object        
 3   EN      136 non-null    object        
 4   GQ      136 non-null    int64         
 5   BQ      136 non-null    int64         
 6   TQ      136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [96]:
quality.describe()

Unnamed: 0,AN,GQ,BQ,TQ
count,136.0,136.0,136.0,136.0
mean,128897.191176,45012.301471,15.117647,45027.419118
std,12403.393924,25548.197992,18.549657,25554.54871
min,102410.0,8412.0,0.0,8414.0
25%,120467.75,29736.0,4.0,29755.0
50%,130199.0,44003.0,8.5,44020.5
75%,138982.5,60120.75,17.0,60135.5
max,148069.0,104740.0,120.0,104761.0


In [97]:
quality.isnull().sum()

AN    0
WD    0
PN    0
EN    0
GQ    0
BQ    0
TQ    0
dtype: int64

In [98]:
train = get_df('train.csv')

In [99]:
train.head()

Unnamed: 0.1,Unnamed: 0,건조 1존 OP_Avg,건조 1존 OP_Std,건조 2존 OP_Avg,건조 2존 OP_Std,건조로 온도 1 Zone_Avg,건조로 온도 1 Zone_Std,건조로 온도 2 Zone_Avg,건조로 온도 2 Zone_Std,세정기_Avg,...,소입로 온도 4 Zone_Std,솔트 컨베이어 온도 1 Zone_Avg,솔트 컨베이어 온도 1 Zone_Std,솔트 컨베이어 온도 2 Zone_Avg,솔트 컨베이어 온도 2 Zone_Std,솔트조 온도 1 Zone_Avg,솔트조 온도 1 Zone_Std,솔트조 온도 2 Zone_Avg,솔트조 온도 2 Zone_Std,불량단계
0,97,69.497726,3.274577,20.310463,3.490991,99.999143,0.435237,100.001123,0.401438,67.864965,...,0.304168,284.699659,9.60185,280.411936,6.940009,332.111266,0.152253,332.712474,0.153026,안정
1,125,68.7767,3.548587,16.547672,4.161717,100.07776,0.394062,100.107134,0.291589,69.61422,...,0.3037,285.00715,9.239152,280.646734,6.650701,332.123215,0.225985,332.093658,0.329912,위험
2,11,73.502913,2.645737,21.218347,2.218216,100.006615,0.387797,99.992686,0.281373,66.220995,...,0.40205,283.120448,9.426413,279.110908,6.064772,332.182887,0.099322,332.407261,0.10744,안정
3,129,68.062513,3.439085,4.366498,3.727635,100.040387,0.418439,100.139576,0.334492,69.242707,...,0.360325,285.074759,9.475964,280.790056,6.699134,332.277923,0.121808,332.261568,0.184196,안정
4,48,68.820299,3.946638,19.902113,3.765778,100.07193,0.367971,100.097453,0.291788,65.512487,...,0.211795,284.166005,9.847216,279.587268,6.606966,331.943223,0.189118,332.503069,0.359428,안정


In [100]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             108 non-null    int64  
 1   건조 1존 OP_Avg           108 non-null    float64
 2   건조 1존 OP_Std           108 non-null    float64
 3   건조 2존 OP_Avg           108 non-null    float64
 4   건조 2존 OP_Std           108 non-null    float64
 5   건조로 온도 1 Zone_Avg      108 non-null    float64
 6   건조로 온도 1 Zone_Std      108 non-null    float64
 7   건조로 온도 2 Zone_Avg      108 non-null    float64
 8   건조로 온도 2 Zone_Std      108 non-null    float64
 9   세정기_Avg                108 non-null    float64
 10  세정기_Std                108 non-null    float64
 11  소입1존 OP_Avg            108 non-null    float64
 12  소입1존 OP_Std            108 non-null    float64
 13  소입2존 OP_Avg            108 non-null    float64
 14  소입2존 OP_Std            108 non-null    float64
 15  소입3존 O

In [101]:
column_names_3 = {'Unnamed: 0' : 'UNNAMED', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값 ': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP', '불량단계' : 'FS'}

column_names_combined = {}
for key, value in column_names_3.items():
    column_names_combined[key] = value
    column_names_combined[f'{key}_Avg'] = f'{value}_Avg'
    column_names_combined[f'{key}_Std'] = f'{value}_Std'
    
train.rename(columns=column_names_combined, inplace=True)

In [102]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UNNAMED        108 non-null    int64  
 1   DZ1_OP_Avg     108 non-null    float64
 2   DZ1_OP_Std     108 non-null    float64
 3   DZ2_OP_Avg     108 non-null    float64
 4   DZ2_OP_Std     108 non-null    float64
 5   DZ1_TEMP_Avg   108 non-null    float64
 6   DZ1_TEMP_Std   108 non-null    float64
 7   DZ2_TEMP_Avg   108 non-null    float64
 8   DZ2_TEMP_Std   108 non-null    float64
 9   CLEAN_Avg      108 non-null    float64
 10  CLEAN_Std      108 non-null    float64
 11  HDZ1_OP_Avg    108 non-null    float64
 12  HDZ1_OP_Std    108 non-null    float64
 13  HDZ2_OP_Avg    108 non-null    float64
 14  HDZ2_OP_Std    108 non-null    float64
 15  HDZ3_OP_Avg    108 non-null    float64
 16  HDZ3_OP_Std    108 non-null    float64
 17  HDZ4_OP_Avg    108 non-null    float64
 18  HDZ4_OP_St

## 데이터 전처리

### 날짜 변환

In [103]:
data['TAG_MIN'] = pd.to_datetime(data['TAG_MIN'])

In [104]:
data['Year'] = data['TAG_MIN'].dt.year
data['Month'] = data['TAG_MIN'].dt.month
data['Day'] = data['TAG_MIN'].dt.day
data['Hour'] = data['TAG_MIN'].dt.hour
data['Minute'] = data['TAG_MIN'].dt.minute
data['Second'] = data['TAG_MIN'].dt.second

In [108]:
data['Day']

0           3
1           3
2           3
3           3
4           3
           ..
2939717    19
2939718    19
2939719    19
2939720    19
2939721    19
Name: Day, Length: 2939722, dtype: int64

### 결측치 처리

In [51]:
# from impyute.imputation.cs import mice

# np_imputed=mice(data.drop('TAG_MIN', axis=1).values)
# data_imputed=pd.DataFrame(np_imputed)
# data_imputed

## 데이터 시각화

## 데이터 병합, 종속변수 생성, 차원 축소

### 데이터 병합

In [59]:
columns_to_exclude = ['AN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']  
included_columns = [col for col in data.columns if col not in columns_to_exclude]

data_stat = data.groupby('AN')[included_columns].agg(['mean', 'std'])
data_stat

Unnamed: 0_level_0,DZ1_OP,DZ1_OP,DZ2_OP,DZ2_OP,DZ1_TEMP,DZ1_TEMP,DZ2_TEMP,DZ2_TEMP,CLEAN,CLEAN,...,HDZ4_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ1_TEMP,SCZ2_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ1_TEMP,STZ2_TEMP,STZ2_TEMP
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
AN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
102410,72.252727,3.696537,21.354464,4.348928,99.943460,0.593971,100.061896,0.483518,69.602560,0.845406,...,860.010592,0.553463,282.581303,9.371745,280.149012,6.034272,329.015940,0.122747,329.070466,0.116526
102585,72.235643,3.365000,18.602563,2.859741,99.987431,0.515429,100.065032,0.356142,69.591183,1.064200,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
102930,70.720207,3.231776,20.911928,2.582097,99.995592,0.472736,100.021642,0.343075,69.529495,1.097923,...,860.007487,0.418431,283.330828,9.680618,279.308958,6.665152,329.133511,0.120112,329.148656,0.115569
103142,72.424229,2.635245,22.250186,2.402781,100.005056,0.331403,100.009675,0.251791,69.536974,1.064265,...,860.003481,0.296754,282.882341,9.495495,279.241057,6.537643,329.081883,0.097665,329.073103,0.100489
103675,72.774648,4.159221,21.865151,3.622806,99.983502,0.655347,100.043710,0.470749,69.320977,0.991705,...,860.007201,0.571261,283.581648,9.705562,277.544769,5.365901,329.010418,0.089961,329.114051,0.078853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147292,69.486127,3.123360,4.376536,2.514362,100.019260,0.381383,100.079487,0.273451,69.639903,0.764689,...,859.994792,0.328228,284.629741,9.223762,280.419514,6.696554,332.184151,0.110639,332.210106,0.129191
147546,69.718808,2.662344,17.836625,3.098225,100.024208,0.314559,100.005331,0.259923,70.128396,0.499345,...,859.998661,0.206363,284.236529,9.317693,279.987625,6.612389,332.155767,0.173019,332.162465,0.260722
147982,69.799029,3.164459,17.913929,3.446134,100.028487,0.347958,100.026178,0.323294,69.695840,1.139830,...,860.002395,0.313158,284.190848,9.223516,279.977802,6.538111,332.209281,0.096231,332.125705,0.118020
147996,69.991809,3.564122,16.868628,5.317683,99.990732,0.381893,100.002368,0.378060,69.460694,1.019985,...,859.991821,0.343028,284.781555,9.271321,280.511153,6.783697,332.150655,0.158543,332.088618,0.237818


In [62]:
chg_name = {'mean': '_Avg', 'std': '_Std'}
data_stat.columns = list(map(lambda x: x[0] + chg_name[x[1]], data_stat.columns))
data_stat.reset_index(drop=False, inplace=True)

In [63]:
data_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AN             136 non-null    int64  
 1   DZ1_OP_Avg     136 non-null    float64
 2   DZ1_OP_Std     136 non-null    float64
 3   DZ2_OP_Avg     136 non-null    float64
 4   DZ2_OP_Std     136 non-null    float64
 5   DZ1_TEMP_Avg   136 non-null    float64
 6   DZ1_TEMP_Std   136 non-null    float64
 7   DZ2_TEMP_Avg   136 non-null    float64
 8   DZ2_TEMP_Std   136 non-null    float64
 9   CLEAN_Avg      136 non-null    float64
 10  CLEAN_Std      136 non-null    float64
 11  HDZ1_OP_Avg    136 non-null    float64
 12  HDZ1_OP_Std    136 non-null    float64
 13  HDZ2_OP_Avg    136 non-null    float64
 14  HDZ2_OP_Std    136 non-null    float64
 15  HDZ3_OP_Avg    136 non-null    float64
 16  HDZ3_OP_Std    136 non-null    float64
 17  HDZ4_OP_Avg    136 non-null    float64
 18  HDZ4_OP_St

In [64]:
df_total = pd.merge(quality, data_stat, on='AN', how='left')
df_total.head()

Unnamed: 0,AN,WD,PN,EN,GQ,BQ,TQ,DZ1_OP_Avg,DZ1_OP_Std,DZ2_OP_Avg,...,HDZ4_TEMP_Avg,HDZ4_TEMP_Std,SCZ1_TEMP_Avg,SCZ1_TEMP_Std,SCZ2_TEMP_Avg,SCZ2_TEMP_Std,STZ1_TEMP_Avg,STZ1_TEMP_Std,STZ2_TEMP_Avg,STZ2_TEMP_Std
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163,72.252727,3.696537,21.354464,...,860.010592,0.553463,282.581303,9.371745,280.149012,6.034272,329.01594,0.122747,329.070466,0.116526
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902,72.235643,3.365,18.602563,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646,70.720207,3.231776,20.911928,...,860.007487,0.418431,283.330828,9.680618,279.308958,6.665152,329.133511,0.120112,329.148656,0.115569
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743,72.424229,2.635245,22.250186,...,860.003481,0.296754,282.882341,9.495495,279.241057,6.537643,329.081883,0.097665,329.073103,0.100489
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981,72.774648,4.159221,21.865151,...,860.007201,0.571261,283.581648,9.705562,277.544769,5.365901,329.010418,0.089961,329.114051,0.078853


In [65]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 45 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   AN             136 non-null    int64         
 1   WD             136 non-null    datetime64[ns]
 2   PN             136 non-null    object        
 3   EN             136 non-null    object        
 4   GQ             136 non-null    int64         
 5   BQ             136 non-null    int64         
 6   TQ             136 non-null    int64         
 7   DZ1_OP_Avg     136 non-null    float64       
 8   DZ1_OP_Std     136 non-null    float64       
 9   DZ2_OP_Avg     136 non-null    float64       
 10  DZ2_OP_Std     136 non-null    float64       
 11  DZ1_TEMP_Avg   136 non-null    float64       
 12  DZ1_TEMP_Std   136 non-null    float64       
 13  DZ2_TEMP_Avg   136 non-null    float64       
 14  DZ2_TEMP_Std   136 non-null    float64       
 15  CLEAN_Avg      136 non-

### 종속변수 생성

In [66]:
# 종속변수 생성을 위한 파생변수 생성
df_total["FR"] = round(df_total["BQ"] / df_total["TQ"] *100, 3)

In [136]:
df_total.groupby('AN')['FR'].mean()

AN
102410    0.020
102585    0.033
102930    0.050
103142    0.017
103675    0.013
          ...  
147292    0.027
147546    0.027
147982    0.029
147996    0.030
148069    0.000
Name: FR, Length: 136, dtype: float64

In [146]:
# 6시그마 기준으로 하여 불량여부 판단

def calculate_lower_bound(row):
    return row['FR'] - 3 * math.sqrt((row['FR'] * (1 - row['FR'])) / row['TQ'])

df_total['lower_bound'] = df_total.apply(calculate_lower_bound, axis=1)

def set_fr_level(row):
    if row['FR'] > row['lower_bound']:
        return 'dangerous'
    else:
        return 'stable'

df_total['FR_Level'] = df_total.apply(set_fr_level, axis=1)
df_total = df_total.drop('lower_bound', axis=1)                      

In [147]:
df_total['FR_Level'].value_counts()

dangerous    133
stable         3
Name: FR_Level, dtype: int64

In [115]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 47 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   AN             136 non-null    int64         
 1   WD             136 non-null    datetime64[ns]
 2   PN             136 non-null    object        
 3   EN             136 non-null    object        
 4   GQ             136 non-null    int64         
 5   BQ             136 non-null    int64         
 6   TQ             136 non-null    int64         
 7   DZ1_OP_Avg     136 non-null    float64       
 8   DZ1_OP_Std     136 non-null    float64       
 9   DZ2_OP_Avg     136 non-null    float64       
 10  DZ2_OP_Std     136 non-null    float64       
 11  DZ1_TEMP_Avg   136 non-null    float64       
 12  DZ1_TEMP_Std   136 non-null    float64       
 13  DZ2_TEMP_Avg   136 non-null    float64       
 14  DZ2_TEMP_Std   136 non-null    float64       
 15  CLEAN_Avg      136 non-

### 차원축소

#### 공정별 불량품과의 PCC

In [121]:
df = data.drop(['TAG_MIN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'] , axis=1)

In [122]:
for col in df.columns.drop('AN'):
    # Calculate the first and third quartiles for the 'Dry 1-zone OP' column
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)

    # Calculate the IQR for the 'Dry 1-zone OP' column
    iqr = 1.5 * (q3 - q1)

    # Create a boolean mask for outliers in the 'Dry 1-zone OP' column
    outlier_mask = (df[col] < (q1 - iqr)) | (df[col] > (q3 + iqr))

    # Create a new column 'Outlier' to label rows with 1 for outliers and 0 for non-outliers
    df[col] = np.where(outlier_mask, 1, 0)

In [123]:
merged_df = quality.merge(df.groupby('AN').sum(), how='inner', on='AN')

In [125]:
merged_df

Unnamed: 0,AN,WD,PN,EN,GQ,BQ,TQ,DZ1_OP,DZ2_OP,DZ1_TEMP,...,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163,550,708,1082,...,763,262,964,271,262,1085,0,0,7341,7341
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902,564,501,1591,...,441,251,1390,567,253,1934,0,0,13299,13299
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646,963,399,2629,...,3041,667,2914,1032,659,3029,0,0,26981,26981
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743,566,369,1076,...,977,42,819,0,41,1140,0,0,31727,31728
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981,556,301,870,...,361,332,878,0,332,976,0,0,6217,6217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,147292,2022-07-14,열처리,열처리 염욕_1,43765,12,43777,833,20149,1149,...,1764,36,1129,527,32,1244,0,0,0,0
132,147546,2022-07-15,열처리,열처리 염욕_1,59957,16,59973,665,1384,795,...,829,291,773,365,290,244,0,0,11,2073
133,147982,2022-07-18,열처리,열처리 염욕_1,40981,12,40993,862,1117,861,...,990,555,1348,1040,560,1178,0,0,0,0
134,147996,2022-07-18,열처리,열처리 염욕_1,30239,9,30248,931,2281,821,...,1015,643,1313,882,651,1135,0,0,0,404


In [124]:
merged_df.corr()['BQ'].sort_values(ascending=False)

BQ           1.000000
HDZ1_TEMP    0.392567
HDZ1_OP      0.354210
TQ           0.342684
GQ           0.342043
HDZ2_TEMP    0.329278
DZ1_TEMP     0.293382
DZ2_TEMP     0.285482
HDZ4_TEMP    0.269034
HDZ3_TEMP    0.245543
HDZ_CPM      0.244564
HDZ_CP       0.241514
HDZ4_OP      0.056438
AN           0.054524
HDZ3_OP     -0.019865
STZ1_TEMP   -0.040367
DZ2_OP      -0.045525
CLEAN       -0.059197
DZ1_OP      -0.066733
STZ2_TEMP   -0.067383
HDZ2_OP     -0.071493
SCZ1_TEMP         NaN
SCZ2_TEMP         NaN
Name: BQ, dtype: float64