In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import numpy as np
import math

In [2]:
%matplotlib inline

In [3]:
warnings.filterwarnings("ignore")

In [4]:
path = './data/'

In [5]:
encodings = ['cp949', 'euc-kr', 'utf-8']

def get_df(name):
    for encoding in encodings:
        try:
            df = pd.read_csv(path + f'{name}', encoding=encoding)
            break
        except UnicodeDecodeError:
            continue
    return df

## 데이터 확인

In [6]:
data = get_df('data.csv')

In [7]:
data.head()

Unnamed: 0,TAG_MIN,배정번호,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,...,소입로 CP 값,소입로 CP 모니터 값,소입로 온도 1 Zone,소입로 온도 2 Zone,소입로 온도 3 Zone,소입로 온도 4 Zone,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,솔트조 온도 2 Zone
0,2022-01-03 11:22:07,102410,75.6648,30.0155,,,68.4386,72.8403,59.7862,51.7169,...,0.450421,,,859.854,,,,,,
1,2022-01-03 11:22:08,102410,75.6706,32.2732,,,68.4386,78.4415,61.6286,50.4453,...,0.450356,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,0.450341,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,0.450201,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,0.450235,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808


In [8]:
column_names_1 = {'배정번호': 'AN', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP'}

data.rename(columns=column_names_1, inplace=True)

In [9]:
data.describe()

Unnamed: 0,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
count,2939722.0,2939721.0,2939721.0,2939606.0,2939574.0,2939631.0,2935434.0,2939722.0,2939720.0,2939719.0,2939721.0,2939575.0,2939592.0,2939594.0,2939565.0,2939552.0,2939616.0,2939580.0,2939513.0,2939519.0
mean,128442.2,69.89404,20.44708,100.0061,100.0198,67.71864,75.64373,54.86239,53.86029,71.08925,0.4488618,1.14558e-10,859.2077,860.0021,860.0029,860.0062,283.9963,279.9293,331.8062,332.1773
std,12637.04,4.014802,5.217123,0.4360371,0.3623526,1.630768,25.16083,4.429079,2.664304,2.556959,0.01886477,6.012797e-13,3.647667,0.557848,0.3518205,0.4551778,9.51277,6.611579,0.7827379,0.8732977
min,102410.0,47.2532,0.000118883,97.3421,97.8706,60.6244,0.000850055,8.62001,0.0437045,0.0062442,0.00509637,1.11662e-10,840.298,855.929,858.28,857.992,266.23,266.426,328.161,328.073
25%,119448.0,68.4288,18.9176,99.8144,99.8901,66.5694,64.9627,53.3259,52.3891,69.6781,0.4484415,1.142615e-10,857.949,859.776,859.829,859.843,274.754,273.502,331.867,332.178
50%,129889.0,70.5166,21.2931,100.002,100.019,67.6972,82.2104,55.6654,53.8862,71.0454,0.450062,1.14555e-10,859.575,860.022,860.002,860.0,284.586,280.02,332.017,332.423
75%,139116.0,72.3781,23.3884,100.191,100.161,68.9799,95.3666,57.5733,55.4145,72.4771,0.451707,1.14844e-10,860.258,860.249,860.172,860.158,293.343,286.334,332.141,332.626
max,148069.0,87.2995,47.5395,102.469,101.843,71.4901,100.0,77.2709,66.015,87.3907,0.909111,1.32929e-10,877.228,866.034,870.119,882.148,298.53,291.696,332.717,333.179


In [10]:
quality = pd.read_excel(path+'quality.xlsx')

In [11]:
quality.head()

Unnamed: 0,배정번호,작업일,공정명,설비명,양품수량,불량수량,총수량
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981


In [12]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   배정번호    136 non-null    int64         
 1   작업일     136 non-null    datetime64[ns]
 2   공정명     136 non-null    object        
 3   설비명     136 non-null    object        
 4   양품수량    136 non-null    int64         
 5   불량수량    136 non-null    int64         
 6   총수량     136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [13]:
column_names_2 = {'배정번호': 'AN', '작업일': 'WD', '공정명': 'PN'
                ,'설비명': 'EN', '양품수량': 'GQ', '불량수량': 'BQ'
                ,'총수량': 'TQ'}

quality.rename(columns=column_names_2, inplace=True)

In [14]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   AN      136 non-null    int64         
 1   WD      136 non-null    datetime64[ns]
 2   PN      136 non-null    object        
 3   EN      136 non-null    object        
 4   GQ      136 non-null    int64         
 5   BQ      136 non-null    int64         
 6   TQ      136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [15]:
quality.describe()

Unnamed: 0,AN,GQ,BQ,TQ
count,136.0,136.0,136.0,136.0
mean,128897.191176,45012.301471,15.117647,45027.419118
std,12403.393924,25548.197992,18.549657,25554.54871
min,102410.0,8412.0,0.0,8414.0
25%,120467.75,29736.0,4.0,29755.0
50%,130199.0,44003.0,8.5,44020.5
75%,138982.5,60120.75,17.0,60135.5
max,148069.0,104740.0,120.0,104761.0


In [16]:
quality.isnull().sum()

AN    0
WD    0
PN    0
EN    0
GQ    0
BQ    0
TQ    0
dtype: int64

In [17]:
train = get_df('train.csv')

In [18]:
train.head()

Unnamed: 0.1,Unnamed: 0,건조 1존 OP_Avg,건조 1존 OP_Std,건조 2존 OP_Avg,건조 2존 OP_Std,건조로 온도 1 Zone_Avg,건조로 온도 1 Zone_Std,건조로 온도 2 Zone_Avg,건조로 온도 2 Zone_Std,세정기_Avg,...,소입로 온도 4 Zone_Std,솔트 컨베이어 온도 1 Zone_Avg,솔트 컨베이어 온도 1 Zone_Std,솔트 컨베이어 온도 2 Zone_Avg,솔트 컨베이어 온도 2 Zone_Std,솔트조 온도 1 Zone_Avg,솔트조 온도 1 Zone_Std,솔트조 온도 2 Zone_Avg,솔트조 온도 2 Zone_Std,불량단계
0,97,69.497726,3.274577,20.310463,3.490991,99.999143,0.435237,100.001123,0.401438,67.864965,...,0.304168,284.699659,9.60185,280.411936,6.940009,332.111266,0.152253,332.712474,0.153026,안정
1,125,68.7767,3.548587,16.547672,4.161717,100.07776,0.394062,100.107134,0.291589,69.61422,...,0.3037,285.00715,9.239152,280.646734,6.650701,332.123215,0.225985,332.093658,0.329912,위험
2,11,73.502913,2.645737,21.218347,2.218216,100.006615,0.387797,99.992686,0.281373,66.220995,...,0.40205,283.120448,9.426413,279.110908,6.064772,332.182887,0.099322,332.407261,0.10744,안정
3,129,68.062513,3.439085,4.366498,3.727635,100.040387,0.418439,100.139576,0.334492,69.242707,...,0.360325,285.074759,9.475964,280.790056,6.699134,332.277923,0.121808,332.261568,0.184196,안정
4,48,68.820299,3.946638,19.902113,3.765778,100.07193,0.367971,100.097453,0.291788,65.512487,...,0.211795,284.166005,9.847216,279.587268,6.606966,331.943223,0.189118,332.503069,0.359428,안정


In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             108 non-null    int64  
 1   건조 1존 OP_Avg           108 non-null    float64
 2   건조 1존 OP_Std           108 non-null    float64
 3   건조 2존 OP_Avg           108 non-null    float64
 4   건조 2존 OP_Std           108 non-null    float64
 5   건조로 온도 1 Zone_Avg      108 non-null    float64
 6   건조로 온도 1 Zone_Std      108 non-null    float64
 7   건조로 온도 2 Zone_Avg      108 non-null    float64
 8   건조로 온도 2 Zone_Std      108 non-null    float64
 9   세정기_Avg                108 non-null    float64
 10  세정기_Std                108 non-null    float64
 11  소입1존 OP_Avg            108 non-null    float64
 12  소입1존 OP_Std            108 non-null    float64
 13  소입2존 OP_Avg            108 non-null    float64
 14  소입2존 OP_Std            108 non-null    float64
 15  소입3존 O

In [20]:
column_names_3 = {'Unnamed: 0' : 'UNNAMED', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값 ': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP', '불량단계' : 'FS'}

column_names_combined = {}
for key, value in column_names_3.items():
    column_names_combined[key] = value
    column_names_combined[f'{key}_Avg'] = f'{value}_Avg'
    column_names_combined[f'{key}_Std'] = f'{value}_Std'
    
train.rename(columns=column_names_combined, inplace=True)

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UNNAMED        108 non-null    int64  
 1   DZ1_OP_Avg     108 non-null    float64
 2   DZ1_OP_Std     108 non-null    float64
 3   DZ2_OP_Avg     108 non-null    float64
 4   DZ2_OP_Std     108 non-null    float64
 5   DZ1_TEMP_Avg   108 non-null    float64
 6   DZ1_TEMP_Std   108 non-null    float64
 7   DZ2_TEMP_Avg   108 non-null    float64
 8   DZ2_TEMP_Std   108 non-null    float64
 9   CLEAN_Avg      108 non-null    float64
 10  CLEAN_Std      108 non-null    float64
 11  HDZ1_OP_Avg    108 non-null    float64
 12  HDZ1_OP_Std    108 non-null    float64
 13  HDZ2_OP_Avg    108 non-null    float64
 14  HDZ2_OP_Std    108 non-null    float64
 15  HDZ3_OP_Avg    108 non-null    float64
 16  HDZ3_OP_Std    108 non-null    float64
 17  HDZ4_OP_Avg    108 non-null    float64
 18  HDZ4_OP_St

## 데이터 전처리

### 날짜 변환

In [22]:
data['TAG_MIN'] = pd.to_datetime(data['TAG_MIN'])

In [23]:
data['Year'] = data['TAG_MIN'].dt.year
data['Month'] = data['TAG_MIN'].dt.month
data['Day'] = data['TAG_MIN'].dt.day
data['Hour'] = data['TAG_MIN'].dt.hour
data['Minute'] = data['TAG_MIN'].dt.minute
data['Second'] = data['TAG_MIN'].dt.second

In [24]:
data['Day']

0           3
1           3
2           3
3           3
4           3
           ..
2939717    19
2939718    19
2939719    19
2939720    19
2939721    19
Name: Day, Length: 2939722, dtype: int64

### 결측치 처리

In [25]:
data.isnull().sum()

TAG_MIN         0
AN              0
DZ1_OP          1
DZ2_OP          1
DZ1_TEMP      116
DZ2_TEMP      148
CLEAN          91
HDZ1_OP      4288
HDZ2_OP         0
HDZ3_OP         2
HDZ4_OP         3
HDZ_CP          1
HDZ_CPM       147
HDZ1_TEMP     130
HDZ2_TEMP     128
HDZ3_TEMP     157
HDZ4_TEMP     170
SCZ1_TEMP     106
SCZ2_TEMP     142
STZ1_TEMP     209
STZ2_TEMP     203
Year            0
Month           0
Day             0
Hour            0
Minute          0
Second          0
dtype: int64

In [27]:
data = data.fillna(data.median())

## 데이터 시각화

## 데이터 병합, 종속변수 생성, 차원 축소

### 데이터 병합

In [28]:
columns_to_exclude = ['AN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']  
included_columns = [col for col in data.columns if col not in columns_to_exclude]

data_stat = data.groupby('AN')[included_columns].agg(['mean', 'std'])
data_stat

Unnamed: 0_level_0,DZ1_OP,DZ1_OP,DZ2_OP,DZ2_OP,DZ1_TEMP,DZ1_TEMP,DZ2_TEMP,DZ2_TEMP,CLEAN,CLEAN,...,HDZ4_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ1_TEMP,SCZ2_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ1_TEMP,STZ2_TEMP,STZ2_TEMP
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
AN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
102410,72.252727,3.696537,21.354464,4.348928,99.943476,0.593891,100.061884,0.483452,69.602560,0.845406,...,860.010591,0.553425,282.581576,9.371136,280.148995,6.033861,329.016349,0.127638,329.070923,0.122912
102585,72.235643,3.365000,18.602563,2.859741,99.987431,0.515429,100.065032,0.356142,69.591183,1.064200,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
102930,70.720207,3.231776,20.911928,2.582097,99.995592,0.472719,100.021641,0.343024,69.529495,1.097923,...,860.007487,0.418431,283.330874,9.680441,279.308958,6.665152,329.133618,0.121386,329.148777,0.117273
103142,72.424229,2.635245,22.250186,2.402781,100.005055,0.331393,100.009675,0.251783,69.536859,1.064331,...,860.003481,0.296744,282.882341,9.495495,279.241106,6.537439,329.082068,0.100404,329.073209,0.102232
103675,72.774648,4.159221,21.865151,3.622806,99.983502,0.655347,100.043710,0.470749,69.320977,0.991705,...,860.007199,0.571169,283.581648,9.705562,277.544769,5.365901,329.010901,0.097701,329.114583,0.089317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147292,69.486127,3.123360,4.376536,2.514362,100.019260,0.381383,100.079487,0.273451,69.639903,0.764689,...,859.994794,0.328179,284.629721,9.221702,280.419455,6.696057,332.184151,0.110639,332.210159,0.129219
147546,69.718808,2.662344,17.836625,3.098225,100.024207,0.314547,100.005331,0.259923,70.128396,0.499345,...,859.998661,0.206355,284.236555,9.317357,279.987627,6.612269,332.155757,0.173016,332.162484,0.260722
147982,69.799029,3.164459,17.913929,3.446134,100.028487,0.347958,100.026178,0.323286,69.695840,1.139830,...,860.002395,0.313149,284.190848,9.223516,279.977804,6.537935,332.209271,0.096239,332.125721,0.118037
147996,69.991809,3.564122,16.868628,5.317683,99.990732,0.381893,100.002368,0.378060,69.460694,1.019985,...,859.991821,0.343028,284.781531,9.270745,280.511153,6.783697,332.150655,0.158543,332.088618,0.237818


In [29]:
chg_name = {'mean': '_Avg', 'std': '_Std'}
data_stat.columns = list(map(lambda x: x[0] + chg_name[x[1]], data_stat.columns))
data_stat.reset_index(drop=False, inplace=True)

In [30]:
data_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AN             136 non-null    int64  
 1   DZ1_OP_Avg     136 non-null    float64
 2   DZ1_OP_Std     136 non-null    float64
 3   DZ2_OP_Avg     136 non-null    float64
 4   DZ2_OP_Std     136 non-null    float64
 5   DZ1_TEMP_Avg   136 non-null    float64
 6   DZ1_TEMP_Std   136 non-null    float64
 7   DZ2_TEMP_Avg   136 non-null    float64
 8   DZ2_TEMP_Std   136 non-null    float64
 9   CLEAN_Avg      136 non-null    float64
 10  CLEAN_Std      136 non-null    float64
 11  HDZ1_OP_Avg    136 non-null    float64
 12  HDZ1_OP_Std    136 non-null    float64
 13  HDZ2_OP_Avg    136 non-null    float64
 14  HDZ2_OP_Std    136 non-null    float64
 15  HDZ3_OP_Avg    136 non-null    float64
 16  HDZ3_OP_Std    136 non-null    float64
 17  HDZ4_OP_Avg    136 non-null    float64
 18  HDZ4_OP_St

In [31]:
df_total = pd.merge(quality, data_stat, on='AN', how='left')
df_total.head()

Unnamed: 0,AN,WD,PN,EN,GQ,BQ,TQ,DZ1_OP_Avg,DZ1_OP_Std,DZ2_OP_Avg,...,HDZ4_TEMP_Avg,HDZ4_TEMP_Std,SCZ1_TEMP_Avg,SCZ1_TEMP_Std,SCZ2_TEMP_Avg,SCZ2_TEMP_Std,STZ1_TEMP_Avg,STZ1_TEMP_Std,STZ2_TEMP_Avg,STZ2_TEMP_Std
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163,72.252727,3.696537,21.354464,...,860.010591,0.553425,282.581576,9.371136,280.148995,6.033861,329.016349,0.127638,329.070923,0.122912
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902,72.235643,3.365,18.602563,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646,70.720207,3.231776,20.911928,...,860.007487,0.418431,283.330874,9.680441,279.308958,6.665152,329.133618,0.121386,329.148777,0.117273
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743,72.424229,2.635245,22.250186,...,860.003481,0.296744,282.882341,9.495495,279.241106,6.537439,329.082068,0.100404,329.073209,0.102232
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981,72.774648,4.159221,21.865151,...,860.007199,0.571169,283.581648,9.705562,277.544769,5.365901,329.010901,0.097701,329.114583,0.089317


In [32]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 45 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   AN             136 non-null    int64         
 1   WD             136 non-null    datetime64[ns]
 2   PN             136 non-null    object        
 3   EN             136 non-null    object        
 4   GQ             136 non-null    int64         
 5   BQ             136 non-null    int64         
 6   TQ             136 non-null    int64         
 7   DZ1_OP_Avg     136 non-null    float64       
 8   DZ1_OP_Std     136 non-null    float64       
 9   DZ2_OP_Avg     136 non-null    float64       
 10  DZ2_OP_Std     136 non-null    float64       
 11  DZ1_TEMP_Avg   136 non-null    float64       
 12  DZ1_TEMP_Std   136 non-null    float64       
 13  DZ2_TEMP_Avg   136 non-null    float64       
 14  DZ2_TEMP_Std   136 non-null    float64       
 15  CLEAN_Avg      136 non-

### 종속변수 생성

In [33]:
# 종속변수 생성을 위한 파생변수 생성
df_total["FR"] = round(df_total["BQ"] / df_total["TQ"] *100, 3)

In [34]:
df_total.groupby('AN')['FR'].mean()

AN
102410    0.020
102585    0.033
102930    0.050
103142    0.017
103675    0.013
          ...  
147292    0.027
147546    0.027
147982    0.029
147996    0.030
148069    0.000
Name: FR, Length: 136, dtype: float64

In [35]:
# 6시그마 기준으로 하여 불량여부 판단

def calculate_upper_bound(row):
    return row['FR'] + 3 * math.sqrt((row['FR'] * (1 - row['FR'])) / row['TQ'])

df_total['upper_bound'] = df_total.apply(calculate_upper_bound, axis=1)

def set_fr_level(row):
    if row['FR'] > row['upper_bound']:
        return 'dangerous'
    else:
        return 'stable'

df_total['FR_Level'] = df_total.apply(set_fr_level, axis=1)
df_total = df_total.drop('upper_bound', axis=1)                      

In [36]:
df_total['FR_Level'].value_counts()

stable    136
Name: FR_Level, dtype: int64

In [37]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 47 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   AN             136 non-null    int64         
 1   WD             136 non-null    datetime64[ns]
 2   PN             136 non-null    object        
 3   EN             136 non-null    object        
 4   GQ             136 non-null    int64         
 5   BQ             136 non-null    int64         
 6   TQ             136 non-null    int64         
 7   DZ1_OP_Avg     136 non-null    float64       
 8   DZ1_OP_Std     136 non-null    float64       
 9   DZ2_OP_Avg     136 non-null    float64       
 10  DZ2_OP_Std     136 non-null    float64       
 11  DZ1_TEMP_Avg   136 non-null    float64       
 12  DZ1_TEMP_Std   136 non-null    float64       
 13  DZ2_TEMP_Avg   136 non-null    float64       
 14  DZ2_TEMP_Std   136 non-null    float64       
 15  CLEAN_Avg      136 non-

### 차원축소

#### 공정별 불량품과의 PCC

In [40]:
df = data.drop(['TAG_MIN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'] , axis=1)

In [41]:
for col in df.columns.drop('AN'):
    # Calculate the first and third quartiles for the 'Dry 1-zone OP' column
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)

    # Calculate the IQR for the 'Dry 1-zone OP' column
    iqr = 1.5 * (q3 - q1)

    # Create a boolean mask for outliers in the 'Dry 1-zone OP' column
    outlier_mask = (df[col] < (q1 - iqr)) | (df[col] > (q3 + iqr))

    # Create a new column 'Outlier' to label rows with 1 for outliers and 0 for non-outliers
    df[col] = np.where(outlier_mask, 1, 0)

In [42]:
merged_df = quality.merge(df.groupby('AN').sum(), how='inner', on='AN')

In [None]:
merged_df

In [None]:
merged_df.corr()['BQ'].sort_values(ascending=False)

## 확인

In [175]:
df_fr = df_total[['AN','FR']]

In [176]:
df_fr

Unnamed: 0,AN,FR
0,102410,0.020
1,102585,0.033
2,102930,0.050
3,103142,0.017
4,103675,0.013
...,...,...
131,147292,0.027
132,147546,0.027
133,147982,0.029
134,147996,0.030


In [188]:
data = get_df('data.csv')

In [189]:
column_names_1 = {'배정번호': 'AN', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP'}

data.rename(columns=column_names_1, inplace=True)

In [190]:
data

Unnamed: 0,TAG_MIN,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,...,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
0,2022-01-03 11:22:07,102410,75.6648,30.015500,,,68.4386,72.84030,59.7862,51.71690,...,0.450421,,,859.854,,,,,,
1,2022-01-03 11:22:08,102410,75.6706,32.273200,,,68.4386,78.44150,61.6286,50.44530,...,0.450356,1.146260e-10,860.338,859.780,860.044,859.786,294.658,272.538,328.734,328.865
2,2022-01-03 11:22:09,102410,75.6776,32.159200,98.8533,99.14600,68.4386,78.10990,61.5414,52.01960,...,0.450341,1.145200e-10,860.338,859.780,859.981,859.724,294.658,272.538,328.734,328.805
3,2022-01-03 11:22:11,102410,75.8656,30.831200,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,0.450201,1.144670e-10,860.338,859.842,859.950,859.599,294.719,272.538,328.674,328.865
4,2022-01-03 11:22:12,102410,73.6468,29.527400,98.7918,99.20750,68.4386,76.02620,61.1634,51.69150,...,0.450235,1.145360e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.740,328.808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939717,2022-07-19 19:08:55,148069,61.2937,0.093966,99.9056,100.48700,67.1140,71.83560,60.7118,50.37770,...,0.447112,1.144320e-10,858.986,859.421,859.930,859.419,280.798,272.102,332.058,332.247
2939718,2022-07-19 19:08:56,148069,61.7260,0.283887,99.8440,100.48700,67.1140,77.17830,60.6722,55.03920,...,0.447982,1.141160e-10,858.923,859.421,859.743,859.419,280.859,272.163,332.058,332.247
2939719,2022-07-19 19:08:57,148069,61.6784,0.205745,99.7825,100.48700,67.1140,73.17290,62.1574,56.49890,...,0.448688,1.140110e-10,858.986,859.358,859.681,859.419,280.859,272.163,332.058,332.186
2939720,2022-07-19 19:08:58,148069,61.5148,0.136414,99.8440,100.48700,67.1140,73.84430,62.0722,54.81390,...,0.448501,1.141160e-10,858.986,859.358,859.743,859.419,280.921,272.163,332.058,332.186


In [191]:
data = data.fillna(data.median())

In [192]:
data['이상치개수'] = 0

# Calculate quantiles once
quantiles = data.quantile([0.25, 0.75])

# Calculate the interquartile range for each column
iqr = 1.5 * (quantiles.loc[0.75] - quantiles.loc[0.25])

# Calculate the upper and lower bounds for outliers for each column
outlierhigh = quantiles.loc[0.75] + iqr
outlierlow = quantiles.loc[0.25] - iqr

# Create a boolean DataFrame where each element is True if it's an outlier and False otherwise
is_outlier = (data > outlierhigh) | (data < outlierlow)

# Sum the number of outliers for each row and store it in the '이상치개수' column
data['이상치개수'] = is_outlier.sum(axis=1)

data.head()

Unnamed: 0,TAG_MIN,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,...,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP,이상치개수
0,2022-01-03 11:22:07,102410,75.6648,30.0155,100.002,100.019,68.4386,72.8403,59.7862,51.7169,...,1.14555e-10,859.575,859.854,860.002,860.0,284.586,280.02,332.017,332.423,0
1,2022-01-03 11:22:08,102410,75.6706,32.2732,100.002,100.019,68.4386,78.4415,61.6286,50.4453,...,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865,3
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805,5
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865,5
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808,4


In [193]:
data['설비 이상신호'] =np.where(data['이상치개수'] < 9, 0, 1)
data['설비 이상신호'].value_counts()

0    2936301
1       3421
Name: 설비 이상신호, dtype: int64

In [194]:
data.drop('이상치개수', axis=1, inplace=True)
data.head()

Unnamed: 0,TAG_MIN,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,...,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP,설비 이상신호
0,2022-01-03 11:22:07,102410,75.6648,30.0155,100.002,100.019,68.4386,72.8403,59.7862,51.7169,...,1.14555e-10,859.575,859.854,860.002,860.0,284.586,280.02,332.017,332.423,0
1,2022-01-03 11:22:08,102410,75.6706,32.2732,100.002,100.019,68.4386,78.4415,61.6286,50.4453,...,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865,0
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805,0
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865,0
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808,0


In [197]:
data_danger = data.groupby('AN').sum()['설비 이상신호'].values
data_danger

array([  10,    0,  290,    0,    7,    0,  357,    3,    0,   21,    0,
          0,    0,    0,    5,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0, 1380,    0,    0,    0,    0,    0,    0,    0,
          0,   47,    0,   60,   12,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   89,    0,    0,   67,
          0,    0,    0,  124,    0,    0,   18,    0,    2,   25,    0,
          0,   76,    1,    0,    0,    3,    0,    0,    0,    0,   24,
          0,    0,    0,    0,    0,    0,    0,    0,   90,    2,  201,
          1,    0,  180,   10,    0,    0,    0,    0,    6,    2,   16,
        126,    1,    0,    0,    0,    0,    9,    6,    8,    0,    1,
          2,   57,    0,    0,    0,    0,    0,    0,    0,   80,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    2,    0,    0,
          0,    0,    0,    0])

In [198]:
data_total = data.groupby('AN').count()['설비 이상신호'].values
data_total

array([ 7342, 13299, 26982, 31729,  6218, 25708, 12559, 46836, 35348,
        4334, 37633, 18763, 38237, 40402, 19606, 14627,  5753, 40057,
       37640, 20529, 43475,  6204, 14443, 21983, 41014, 25377, 14105,
       36199,  8097, 13102,  7951, 37246, 15759, 22940,  7664, 38011,
       42779, 41806, 28685,   214, 32809,  6888, 20224, 16664, 22988,
       19280, 20257, 12473, 10519,  8231, 43021, 14806, 27634, 15063,
       30085, 13616, 26607, 19544,  7732, 36974,  4941, 42941, 36203,
        6903, 14201,  4591, 13430, 15948, 19730, 23284, 34413, 19616,
       17369, 31945, 24944, 12832, 13565,  3932, 33140, 26365, 13927,
       14433, 18563, 19765, 18852, 19203, 20852, 44749,  6578, 12562,
       13870,  7204, 20590, 13092, 40777, 34319, 23082, 36699,  5939,
       16914, 30588, 13675, 28375, 13166,  6961, 17844, 43503, 41353,
       28272, 14672, 33171, 41287,  8796, 28240, 20612, 13856, 28263,
        7474,  7774, 20476,  7122, 22788, 14292,  7006,  7992, 13396,
       13263,  9070,

In [199]:
df_equip = pd.DataFrame({'danger': data_danger, 'total': data_total})
df_equip

Unnamed: 0,danger,total
0,10,7342
1,0,13299
2,290,26982
3,0,31729
4,7,6218
...,...,...
131,0,20151
132,0,27651
133,0,18558
134,0,16092


In [203]:
df_equip['AN'] = quality['AN']

In [205]:
df_equip = df_equip[['AN','danger','total']]

In [207]:
df_equip['EFR'] = df_equip['danger']/df_equip['total']
df_equip

Unnamed: 0,AN,danger,total,EFR
0,102410,10,7342,0.001362
1,102585,0,13299,0.000000
2,102930,290,26982,0.010748
3,103142,0,31729,0.000000
4,103675,7,6218,0.001126
...,...,...,...,...
131,147292,0,20151,0.000000
132,147546,0,27651,0.000000
133,147982,0,18558,0.000000
134,147996,0,16092,0.000000


In [214]:
result = pd.merge(df_equip, df_fr, how='inner', on='AN')
result.drop(['danger', 'total'], axis=1, inplace=True)
result

Unnamed: 0,AN,EFR,FR
0,102410,0.001362,0.020
1,102585,0.000000,0.033
2,102930,0.010748,0.050
3,103142,0.000000,0.017
4,103675,0.001126,0.013
...,...,...,...
131,147292,0.000000,0.027
132,147546,0.000000,0.027
133,147982,0.000000,0.029
134,147996,0.000000,0.030


- t검정 수행

In [219]:
from scipy import stats

t_statistic, p_value = stats.ttest_ind(result['EFR'], result['FR'])
alpha = 0.05
print(f'통계량 : {t_statistic}')
print(f'p_value : {p_value}')

통계량 : -8.57310596082898
p_value : 7.930739242650732e-16


두 값의 차이가 크다. -> data.csv로 제품의 품질을 예측하기 어려워 보인다.