In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import numpy as np

In [2]:
%matplotlib inline

In [3]:
warnings.filterwarnings("ignore")

In [4]:
path = './data/'

In [5]:
encodings = ['cp949', 'euc-kr', 'utf-8']

def get_df(name):
    for encoding in encodings:
        try:
            df = pd.read_csv(path + f'{name}', encoding=encoding)
            break
        except UnicodeDecodeError:
            continue
#     df = df.sample(n=50000, random_state=42)
    return df

In [6]:
data = get_df('data.csv')

In [7]:
data.head()

Unnamed: 0,TAG_MIN,배정번호,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,...,소입로 CP 값,소입로 CP 모니터 값,소입로 온도 1 Zone,소입로 온도 2 Zone,소입로 온도 3 Zone,소입로 온도 4 Zone,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,솔트조 온도 2 Zone
0,2022-01-03 11:22:07,102410,75.6648,30.0155,,,68.4386,72.8403,59.7862,51.7169,...,0.450421,,,859.854,,,,,,
1,2022-01-03 11:22:08,102410,75.6706,32.2732,,,68.4386,78.4415,61.6286,50.4453,...,0.450356,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,0.450341,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,0.450201,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,0.450235,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808


In [8]:
column_names_1 = {'배정번호': 'AN', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP'}

data.rename(columns=column_names_1, inplace=True)

### 날짜 변환

In [9]:
data['TAG_MIN'] = pd.to_datetime(data['TAG_MIN'])

In [10]:
data['Year'] = data['TAG_MIN'].dt.year
data['Month'] = data['TAG_MIN'].dt.month
data['Day'] = data['TAG_MIN'].dt.day
data['Hour'] = data['TAG_MIN'].dt.hour
data['Minute'] = data['TAG_MIN'].dt.minute
data['Second'] = data['TAG_MIN'].dt.second

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2939722 entries, 0 to 2939721
Data columns (total 27 columns):
 #   Column     Dtype         
---  ------     -----         
 0   TAG_MIN    datetime64[ns]
 1   AN         int64         
 2   DZ1_OP     float64       
 3   DZ2_OP     float64       
 4   DZ1_TEMP   float64       
 5   DZ2_TEMP   float64       
 6   CLEAN      float64       
 7   HDZ1_OP    float64       
 8   HDZ2_OP    float64       
 9   HDZ3_OP    float64       
 10  HDZ4_OP    float64       
 11  HDZ_CP     float64       
 12  HDZ_CPM    float64       
 13  HDZ1_TEMP  float64       
 14  HDZ2_TEMP  float64       
 15  HDZ3_TEMP  float64       
 16  HDZ4_TEMP  float64       
 17  SCZ1_TEMP  float64       
 18  SCZ2_TEMP  float64       
 19  STZ1_TEMP  float64       
 20  STZ2_TEMP  float64       
 21  Year       int64         
 22  Month      int64         
 23  Day        int64         
 24  Hour       int64         
 25  Minute     int64         
 26  Second     int

In [12]:
data.isnull().sum()

TAG_MIN         0
AN              0
DZ1_OP          1
DZ2_OP          1
DZ1_TEMP      116
DZ2_TEMP      148
CLEAN          91
HDZ1_OP      4288
HDZ2_OP         0
HDZ3_OP         2
HDZ4_OP         3
HDZ_CP          1
HDZ_CPM       147
HDZ1_TEMP     130
HDZ2_TEMP     128
HDZ3_TEMP     157
HDZ4_TEMP     170
SCZ1_TEMP     106
SCZ2_TEMP     142
STZ1_TEMP     209
STZ2_TEMP     203
Year            0
Month           0
Day             0
Hour            0
Minute          0
Second          0
dtype: int64

In [13]:
data.describe()

Unnamed: 0,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,...,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP,Year,Month,Day,Hour,Minute,Second
count,2939722.0,2939721.0,2939721.0,2939606.0,2939574.0,2939631.0,2935434.0,2939722.0,2939720.0,2939719.0,...,2939616.0,2939580.0,2939513.0,2939519.0,2939722.0,2939722.0,2939722.0,2939722.0,2939722.0,2939722.0
mean,128442.2,69.89404,20.44708,100.0061,100.0198,67.71864,75.64373,54.86239,53.86029,71.08925,...,283.9963,279.9293,331.8062,332.1773,2022.0,4.15088,15.39287,11.08424,29.4769,29.4978
std,12637.04,4.014802,5.217123,0.4360371,0.3623526,1.630768,25.16083,4.429079,2.664304,2.556959,...,9.51277,6.611579,0.7827379,0.8732977,0.0,1.797681,8.476858,7.235095,17.32587,17.31843
min,102410.0,47.2532,0.000118883,97.3421,97.8706,60.6244,0.000850055,8.62001,0.0437045,0.0062442,...,266.23,266.426,328.161,328.073,2022.0,1.0,1.0,0.0,0.0,0.0
25%,119448.0,68.4288,18.9176,99.8144,99.8901,66.5694,64.9627,53.3259,52.3891,69.6781,...,274.754,273.502,331.867,332.178,2022.0,3.0,9.0,5.0,14.0,14.0
50%,129889.0,70.5166,21.2931,100.002,100.019,67.6972,82.2104,55.6654,53.8862,71.0454,...,284.586,280.02,332.017,332.423,2022.0,4.0,15.0,11.0,29.0,29.0
75%,139116.0,72.3781,23.3884,100.191,100.161,68.9799,95.3666,57.5733,55.4145,72.4771,...,293.343,286.334,332.141,332.626,2022.0,6.0,22.0,18.0,44.0,44.0
max,148069.0,87.2995,47.5395,102.469,101.843,71.4901,100.0,77.2709,66.015,87.3907,...,298.53,291.696,332.717,333.179,2022.0,7.0,31.0,23.0,59.0,59.0


In [14]:
quality = pd.read_excel(path+'quality.xlsx')

In [15]:
quality.head()

Unnamed: 0,배정번호,작업일,공정명,설비명,양품수량,불량수량,총수량
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981


In [16]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   배정번호    136 non-null    int64         
 1   작업일     136 non-null    datetime64[ns]
 2   공정명     136 non-null    object        
 3   설비명     136 non-null    object        
 4   양품수량    136 non-null    int64         
 5   불량수량    136 non-null    int64         
 6   총수량     136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [17]:
column_names_2 = {'배정번호': 'AN', '작업일': 'WD', '공정명': 'PN'
                ,'설비명': 'EN', '양품수량': 'GQ', '불량수량': 'BQ'
                ,'총수량': 'TQ'}

quality.rename(columns=column_names_2, inplace=True)

In [18]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   AN      136 non-null    int64         
 1   WD      136 non-null    datetime64[ns]
 2   PN      136 non-null    object        
 3   EN      136 non-null    object        
 4   GQ      136 non-null    int64         
 5   BQ      136 non-null    int64         
 6   TQ      136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [19]:
quality.describe()

Unnamed: 0,AN,GQ,BQ,TQ
count,136.0,136.0,136.0,136.0
mean,128897.191176,45012.301471,15.117647,45027.419118
std,12403.393924,25548.197992,18.549657,25554.54871
min,102410.0,8412.0,0.0,8414.0
25%,120467.75,29736.0,4.0,29755.0
50%,130199.0,44003.0,8.5,44020.5
75%,138982.5,60120.75,17.0,60135.5
max,148069.0,104740.0,120.0,104761.0


In [20]:
quality.isnull().sum()

AN    0
WD    0
PN    0
EN    0
GQ    0
BQ    0
TQ    0
dtype: int64

In [21]:
train = get_df('train.csv')

In [22]:
train.head()

Unnamed: 0.1,Unnamed: 0,건조 1존 OP_Avg,건조 1존 OP_Std,건조 2존 OP_Avg,건조 2존 OP_Std,건조로 온도 1 Zone_Avg,건조로 온도 1 Zone_Std,건조로 온도 2 Zone_Avg,건조로 온도 2 Zone_Std,세정기_Avg,...,소입로 온도 4 Zone_Std,솔트 컨베이어 온도 1 Zone_Avg,솔트 컨베이어 온도 1 Zone_Std,솔트 컨베이어 온도 2 Zone_Avg,솔트 컨베이어 온도 2 Zone_Std,솔트조 온도 1 Zone_Avg,솔트조 온도 1 Zone_Std,솔트조 온도 2 Zone_Avg,솔트조 온도 2 Zone_Std,불량단계
0,97,69.497726,3.274577,20.310463,3.490991,99.999143,0.435237,100.001123,0.401438,67.864965,...,0.304168,284.699659,9.60185,280.411936,6.940009,332.111266,0.152253,332.712474,0.153026,안정
1,125,68.7767,3.548587,16.547672,4.161717,100.07776,0.394062,100.107134,0.291589,69.61422,...,0.3037,285.00715,9.239152,280.646734,6.650701,332.123215,0.225985,332.093658,0.329912,위험
2,11,73.502913,2.645737,21.218347,2.218216,100.006615,0.387797,99.992686,0.281373,66.220995,...,0.40205,283.120448,9.426413,279.110908,6.064772,332.182887,0.099322,332.407261,0.10744,안정
3,129,68.062513,3.439085,4.366498,3.727635,100.040387,0.418439,100.139576,0.334492,69.242707,...,0.360325,285.074759,9.475964,280.790056,6.699134,332.277923,0.121808,332.261568,0.184196,안정
4,48,68.820299,3.946638,19.902113,3.765778,100.07193,0.367971,100.097453,0.291788,65.512487,...,0.211795,284.166005,9.847216,279.587268,6.606966,331.943223,0.189118,332.503069,0.359428,안정


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             108 non-null    int64  
 1   건조 1존 OP_Avg           108 non-null    float64
 2   건조 1존 OP_Std           108 non-null    float64
 3   건조 2존 OP_Avg           108 non-null    float64
 4   건조 2존 OP_Std           108 non-null    float64
 5   건조로 온도 1 Zone_Avg      108 non-null    float64
 6   건조로 온도 1 Zone_Std      108 non-null    float64
 7   건조로 온도 2 Zone_Avg      108 non-null    float64
 8   건조로 온도 2 Zone_Std      108 non-null    float64
 9   세정기_Avg                108 non-null    float64
 10  세정기_Std                108 non-null    float64
 11  소입1존 OP_Avg            108 non-null    float64
 12  소입1존 OP_Std            108 non-null    float64
 13  소입2존 OP_Avg            108 non-null    float64
 14  소입2존 OP_Std            108 non-null    float64
 15  소입3존 O

In [24]:
column_names_3 = {'Unnamed: 0' : 'UNNAMED', '건조 1존 OP': 'DZ1_OP', '건조 2존 OP': 'DZ2_OP'
                ,'건조로 온도 1 Zone': 'DZ1_TEMP', '건조로 온도 2 Zone': 'DZ2_TEMP', '세정기': 'CLEAN'
                ,'소입1존 OP': 'HDZ1_OP', '소입2존 OP': 'HDZ2_OP', '소입3존 OP': 'HDZ3_OP'
                ,'소입4존 OP': 'HDZ4_OP', '소입로 CP 값': 'HDZ_CP', '소입로 CP 모니터 값 ': 'HDZ_CPM'
                ,'소입로 온도 1 Zone': 'HDZ1_TEMP', '소입로 온도 2 Zone': 'HDZ2_TEMP', '소입로 온도 3 Zone': 'HDZ3_TEMP'
                ,'소입로 온도 4 Zone': 'HDZ4_TEMP', '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP', '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP'
                ,'솔트조 온도 1 Zone': 'STZ1_TEMP', '솔트조 온도 2 Zone': 'STZ2_TEMP', '불량단계' : 'FS'}

column_names_combined = {}
for key, value in column_names_3.items():
    column_names_combined[key] = value
    column_names_combined[f'{key}_Avg'] = f'{value}_Avg'
    column_names_combined[f'{key}_Std'] = f'{value}_Std'
    
train.rename(columns=column_names_combined, inplace=True)

In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UNNAMED        108 non-null    int64  
 1   DZ1_OP_Avg     108 non-null    float64
 2   DZ1_OP_Std     108 non-null    float64
 3   DZ2_OP_Avg     108 non-null    float64
 4   DZ2_OP_Std     108 non-null    float64
 5   DZ1_TEMP_Avg   108 non-null    float64
 6   DZ1_TEMP_Std   108 non-null    float64
 7   DZ2_TEMP_Avg   108 non-null    float64
 8   DZ2_TEMP_Std   108 non-null    float64
 9   CLEAN_Avg      108 non-null    float64
 10  CLEAN_Std      108 non-null    float64
 11  HDZ1_OP_Avg    108 non-null    float64
 12  HDZ1_OP_Std    108 non-null    float64
 13  HDZ2_OP_Avg    108 non-null    float64
 14  HDZ2_OP_Std    108 non-null    float64
 15  HDZ3_OP_Avg    108 non-null    float64
 16  HDZ3_OP_Std    108 non-null    float64
 17  HDZ4_OP_Avg    108 non-null    float64
 18  HDZ4_OP_St

## 결측치 처리

In [26]:
from impyute.imputation.cs import mice

np_imputed=mice(data.drop('TAG_MIN', axis=1).values)
data_imputed=pd.DataFrame(np_imputed)

In [27]:
data_imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,102410.0,75.6648,30.015500,100.179937,99.773778,68.4386,72.84030,59.7862,51.71690,71.8405,...,283.047696,279.328486,331.627229,331.614746,2022.0,1.0,3.0,11.0,22.0,7.0
1,102410.0,75.6706,32.273200,100.001387,99.658726,68.4386,78.44150,61.6286,50.44530,70.5198,...,294.658000,272.538000,328.734000,328.865000,2022.0,1.0,3.0,11.0,22.0,8.0
2,102410.0,75.6776,32.159200,98.853300,99.146000,68.4386,78.10990,61.5414,52.01960,72.0384,...,294.658000,272.538000,328.734000,328.805000,2022.0,1.0,3.0,11.0,22.0,9.0
3,102410.0,75.8656,30.831200,98.791800,99.176750,68.4999,77.50725,60.6663,52.69425,73.4815,...,294.719000,272.538000,328.674000,328.865000,2022.0,1.0,3.0,11.0,22.0,11.0
4,102410.0,73.6468,29.527400,98.791800,99.207500,68.4386,76.02620,61.1634,51.69150,71.6093,...,294.721000,272.599000,328.740000,328.808000,2022.0,1.0,3.0,11.0,22.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939717,148069.0,61.2937,0.093966,99.905600,100.487000,67.1140,71.83560,60.7118,50.37770,76.3514,...,280.798000,272.102000,332.058000,332.247000,2022.0,7.0,19.0,19.0,8.0,55.0
2939718,148069.0,61.7260,0.283887,99.844000,100.487000,67.1140,77.17830,60.6722,55.03920,76.3017,...,280.859000,272.163000,332.058000,332.247000,2022.0,7.0,19.0,19.0,8.0,56.0
2939719,148069.0,61.6784,0.205745,99.782500,100.487000,67.1140,73.17290,62.1574,56.49890,76.2566,...,280.859000,272.163000,332.058000,332.186000,2022.0,7.0,19.0,19.0,8.0,57.0
2939720,148069.0,61.5148,0.136414,99.844000,100.487000,67.1140,73.84430,62.0722,54.81390,76.2158,...,280.921000,272.163000,332.058000,332.186000,2022.0,7.0,19.0,19.0,8.0,58.0


## 공정별 불량품과의 PCC

In [28]:
df = data.drop('TAG_MIN', axis=1)

In [29]:
for col in df.columns.drop('AN'):
    # Calculate the first and third quartiles for the 'Dry 1-zone OP' column
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)

    # Calculate the IQR for the 'Dry 1-zone OP' column
    iqr = 1.5 * (q3 - q1)

    # Create a boolean mask for outliers in the 'Dry 1-zone OP' column
    outlier_mask = (df[col] < (q1 - iqr)) | (df[col] > (q3 + iqr))

    # Create a new column 'Outlier' to label rows with 1 for outliers and 0 for non-outliers
    df[col] = np.where(outlier_mask, 1, 0)

In [30]:
df

Unnamed: 0,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,...,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP,Year,Month,Day,Hour,Minute,Second
0,102410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,102410,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,102410,0,1,1,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,102410,0,1,1,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,102410,0,0,1,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939717,148069,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2939718,148069,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2939719,148069,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2939720,148069,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df.groupby('AN').sum()

Unnamed: 0_level_0,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,...,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP,Year,Month,Day,Hour,Minute,Second
AN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102410,550,708,1082,1762,0,428,24,281,494,763,...,0,0,7341,7341,0,0,0,0,0,0
102585,564,501,1591,1450,0,740,63,41,308,441,...,0,0,13299,13299,0,0,0,0,0,0
102930,963,399,2629,2627,0,1931,20042,9896,4869,3041,...,0,0,26981,26981,0,0,0,0,0,0
103142,566,369,1076,952,0,545,22591,504,551,977,...,0,0,31727,31728,0,0,0,0,0,0
103675,556,301,870,944,0,445,5484,689,441,361,...,0,0,6217,6217,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147292,833,20149,1149,887,0,943,13,13,600,1764,...,0,0,0,0,0,0,0,0,0,0
147546,665,1384,795,1062,0,817,21,42,195,829,...,0,0,11,2073,0,0,0,0,0,0
147982,862,1117,861,1715,0,909,26,80,474,990,...,0,0,0,0,0,0,0,0,0,0
147996,931,2281,821,1872,0,728,269,129,1438,1015,...,0,0,0,404,0,0,0,0,0,0


In [32]:
quality['AN']

0      102410
1      102585
2      102930
3      103142
4      103675
        ...  
131    147292
132    147546
133    147982
134    147996
135    148069
Name: AN, Length: 136, dtype: int64

In [33]:
merged_df = quality.merge(df.groupby('AN').sum(), how='inner', on='AN')

In [34]:
merged_df.corr()['BQ'].sort_values(ascending=False)

BQ           1.000000
HDZ1_TEMP    0.392567
HDZ1_OP      0.354210
TQ           0.342684
GQ           0.342043
HDZ2_TEMP    0.329278
DZ1_TEMP     0.293382
DZ2_TEMP     0.285482
HDZ4_TEMP    0.269034
HDZ3_TEMP    0.245543
HDZ_CPM      0.244564
HDZ_CP       0.241514
HDZ4_OP      0.056438
AN           0.054524
HDZ3_OP     -0.019865
STZ1_TEMP   -0.040367
DZ2_OP      -0.045525
CLEAN       -0.059197
DZ1_OP      -0.066733
STZ2_TEMP   -0.067383
HDZ2_OP     -0.071493
SCZ1_TEMP         NaN
SCZ2_TEMP         NaN
Year              NaN
Month             NaN
Day               NaN
Hour              NaN
Minute            NaN
Second            NaN
Name: BQ, dtype: float64