In [1]:
import pandas as pd
import numpy as np

# np.triu, np.tril
- np.triu : 매트릭스의 대각선과 그 위쪽(상삼각)만 복사, 나머지 아래쪽은 0
- np.tril : 매트릭스의 대각선과 그 아래쪽(하삼각)만 복사, 나머지 위쪽은 0

### 인자
- k : 기준이 되는 대각선의 위치 (k=0은 주대각선, k>0은 주대각선보다 위쪽, k<0은 아래쪽 대각선)

In [2]:
# k=0: keep the main diagonal and everything below it; zero out the rest.
np.tril(np.ones(shape=(5, 5)), k=0)

array([[1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1.]])

In [3]:
# k=1: additionally keep the first diagonal above the main one.
np.tril(np.ones(shape=(5, 5)), k=1)

array([[1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

**Example**

- correlation matrix에 활용하면 쉽게 상관관계가 높은 변수를 추출하여 응용이 가능하다.

1) 제거 [(예시 링크)](https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/)

In [71]:
# Toy feature matrix: 9 rows (samples) x 5 columns (features).
X = np.array(
    [
        (1, 1, 1, 1, 1),
        (2, 2, 0, 2, 0),
        (3, 3, 1, 3, 1),
        (4, 4, 0, 5, 0),
        (5, 5, 1, 5, 1),
        (6, 6, 0, 6, 0),
        (7, 7, 1, 7, 1),
        (8, 7, 0, 8, 0),
        (9, 7, 1, 9, 1),
    ]
)

In [72]:
# Wrap the feature matrix in a DataFrame with one named column per feature.
df = pd.DataFrame(data=X, columns=list("abcde"))

In [73]:
# Absolute pairwise correlations between all feature columns.
corr_matrix = df.corr().abs()

# Boolean mask for the strictly-upper triangle: k=1 excludes the diagonal, so
# each pair of features is looked at once and self-correlation is ignored.
# The mask is created with the builtin `bool` because the `np.bool` alias was
# removed in NumPy 1.24 and raises AttributeError there.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Keep only the masked entries; the diagonal and the lower triangle become NaN.
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with at least one earlier column exceeds 0.95.
# NaN entries compare as False, so masked-out cells never trigger a drop.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [74]:
# Display the full absolute correlation matrix (symmetric, ones on the diagonal).
corr_matrix

Unnamed: 0,a,b,c,d,e
a,1.0,0.976103,0.0,0.992567,0.0
b,0.976103,1.0,0.034503,0.971864,0.034503
c,0.0,0.034503,1.0,0.048564,1.0
d,0.992567,0.971864,0.048564,1.0,0.048564
e,0.0,0.034503,1.0,0.048564,1.0


In [75]:
upper  # entries on and below the diagonal show up as NaN

Unnamed: 0,a,b,c,d,e
a,,0.976103,0.0,0.992567,0.0
b,,,0.034503,0.971864,0.034503
c,,,,0.048564,1.0
d,,,,,0.048564
e,,,,,


In [76]:
# Display the columns selected for removal by the 0.95 correlation threshold.
to_drop

['b', 'd', 'e']

In [77]:
# Drop the highly correlated features.
# Pass the column labels directly: the previous `df.drop(df[to_drop], axis=1)`
# built a throwaway sub-DataFrame and only worked because iterating a
# DataFrame yields its column names. `df` itself is left unmodified.
df.drop(columns=to_drop)

Unnamed: 0,a,c
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1
5,6,0
6,7,1
7,8,0
8,9,1


2) 상관관계 높은 변수끼리 그룹화 [(예시 링크)](https://stackoverflow.com/questions/24002820/returning-groups-of-correlated-columns-in-pandas-data-frame) -> PCA 등 가능

In [68]:
# Work on a copy that keeps only the strictly-lower triangle (k=-1), so each
# pair of columns is represented once and the diagonal is zeroed out.
corr_matrix2 = corr_matrix.copy()
corr_matrix2.loc[:, :] = np.tril(corr_matrix2, k=-1)  # borrowed from Karl D's answer

already_in = set()  # columns already placed in a group as a correlated partner
result = []         # one list per group: correlated partners, then the seed column
for col in corr_matrix2:
    # A column that already belongs to a group never starts a new one.
    if col in already_in:
        continue

    column_values = corr_matrix2[col]
    partners = column_values[column_values > 0.95].index.tolist()
    # Columns with no partner above the threshold form no group.
    if not partners:
        continue

    already_in.update(partners)
    result.append(partners + [col])

In [69]:
# Display the collected groups; each list holds the correlated columns
# followed by the column they were found from.
result

[['b', 'd', 'a'], ['e', 'c']]