# 4.3 다변량 비시각화
## 다변량 비시각화
두 개 이상의 변수로 구성된 데이터 관계를 교차표 및 상관계수 등으로 파악하는 데이터 탐색 유형  
=> 주어진 **변수 간의 관계를 수치 및 통계적 지표 기반으로 파악**하는 것이 목적
|데이터 조합|비시각화 방안|목적|
|---|---|---|
|범주형-범주형|**교차표**|두 개 범주형 변수의 **범주 별 연관성 및 구성**파악|
|범주형-연속형|**범주 별 통계량**|**범주 별 대표 통계량 비교** 파악|
|연속형-연속형|**상관계수**|두 개 **연속형 변수의 관계성 정도** 파악|
## 교차표(Cross tabulation)
범주형 - 범주형 변수 조합 간 연관 관계 파악
## 범주 별 요약 통계량
범주형 - 연속형 변수 조합 간 범주 별 대표 수치 비교
- 평균, 중앙값 등
## 상관계수(Corr. coefficient)
연속형 - 연속형 변수 조합 간 관계성 강도 파악
### 높은 상관계수
비슷한 정보를 제공하는 밀접한 관계의 변수
- 1 ~ 0.7 : 강한 상관관계
- 0.7 ~ 0.3 : 상관관계 존재
- 0.3 ~ 0.1 : 약한 상관관계
- 0.1 ~ 0 : 미미한 관계
  
- 회귀분석에서 독립변수 간에 **강한 상관관계** 발생 -> **다중공선성**발생
- 독립변수 간의 관계는 독립적이라는 **회귀분석 가정**에 위배
- 회귀 계수가 불안정하여 종속변수에 미치는 영향력을 올바르게 설명치 못하므로 모델의 안정성 저해  
>데이터 탐색 중 상관분석 결과를 통해 모델링 사전 단계 내 고려 필요

## 실습

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
from sklearn.datasets import load_boston
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    data = load_boston()


In [3]:
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=['MEDV'])

housing = pd.merge(X, y, left_index=True, right_index=True, how='inner')

housing_data = housing.copy()

## 범주형 - 범주형 다변량 비시각화
### 교차표 (Cross Tabulation)

In [4]:
housing_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [5]:
housing_data = housing_data.astype({'CHAS' : 'object'})

In [6]:
medv_bins = [0, np.mean(housing_data['MEDV']), np.max(housing_data['MEDV'])]
medv_names = ['cheap', 'expensive']
housing_data['MEDV_G'] = pd.cut(housing_data['MEDV'], medv_bins, labels=medv_names)
housing_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,MEDV_G
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0,expensive
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6,cheap
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,expensive
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,expensive
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2,expensive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4,cheap
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6,cheap
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9,expensive
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0,cheap


In [7]:
rst_CHAS = pd.crosstab(housing_data['CHAS'], housing_data['MEDV_G'], margins=True)
rst_CHAS

MEDV_G,cheap,expensive,All
CHAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,282,189,471
1.0,15,20,35
All,297,209,506


In [8]:
indus_bins = [0, np.mean(housing_data['INDUS']), np.max(housing_data['INDUS'])]
indus_names = ['INUDS_LOW', 'INDUS_HIGH']
housing_data['INDUS_G'] = pd.cut(housing_data['INDUS'], indus_bins, labels=indus_names)
housing_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,MEDV_G,INDUS_G
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0,expensive,INUDS_LOW
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6,cheap,INUDS_LOW
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,expensive,INUDS_LOW
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,expensive,INUDS_LOW
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2,expensive,INUDS_LOW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4,cheap,INDUS_HIGH
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6,cheap,INDUS_HIGH
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9,expensive,INDUS_HIGH
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0,cheap,INDUS_HIGH


In [11]:
rst_INDUS = pd.crosstab(housing_data['INDUS_G'], housing_data['MEDV_G'], margins=True)

rst_INDUS['ratio_cheap'] = np.round((rst_INDUS['cheap'] / rst_INDUS['All'])*100, 2)
rst_INDUS['ratio_expensive'] = np.round((rst_INDUS['expensive'] / rst_INDUS['All'])*100, 2)
rst_INDUS

MEDV_G,cheap,expensive,All,ratio_cheap,ratio_expensive
INDUS_G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
INUDS_LOW,126,168,294,42.86,57.14
INDUS_HIGH,171,41,212,80.66,19.34
All,297,209,506,58.7,41.3


In [12]:
RAD_bins = [0, np.mean(housing_data['RAD']), np.max(housing_data['RAD'])]
RAD_names = ['INUDS_LOW', 'RAD_HIGH']
housing_data['RAD_G'] = pd.cut(housing_data['RAD'], RAD_bins, labels=RAD_names)

rst_RAD = pd.crosstab(housing_data['RAD_G'], housing_data['MEDV_G'], margins=True)

rst_RAD['ratio_cheap'] = np.round((rst_RAD['cheap'] / rst_RAD['All'])*100, 2)
rst_RAD['ratio_expensive'] = np.round((rst_RAD['expensive'] / rst_RAD['All'])*100, 2)
rst_RAD

MEDV_G,cheap,expensive,All,ratio_cheap,ratio_expensive
RAD_G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
INUDS_LOW,183,191,374,48.93,51.07
RAD_HIGH,114,18,132,86.36,13.64
All,297,209,506,58.7,41.3


In [15]:
rst_df = pd.crosstab([housing_data['RAD_G'], housing_data['INDUS_G']], housing_data['MEDV_G'], margins=True)
rst_df['ratio_cheap'] = np.round((rst_df['cheap'] / rst_df['All'])*100, 2)
rst_df['ratio_expensive'] = np.round((rst_df['expensive'] / rst_df['All'])*100, 2)
rst_df

Unnamed: 0_level_0,MEDV_G,cheap,expensive,All,ratio_cheap,ratio_expensive
RAD_G,INDUS_G,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
INUDS_LOW,INUDS_LOW,126,168,294,42.86,57.14
INUDS_LOW,INDUS_HIGH,57,23,80,71.25,28.75
RAD_HIGH,INDUS_HIGH,114,18,132,86.36,13.64
All,,297,209,506,58.7,41.3


In [16]:
re_indus_bins = [0, 
            np.max(housing_data['INDUS'])/4*1,
            np.max(housing_data['INDUS'])/4*2,
            np.max(housing_data['INDUS'])/4*3, 
            np.max(housing_data['INDUS'])]
re_indus_names = ['INUDS_G1', 'INDUS_G2', 'INDUS_G3', 'INDUS_G4']
housing_data['RE_INDUS_G'] = pd.cut(housing_data['INDUS'], re_indus_bins, labels=re_indus_names)

rst_RE_INDUS = pd.crosstab(housing_data['RE_INDUS_G'], housing_data['MEDV_G'], margins=True)

rst_RE_INDUS['ratio_cheap'] = np.round((rst_RE_INDUS['cheap'] / rst_RE_INDUS['All'])*100, 2)
rst_RE_INDUS['ratio_expensive'] = np.round((rst_RE_INDUS['expensive'] / rst_RE_INDUS['All'])*100, 2)
rst_RE_INDUS

MEDV_G,cheap,expensive,All,ratio_cheap,ratio_expensive
RE_INDUS_G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
INUDS_G1,56,139,195,28.72,71.28
INDUS_G2,79,31,110,71.82,28.18
INDUS_G3,136,38,174,78.16,21.84
INDUS_G4,26,1,27,96.3,3.7
All,297,209,506,58.7,41.3


In [17]:
rst_df = pd.crosstab([housing_data['RAD_G'], housing_data['RE_INDUS_G']], housing_data['MEDV_G'], margins=True)

rst_df['ratio_cheap'] = np.round((rst_df['cheap'] / rst_df['All'])*100, 2)
rst_df['ratio_expensive'] = np.round((rst_df['expensive'] / rst_df['All'])*100, 2)
rst_df

Unnamed: 0_level_0,MEDV_G,cheap,expensive,All,ratio_cheap,ratio_expensive
RAD_G,RE_INDUS_G,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
INUDS_LOW,INUDS_G1,56,139,195,28.72,71.28
INUDS_LOW,INDUS_G2,79,31,110,71.82,28.18
INUDS_LOW,INDUS_G3,22,20,42,52.38,47.62
INUDS_LOW,INDUS_G4,26,1,27,96.3,3.7
RAD_HIGH,INDUS_G3,114,18,132,86.36,13.64
All,,297,209,506,58.7,41.3


## 범주형 - 연속형 다변량 비시각화 

In [18]:
pd.DataFrame(housing_data.groupby(['MEDV_G'])['INDUS'].mean())

Unnamed: 0_level_0,INDUS
MEDV_G,Unnamed: 1_level_1
cheap,13.813266
expensive,7.333349


In [20]:
pd.DataFrame(housing_data.groupby(['MEDV_G'])['INDUS'].median())

Unnamed: 0_level_0,INDUS
MEDV_G,Unnamed: 1_level_1
cheap,18.1
expensive,5.86


In [21]:
pd.DataFrame(housing_data.groupby(['MEDV_G'])['AGE'].mean())

Unnamed: 0_level_0,AGE
MEDV_G,Unnamed: 1_level_1
cheap,79.009764
expensive,53.746411


In [22]:
pd.DataFrame(housing_data.groupby(['MEDV_G'])['AGE'].median())

Unnamed: 0_level_0,AGE
MEDV_G,Unnamed: 1_level_1
cheap,88.6
expensive,52.6


## 연속형 - 연속형 다변량 비시각화

In [23]:
housing_data = housing.copy()

In [24]:
housing_data = housing_data.astype({'CHAS' : 'object'})

In [25]:
np.round(housing_data.corrwith(housing_data['MEDV']), 2).sort_values()

LSTAT     -0.74
PTRATIO   -0.51
INDUS     -0.48
TAX       -0.47
NOX       -0.43
CRIM      -0.39
AGE       -0.38
RAD       -0.38
DIS        0.25
B          0.33
ZN         0.36
RM         0.70
MEDV       1.00
dtype: float64

In [26]:
np.round(housing_data.corr(), 2)

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
CRIM,1.0,-0.2,0.41,0.42,-0.22,0.35,-0.38,0.63,0.58,0.29,-0.39,0.46,-0.39
ZN,-0.2,1.0,-0.53,-0.52,0.31,-0.57,0.66,-0.31,-0.31,-0.39,0.18,-0.41,0.36
INDUS,0.41,-0.53,1.0,0.76,-0.39,0.64,-0.71,0.6,0.72,0.38,-0.36,0.6,-0.48
NOX,0.42,-0.52,0.76,1.0,-0.3,0.73,-0.77,0.61,0.67,0.19,-0.38,0.59,-0.43
RM,-0.22,0.31,-0.39,-0.3,1.0,-0.24,0.21,-0.21,-0.29,-0.36,0.13,-0.61,0.7
AGE,0.35,-0.57,0.64,0.73,-0.24,1.0,-0.75,0.46,0.51,0.26,-0.27,0.6,-0.38
DIS,-0.38,0.66,-0.71,-0.77,0.21,-0.75,1.0,-0.49,-0.53,-0.23,0.29,-0.5,0.25
RAD,0.63,-0.31,0.6,0.61,-0.21,0.46,-0.49,1.0,0.91,0.46,-0.44,0.49,-0.38
TAX,0.58,-0.31,0.72,0.67,-0.29,0.51,-0.53,0.91,1.0,0.46,-0.44,0.54,-0.47
PTRATIO,0.29,-0.39,0.38,0.19,-0.36,0.26,-0.23,0.46,0.46,1.0,-0.18,0.37,-0.51


In [27]:
import scipy.stats as stats
stats.pearsonr(housing_data.TAX, housing_data.RAD)

PearsonRResult(statistic=0.9102281885331875, pvalue=4.129920119396259e-195)