## 주성분분석 (PCA)

차원축소 - 특성추출

연속형 변수에 대해 PCA 진행

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import scipy as sp
import sklearn.preprocessing as skp

import statsmodels.stats.anova as anova
from statsmodels.formula.api import ols
import statsmodels

|구분|설명|
|---|---|
| [01]  CRIM |자치시(town) 별 1인당 범죄율|
| [02]  ZN	| 25,000 평방피트를 초과하는 거주지역의 비율|
| [03]  INDUS |비소매상업지역이 점유하고 있는 토지의 비율|
| [04]  CHAS |찰스강에 대한 더미변수(강의 경계에 위치한 경우는 1, 아니면 0)|
| [05]  NOX	| 일산화질소 농도 (단위: 1,000만분의 1, parts per 10 million)|
| [06]  RM	| 주택 1가구당 평균 방의 개수|
| [07]  AGE	| 1940년 이전에 건축된 소유주택의 비율|
| [08]  DIS	| 5개의 보스턴 고용센터까지의 가중 거리|
| [09]  RAD	| 방사형 도로까지의 접근성 지수|
| [10]  TAX	| 10,000 달러 당 재산세율|
| [11]  PTRATIO|	자치시(town)별 학생/교사 비율|
| [12]  B	| 1000(Bk-0.63)^2, 여기서 Bk는 자치시별 흑인의 비율을 말함.|
| [13]  LSTAT |	모집단의 하위계층의 비율(%)|
| [14]  MEDV |	본인 소유의 주택가격(중앙값) (단위: $1,000)|

In [2]:
# Read the Boston housing workbook straight from its URL into a DataFrame.
data = pd.read_excel(io="https://data.hossam.kr/E04/boston.xlsx")
# Bare expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,0
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,0
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,0
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,0


In [3]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    int64  
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    int64  
 9   TAX        506 non-null    int64  
 10  PTRATIO    506 non-null    float64
 11  B          506 non-null    float64
 12  LSTAT      506 non-null    float64
 13  MEDV       506 non-null    float64
 14  CAT. MEDV  506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB


> CAT. MEDV 필드 삭제

In [4]:
# Build the working frame without the 'CAT. MEDV' column. A non-inplace drop
# returns a new DataFrame, so the raw `data` frame is left unmodified.
df = data.drop(columns='CAT. MEDV')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


독립변수와 종속변수 구분

In [11]:
# Independent variables: every column except the target MEDV and CHAS
# (CHAS is a categorical dummy, so it is excluded from the PCA inputs).
# Index.difference yields the remaining labels in sorted order.
x = df.loc[:, df.columns.difference(['MEDV', 'CHAS'])]

# Dependent variable (target).
y = df.loc[:, 'MEDV']

In [12]:
x.describe()

Unnamed: 0,AGE,B,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,RM,TAX,ZN
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,68.574901,356.674032,3.613524,3.795043,11.136779,12.653063,0.554695,18.455534,9.549407,6.284634,408.237154,11.363636
std,28.148861,91.294864,8.601545,2.10571,6.860353,7.141062,0.115878,2.164946,8.707259,0.702617,168.537116,23.322453
min,2.9,0.32,0.00632,1.1296,0.46,1.73,0.385,12.6,1.0,3.561,187.0,0.0
25%,45.025,375.3775,0.082045,2.100175,5.19,6.95,0.449,17.4,4.0,5.8855,279.0,0.0
50%,77.5,391.44,0.25651,3.20745,9.69,11.36,0.538,19.05,5.0,6.2085,330.0,0.0
75%,94.075,396.225,3.677083,5.188425,18.1,16.955,0.624,20.2,24.0,6.6235,666.0,12.5
max,100.0,396.9,88.9762,12.1265,27.74,37.97,0.871,22.0,24.0,8.78,711.0,100.0


> 변수마다 값의 범위(스케일)가 크게 다르므로 표준화를 진행

데이터 표준화

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Learn each feature's mean / standard deviation, then apply the scaling.
# fit() returns the scaler itself, so `scale` is still the fitted estimator.
scale = StandardScaler().fit(x)
std_x = scale.transform(x)

# Wrap the scaled ndarray in a DataFrame (columns are positional integers
# at this point; they are relabelled with the feature names afterwards).
std_x_df = pd.DataFrame(data=std_x)
std_x_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.120013,0.441052,-0.419782,0.140214,-1.287909,-1.075562,-0.144217,-1.459000,-0.982843,0.413672,-0.666608,0.284830
1,0.367166,0.441052,-0.417339,0.557160,-0.593381,-0.492439,-0.740262,-0.303094,-0.867883,0.194274,-0.987329,-0.487722
2,-0.265812,0.396427,-0.417342,0.557160,-0.593381,-1.208727,-0.740262,-0.303094,-0.867883,1.282714,-0.987329,-0.487722
3,-0.809889,0.416163,-0.416750,1.077737,-1.306878,-1.361517,-0.835284,0.113032,-0.752922,1.016303,-1.106115,-0.487722
4,-0.511180,0.441052,-0.412482,1.077737,-1.306878,-1.026501,-0.835284,0.113032,-0.752922,1.228577,-1.106115,-0.487722
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.018673,0.387217,-0.413229,-0.625796,0.115738,-0.418147,0.158124,1.176466,-0.982843,0.439316,-0.803212,-0.487722
502,0.288933,0.441052,-0.415249,-0.716639,0.115738,-0.500850,0.158124,1.176466,-0.982843,-0.234548,-0.803212,-0.487722
503,0.797449,0.441052,-0.413447,-0.773684,0.115738,-0.983048,0.158124,1.176466,-0.982843,0.984960,-0.803212,-0.487722
504,0.736996,0.403225,-0.407764,-0.668437,0.115738,-0.865302,0.158124,1.176466,-0.982843,0.725672,-0.803212,-0.487722


In [29]:
# Map each positional column index to its feature name. enumerate() follows
# the actual number of columns in `x`, instead of a hard-coded range(0, 12)
# that would silently drop names if the feature set changed.
dict(enumerate(x.columns))

{0: 'AGE',
 1: 'B',
 2: 'CRIM',
 3: 'DIS',
 4: 'INDUS',
 5: 'LSTAT',
 6: 'NOX',
 7: 'PTRATIO',
 8: 'RAD',
 9: 'RM',
 10: 'TAX',
 11: 'ZN'}

In [31]:
# Relabel the positional integer columns of the scaled frame with the
# original feature names. The index -> name mapping is derived from
# `x.columns` itself rather than a hard-coded count of 12, so it stays
# correct if the set of features changes.
std_x_df.rename(columns=dict(enumerate(x.columns)), inplace=True)
std_x_df

Unnamed: 0,AGE,B,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,RM,TAX,ZN
0,-0.120013,0.441052,-0.419782,0.140214,-1.287909,-1.075562,-0.144217,-1.459000,-0.982843,0.413672,-0.666608,0.284830
1,0.367166,0.441052,-0.417339,0.557160,-0.593381,-0.492439,-0.740262,-0.303094,-0.867883,0.194274,-0.987329,-0.487722
2,-0.265812,0.396427,-0.417342,0.557160,-0.593381,-1.208727,-0.740262,-0.303094,-0.867883,1.282714,-0.987329,-0.487722
3,-0.809889,0.416163,-0.416750,1.077737,-1.306878,-1.361517,-0.835284,0.113032,-0.752922,1.016303,-1.106115,-0.487722
4,-0.511180,0.441052,-0.412482,1.077737,-1.306878,-1.026501,-0.835284,0.113032,-0.752922,1.228577,-1.106115,-0.487722
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.018673,0.387217,-0.413229,-0.625796,0.115738,-0.418147,0.158124,1.176466,-0.982843,0.439316,-0.803212,-0.487722
502,0.288933,0.441052,-0.415249,-0.716639,0.115738,-0.500850,0.158124,1.176466,-0.982843,-0.234548,-0.803212,-0.487722
503,0.797449,0.441052,-0.413447,-0.773684,0.115738,-0.983048,0.158124,1.176466,-0.982843,0.984960,-0.803212,-0.487722
504,0.736996,0.403225,-0.407764,-0.668437,0.115738,-0.865302,0.158124,1.176466,-0.982843,0.725672,-0.803212,-0.487722


연속형 변수에 대해 PCA 수행

In [32]:
from sklearn.decomposition import PCA

In [None]:
# Instantiate the estimator. Binding the bare class (`pca = PCA`) would make
# later calls such as pca.fit(...) fail because no instance exists.
# With no arguments, PCA keeps all components.
# NOTE(review): set n_components here if a specific number of components
# is intended — confirm.
pca = PCA()