주성분 분석

In [1]:
import pandas as pd
# Load the wine data set from disk.
df = pd.read_csv('./data/wine_data.csv')

# The 13 columns used as model inputs. The spellings must match the CSV
# header exactly, because they are used to index the DataFrame below.
# NOTE(review): 'Magesium' is presumably how the CSV header spells the
# column -- confirm against the data file before "fixing" it.
features = [
    'Alcohol',
    'Malic',
    'Ash',
    'Alcalinity',
    'Magesium',
    'Phenols',
    'Flavanoids',
    'Nonflavanoids',
    'Proanthocyanins',
    'Color',
    'Hue',
    'Dilution',
    'Proline',
]

# Feature matrix and class-label column.
X, y = df[features], df['class']

# Split into training and test sets. No test_size is passed, so
# scikit-learn's default split ratio applies; random_state=0 makes the
# shuffle reproducible.
from sklearn.model_selection import train_test_split
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into the
# preprocessing.
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler().fit(X_tn)  # fit() returns the fitted scaler
X_tn_std, X_te_std = std_scale.transform(X_tn), std_scale.transform(X_te)

In [2]:
# PCA: learn the two leading principal axes from the standardized training
# data, then project both splits onto them.
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(X_tn_std)  # fit() returns the fitted estimator
X_tn_pca, X_te_pca = pca.transform(X_tn_std), pca.transform(X_te_std)

In [3]:
# Confirm the dimensionality reduction by comparing the training-set shape
# before (standardized features) and after (PCA scores) the projection.
print(X_tn_std.shape)
print(X_tn_pca.shape)

(133, 13)
(133, 2)


In [4]:
# Inspect the singular values of the selected components.
# NOTE: these are singular values, not eigenvalues. The eigenvalues of the
# covariance matrix are exposed separately as `pca.explained_variance_`.
print(pca.singular_values_)

[25.3887705  18.00040359]


In [5]:
# Inspect the eigenvectors (principal axes): one row per component, one
# column per input feature.
print(pca.components_)

[[ 0.14285263 -0.24443441 -0.01392104 -0.24517361  0.1315099   0.38923734
   0.42471766 -0.30128613  0.30523465 -0.10462106  0.30267149  0.37266737
   0.29150867]
 [-0.50194071 -0.23015832 -0.31022311  0.03872248 -0.23858952 -0.09688729
  -0.01451474 -0.03693926 -0.0584635  -0.54240798  0.28168064  0.16944035
  -0.35182263]]


In [6]:
# Variance explained by each selected component (per the scikit-learn docs,
# these are the largest eigenvalues of the covariance matrix).
print(pca.explained_variance_)

[4.88325506 2.45465553]


In [7]:
# Fraction of the total variance explained by each selected component.
print(pca.explained_variance_ratio_)

[0.37281068 0.18739996]


In [10]:
# Build a DataFrame holding the two PCA scores plus the class label.
pca_columns = ['pca_comp1', 'pca_comp2']
X_tn_pca_df = pd.DataFrame(X_tn_pca, columns=pca_columns)
# Replace y_tn's index with a fresh 0..n-1 index so the labels line up
# row-for-row with the newly built DataFrame (column assignment aligns on
# index, and y_tn still carries the row labels it had before the split).
y_tn1 = y_tn.reset_index(drop=True)
X_tn_pca_df['target'] = y_tn1
# Last expression of the cell: displays the first five rows in the notebook.
X_tn_pca_df.head()

Unnamed: 0,pca_comp1,pca_comp2,target
0,2.1285,-1.019732,0
1,-2.948125,-0.254807,2
2,1.919795,1.337012,1
3,-2.133376,-0.945897,2
4,2.236803,-2.269839,0


LDA

In [13]:
# LDA: supervised dimensionality reduction. It is fitted on the standardized
# training data together with the class labels, then both splits are
# projected onto the learned discriminant axes. n_components is left at its
# default.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis().fit(X_tn_std, y_tn)  # fit() returns the fitted estimator
X_tn_lda, X_te_lda = lda.transform(X_tn_std), lda.transform(X_te_std)

In [20]:
# Compare the training-set shape before and after the LDA projection.
print(X_tn_std.shape)
print(X_tn_lda.shape)

(133, 13)
(133, 2)


In [21]:
# LDA intercept (constant) terms, one per class.
print(lda.intercept_)

[-10.43672342  -4.06646567 -11.86481563]


In [22]:
# LDA weight vectors: one row per class, one column per input feature.
print(lda.coef_)

[[ 2.08067239e+00  9.35053476e-03  1.21559585e+00 -2.93109927e+00
   9.76489959e-02 -2.70947375e+00  8.33641080e+00  9.59274709e-01
  -1.31511222e+00 -1.57641572e+00  6.01209963e-02  4.16138211e+00
   4.94293451e+00]
 [-1.50898343e+00 -1.03364308e+00 -1.42513305e+00  1.31234341e+00
  -8.24060552e-03  3.10104765e-01  4.63726906e-01  1.19450044e-01
   7.45392589e-01 -1.48434856e+00  8.54448096e-01  1.72053276e-01
  -2.18465736e+00]
 [-3.50493538e-01  1.28200202e+00  4.74650776e-01  1.51050245e+00
  -9.46719137e-02  2.52505333e+00 -9.54130024e+00 -1.18053287e+00
   4.82004904e-01  3.55008261e+00 -1.13269019e+00 -4.68855236e+00
  -2.58283290e+00]]


In [23]:
# Build a DataFrame holding the two LDA scores plus the class label.
lda_columns = ['lda_comp1', 'lda_comp2']
X_tn_lda_df = pd.DataFrame(X_tn_lda, columns=lda_columns)
# reset_index(drop=True) gives the labels a fresh 0..n-1 index so they line
# up row-for-row with the new DataFrame (column assignment aligns on index).
X_tn_lda_df['target'] = y_tn.reset_index(drop=True)
# Last expression of the cell: displays the first five rows in the notebook.
X_tn_lda_df.head()

Unnamed: 0,lda_comp1,lda_comp2,target
0,-3.75723,1.782242,0
1,3.7178,0.855757,2
2,-0.044172,-2.02249,1
3,3.502951,1.999101,2
4,-3.875662,2.650775,0


In [14]:
from sklearn.ensemble import RandomForestClassifier
# Train a shallow random forest on the LDA-projected training data;
# random_state=0 keeps the run reproducible.
clf_rf_lda = RandomForestClassifier(max_depth=2, random_state=0).fit(X_tn_lda, y_tn)

# Predict the classes of the LDA-projected test data.
pred_rf_lda = clf_rf_lda.predict(X_te_lda)

In [15]:
# Test-set accuracy of the random forest trained on the LDA features.
from sklearn.metrics import accuracy_score
accuracy_lda = accuracy_score(y_te, pred_rf_lda)
print(accuracy_lda)

0.9777777777777777


In [16]:
# Baseline for comparison: the same random forest configuration trained on
# the standardized features without any dimensionality reduction.
clf_rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_tn_std, y_tn)

# Score the baseline on the standardized test data.
pred_rf = clf_rf.predict(X_te_std)
acc = accuracy_score(y_te, pred_rf)
print(acc)

0.9555555555555556
