# 정준상관분석 : 예시 (2)

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import CCA
from scipy.stats import chi2

In [2]:
# 데이터 로드
data = pd.read_csv("./EU_Jobs.csv")
print(data.head())

# 결측치 확인 및 처리
data.fillna(0, inplace=True)

# X, Y 데이터 설정
X = data.iloc[:, 1:5].values
Y = data.iloc[:, 5:9].values

      Country   Agr  Min   Man   PS  Con    SI  Fin   SPS   TC
0     Belgium   3.3  0.9  27.6  0.9  8.2  19.1  6.2  26.6  7.2
1     Denmark   9.2  0.1  21.8  0.6  8.3  14.6  6.5  32.2  7.1
2      France  10.8  0.8  27.5  0.9  8.9  16.8  6.0  22.6  5.7
3  W. Germany   6.7  1.3  35.8  0.9  7.3  14.4  5.0  22.3  6.1
4     Ireland  23.2  1.0  20.7  1.3  7.5  16.8  2.8  20.8  6.1


In [3]:
# 정준상관분석 수행
cca = CCA(n_components=1)
cca.fit(X, Y)
X_c, Y_c = cca.transform(X, Y)

In [4]:
# 정준점수 계산
score = X_c
print("Mean of canonical scores:")
print(np.round(np.mean(score, axis=0), 2))
print("Standard deviation of canonical scores:")
print(np.round(np.std(score, axis=0), 2))
print("Covariance of canonical scores:")
print(np.round(np.cov(score.T), 2))

Mean of canonical scores:
[0.]
Standard deviation of canonical scores:
[0.7]
Covariance of canonical scores:
0.51


In [5]:
# Wilks' lambda 통계 계산 함수
def wilks_lambda(canonical_corr, n, p, q):
    wilks_stats = np.prod(1 - np.array(canonical_corr) ** 2)
    chi_sq = -(n - 1 - (p + q + 1) / 2) * np.log(wilks_stats)
    df = p * q
    p_value = chi2.sf(chi_sq, df)
    return wilks_stats, chi_sq, p_value

# 정준상관계수 계산
canonical_corr = [np.corrcoef(X_c[:, i], Y_c[:, i])[0, 1] for i in range(1)]
n, p, q = X.shape[0], X.shape[1], Y.shape[1]

wilks_stats, chi_sq, p_value = wilks_lambda(canonical_corr, n, p, q)
print("Wilks' lambda statistic:", wilks_stats)
print("Chi-squared statistic:", chi_sq)
print("p-value:", p_value)

Wilks' lambda statistic: 0.004495680975669547
Chi-squared statistic: 110.79508158868286
p-value: 3.1665385456919904e-16


In [6]:
# 공헌도와 근사도 계산
w = np.array(canonical_corr) ** 2
print("Contribution (squared canonical correlations):")
print(w)
print("Cumulative contribution (%):")
print(np.cumsum(w) / np.sum(w) * 100)

Contribution (squared canonical correlations):
[0.99550432]
Cumulative contribution (%):
[100.]
