<a href="https://colab.research.google.com/github/Nago27/AI_python/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the fruit image dataset and save it as fruits_300.npy.
# NOTE: the leading "!" is an IPython/Colab shell escape, so this line only
# runs inside a notebook, not as a plain Python script.
!wget https://bit.ly/fruits_300_data -O fruits_300.npy

import numpy as np
fruits = np.load('fruits_300.npy')
# Flatten every image into one row of 100 * 100 = 10,000 values.
# Presumably `fruits` has shape (300, 100, 100) — confirm against the data file.
fruits_2d = fruits.reshape(-1, 100 * 100)

import matplotlib.pyplot as plt
def draw_fruits(arr, ratio=1):
    """Show the images in *arr* on a grid with at most 10 images per row.

    Args:
        arr: Sequence of 2-D image arrays; it must support ``len()`` and
            integer indexing.
        ratio: Scale factor for the figure size; each grid cell is
            ``ratio`` x ``ratio`` in figsize units.

    Returns:
        None. The figure is displayed with ``plt.show()``. When *arr* is
        empty nothing is drawn.
    """
    n = len(arr)
    if n == 0:
        # A grid with zero rows/columns is rejected by plt.subplots(), so
        # there is nothing to draw for an empty input.
        return

    per_row = 10
    rows = (n + per_row - 1) // per_row  # ceil(n / per_row) without floats
    # A single row is only as wide as the number of images it holds.
    cols = n if rows < 2 else per_row
    # squeeze=False keeps `axs` two-dimensional even for a 1x1 or 1xN grid.
    _, axs = plt.subplots(rows, cols,
                          figsize=(cols * ratio, rows * ratio), squeeze=False)
    # Row-major order over the grid matches the image order in `arr`.
    for idx, ax in enumerate(axs.flat):
        if idx < n:
            ax.imshow(arr[idx], cmap='gray_r')
        # Hide the axes of every cell, including unused trailing cells.
        ax.axis('off')
    plt.show()

# Principal component analysis using scikit-learn's PCA class.
from sklearn.decomposition import PCA

# Fit a 50-component PCA on the flattened images (fit() returns the estimator).
pca = PCA(n_components=50).fit(fruits_2d)
print(pca.components_.shape)

# Project the images onto the fitted components: 50 features per sample.
fruits_pca = pca.transform(fruits_2d)
print(fruits_pca.shape)  # author's recorded output: (300, 50)

# Reconstruct the original images from their PCA representation.
fruits_inverse = pca.inverse_transform(fruits_pca)
fruits_reconstruct = fruits_inverse.reshape(-1, 100, 100)
# Display the reconstructions in three consecutive groups of 100 images.
for start in range(0, 300, 100):
    draw_fruits(fruits_reconstruct[start:start + 100])
    print('\n')

# Explained variance: print the total ratio covered by the kept components,
# then plot the ratio of each individual component.
print('설명된 분산', pca.explained_variance_ratio_.sum())
plt.plot(pca.explained_variance_ratio_)
plt.show()

In [None]:
# Use logistic regression to compare the original data with the
# dimension-reduced data.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Class labels in dataset order, 100 samples each:
# apple: 0, pineapple: 1, banana: 2
target = np.array([0]*100 + [1]*100 + [2]*100)

## Cross-validation on the original pixel features
from sklearn.model_selection import cross_validate
scores = cross_validate(lr, fruits_2d, target)
print(np.mean(scores['test_score']))  # author's recorded score: 0.997
print(np.mean(scores['fit_time']))  # author's recorded fit time: 0.942

## Cross-validation on the PCA features
scores = cross_validate(lr, fruits_pca, target)
print(np.mean(scores['test_score']))  # author's recorded score: 1.0
print(np.mean(scores['fit_time']))  # author's recorded fit time: 0.032

# k-means clustering on the dimension-reduced data.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, random_state=42)
km.fit(fruits_pca)
# Cluster ids and the number of samples assigned to each.
print(np.unique(km.labels_, return_counts=True))

# Show the original images that fell into each cluster.
for label in range(0, 3):
    draw_fruits(fruits[km.labels_ == label])
    print("\n")

# Scatter the first two principal components, one series per cluster.
for label in range(0, 3):
    # The fitted cluster assignments live in `labels_` (trailing underscore).
    data = fruits_pca[km.labels_ == label]
    plt.scatter(data[:, 0], data[:, 1])
# NOTE(review): k-means cluster ids are arbitrary, so this legend order assumes
# cluster 0/1/2 correspond to apple/banana/pineapple — confirm against the
# images drawn above.
plt.legend(['apple', 'banana', 'pineapple'])
plt.show()