# 2.2 Sklearn

* 데이터 불러오기
* 2.2.1. 싸이킷-런 데이터 분리
* 2.2.2. 싸이킷-런 지도 학습
* 2.2.3. 싸이킷-런 비지도 학습
* 2.2.4. 싸이킷-런 특징 추출

In [None]:
import sklearn
# Display the scikit-learn version string; as the cell's last expression its
# value is shown as the cell output.
sklearn.__version__

## 데이터 불러오기

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset and list the fields the returned object exposes.
iris_dataset = load_iris()
dataset_keys = iris_dataset.keys()
print("iris_dataset key: {}".format(dataset_keys))

In [None]:
# Show the raw feature matrix first, then its dimensions.
iris_features = iris_dataset['data']
print(iris_features)
print("shape of data: {}".format(iris_features.shape))

In [None]:
# Show the entry stored under 'feature_names'.
feature_names = iris_dataset['feature_names']
print(feature_names)

In [None]:
# Print the label array, followed by the names stored alongside it.
for field in ('target', 'target_names'):
    print(iris_dataset[field])

In [None]:
# Print the text stored under 'DESCR'.
dataset_description = iris_dataset['DESCR']
print(dataset_description)

## 2.2.1. 싸이킷-런 데이터 분리

In [None]:
# Keep the label array under a short name; it is passed to train_test_split
# below as the labels to split alongside the features.
target = iris_dataset['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the samples for testing. The fixed seed makes the split
# reproducible from run to run.
train_input, test_input, train_label, test_label = train_test_split(
    iris_dataset['data'],
    target,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Report the dimensions of each of the four arrays produced by the split.
split_shape_reports = (
    ("shape of train_input: {}", train_input),
    ("shape of test_input: {}", test_input),
    ("shape of train_label: {}", train_label),
    ("shape of test_label: {}", test_label),
)
for report_template, split_array in split_shape_reports:
    print(report_template.format(split_array.shape))

## 2.2.2. 싸이킷-런 지도 학습

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Nearest-neighbour classifier configured to consult a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Train the classifier on the training split (features plus their labels).
knn.fit(X=train_input, y=train_label)

In [None]:
import numpy as np
# One hand-written sample with four feature values, wrapped in an outer list so
# the resulting array is 2-D with a single row.
sample_features = [6.1, 2.8, 4.7, 1.2]
new_input = np.array([sample_features])

In [None]:
# Predict the label of the hand-written sample; the result is displayed as the
# cell's output value.
knn.predict(X=new_input)

In [None]:
# Predict a label for every sample in the held-out test split and show them.
predict_label = knn.predict(X=test_input)
print(predict_label)

In [None]:
# Accuracy = fraction of test samples whose predicted label equals the true one
# (the mean of the element-wise boolean comparison).
knn_accuracy = np.mean(predict_label == test_label)
print('test accuracy {:.2f}'.format(knn_accuracy))

## 2.2.3. 싸이킷-런 비지도 학습

In [None]:
from sklearn.cluster import KMeans
# Cluster the samples into 3 groups. The seed is fixed because K-means uses
# random numbers to initialise its centroids: without it, both the cluster
# assignments and the arbitrary cluster numbering (0/1/2) can differ from run
# to run. 42 matches the seed already used for train_test_split.
k_means = KMeans(n_clusters=3, random_state=42)

In [None]:
# Fit the clusters on the training features only; no labels are passed.
k_means.fit(X=train_input)

In [None]:
# Cluster index assigned to each training sample by fit(); displayed as the
# cell's output value.
k_means.labels_

In [None]:
# For each cluster, list the true labels of the training samples assigned to
# it, to see how well the clusters line up with the real classes.
cluster_headings = (("0 cluster:", 0), ("1 cluster:", 1), ("2 cluster:", 2))
for heading, cluster_id in cluster_headings:
    members = k_means.labels_ == cluster_id
    print(heading, train_label[members])

In [None]:
import numpy as np
# One hand-written sample with four feature values, wrapped in an outer list so
# the resulting array is 2-D with a single row.
sample_features = [6.1, 2.8, 4.7, 1.2]
new_input = np.array([sample_features])

In [None]:
# Find which cluster the hand-written sample falls into. The result is a
# cluster index, not a class label.
prediction = k_means.predict(X=new_input)
print(prediction)

In [None]:
# Assign every sample of the held-out test split to a cluster and show the
# resulting cluster indices.
predict_cluster = k_means.predict(X=test_input)
print(predict_cluster)

In [None]:
# K-means numbers its clusters arbitrarily, so a fixed translation table such
# as "0 -> 1, 1 -> 0, 2 -> 2" is only right for one particular run. Instead,
# name each cluster after the most common true label among the training
# samples that were assigned to it.
cluster_to_label = np.zeros(k_means.n_clusters, dtype=int)
for cluster_id in range(k_means.n_clusters):
    member_labels = train_label[k_means.labels_ == cluster_id]
    if member_labels.size:  # skip a cluster that received no training samples
        cluster_to_label[cluster_id] = np.bincount(member_labels).argmax()

# Indexing the table with the predicted cluster ids translates all of them in
# one step and builds a new array, leaving predict_cluster itself untouched.
np_arr = cluster_to_label[np.asarray(predict_cluster)]
predict_label = np_arr.tolist()
print(predict_label)

In [None]:
# Fraction of test samples whose cluster-derived label equals the true label.
# predict_label is a plain list at this point, so it is converted to an array
# explicitly before the element-wise comparison.
kmeans_accuracy = np.mean(np.asarray(predict_label) == test_label)
print('test accuracy {:.2f}'.format(kmeans_accuracy))

## 2.2.4. 싸이킷-런 특징 추출

* CountVectorizer
* TfidfVectorizer

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus of four short sentences used to demonstrate the vectorizer.
text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부 해야지',
]

# Vectorizer created with its default settings.
count_vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the corpus, then show the vocabulary it learned.
count_vectorizer.fit(text_data)
learned_vocabulary = count_vectorizer.vocabulary_
print(learned_vocabulary)

In [None]:
# Transform only the first sentence, i.e. ['나는 배가 고프다'], and print the
# result as a dense array.
sentence = [text_data[0]]
encoded_counts = count_vectorizer.transform(sentence)
print(encoded_counts.toarray())

### TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# The same four-sentence toy corpus, redefined so this section runs on its own.
text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부 해야지',
]

# Vectorizer created with its default settings.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the corpus, then show the vocabulary it learned.
tfidf_vectorizer.fit(text_data)
tfidf_vocabulary = tfidf_vectorizer.vocabulary_
print(tfidf_vocabulary)

# Transform only the last sentence, i.e. ['점심 먹고 공부 해야지'], and print
# the result as a dense array.
sentence = [text_data[3]]
tfidf_weights = tfidf_vectorizer.transform(sentence)
print(tfidf_weights.toarray())