# 一、 資料預處理

### 資料預處理

In [None]:
# !gdown 'google drive ID' --output filename.zip
# !unzip filename.zip

In [None]:
import pandas as pd

# Load the car-model dataset from a CSV file in the working directory.
df = pd.read_csv('汽車車型資料檔.csv')
df

# Keep every column except the first and the last one. According to the
# original author's note these two are the non-numeric columns (not
# verifiable from the notebook alone -- confirm against the CSV).
df1 = df.iloc[:, 1:df.shape[1] - 1]
df1

In [None]:
# Z-score standardization
# !pip install scikit-learn
from sklearn.preprocessing import StandardScaler    # load the scaler class

# Learn the scaling parameters from df1, then apply them to the same frame.
std = StandardScaler()
arr = std.fit(df1).transform(df1)
arr

In [None]:
# Min-max normalization
from sklearn.preprocessing import MinMaxScaler

# NOTE: `std` and `arr` are rebound here; the values produced by the
# previous scaler cell are replaced.
std = MinMaxScaler()
arr = std.fit(df1).transform(df1)
arr

### 非數值資料轉換

In [None]:
# 1. Mapping-dictionary method
import pandas as pd

# Load the customer-contact dataset; this rebinds `df`.
df = pd.read_csv('客戶聯絡狀況資料檔.csv')

df['婚姻'].unique()  # list the distinct categories (only shown when run as the cell's last expression)
dict1 = {'單身':0, '已婚':1, '未知':2, '離婚':3}
# Assign the result back to the column instead of calling
# `df['婚姻'].replace(..., inplace=True)`: the in-place call operates on a
# temporary Series, which raises a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
df['婚姻'] = df['婚姻'].replace(dict1)
df

In [None]:
# 2. Label-encoding method
from sklearn.preprocessing import LabelEncoder

df['工作'].unique()
# Pattern: fit the encoder on the column, then overwrite the column with
# the integer codes produced by the fitted encoder.
label1 = LabelEncoder()
label1.fit(df['工作'])
df['工作'] = label1.transform(df['工作'])
df

In [None]:
# Look up which original category each encoded integer stands for.
# Syntax: <encoder object>.classes_
# (To keep the result around: labelname = label1.classes_)
# When the notebook is run top to bottom, `label1` at this point is the
# encoder that was fitted on the '工作' column.
label1.classes_

In [None]:
# 3. One-hot encoding method
# df['訂購'].unique()

# Step 1: convert the column to integer labels. This step is optional --
# one-hot encoding can also be applied without label-encoding first.
from sklearn.preprocessing import LabelEncoder
# `label1` is rebound here, so the encoder previously stored under this
# name is no longer reachable afterwards.
label1 = LabelEncoder()
label1.fit(df['訂購'])
df['訂購'] = label1.transform(df['訂購'])
df

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Step 2: one-hot encode the '訂購' column.
# The original call used `OneHotEncoder(sparse=False)`. That keyword was
# renamed to `sparse_output` and later removed from scikit-learn, so it
# raises TypeError on current releases. Keeping the default (sparse) output
# and densifying with `.toarray()` yields the same dense array on every
# scikit-learn version. Sparse output uses less memory but is harder to read.
onehot = OneHotEncoder()
# The encoder expects 2-D input, hence the double brackets (a DataFrame).
arr = onehot.fit_transform(df[['訂購']]).toarray()
arr

# 二、 機器學習

> 01. 監督式學習: K近鄰、 單純貝氏分類、 決策樹

- 分類: 目標值是離散的類別（非連續值），例如預測用戶是否能獲得貸款

- 回歸: 目標值是連續值，例如預測用戶能獲得多少貸款金額

> 02. 非監督式學習: K-means、DBSCAN

### K-means 演算法

In [None]:
import pandas as pd
# Load the customer dataset used for clustering. This rebinds `df`,
# replacing the frame loaded in the preprocessing section.
df = pd.read_csv('customer.csv')
df

In [None]:
# Encode the gender column as integers so it can be used as a numeric feature.
dict1 = {"男" : 1, "女" : 2}
# Assign the result back to the column instead of using
# `df['性別'].replace(..., inplace=True)`: the in-place call acts on a
# temporary Series, which raises a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
df['性別'] = df['性別'].replace(dict1)
df

In [None]:
# Cluster the customers and attach the cluster id to each row.
from sklearn.cluster import KMeans

# Cluster only on the input columns. If this cell is re-run, `df` already
# carries the '類別' column added below; dropping it keeps a previous
# clustering result from being fed back in as a feature.
cluster_features = df.drop(columns=['類別'], errors='ignore')

# Syntax: KMeans(n_clusters=<number of groups>)
# random_state pins the centroid initialisation so the cluster ids
# interpreted in the following cells stay the same across runs; n_init is
# given explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
# Syntax: <KMeans object>.fit(<training data>)
km.fit(cluster_features)

# Cluster id assigned to each row
km.labels_

# Attach the cluster ids to the frame
df['類別'] = km.labels_
df

In [None]:
# Interpret each cluster: start with the rows labelled 0.
in_cluster_0 = df['類別'] == 0
df1 = df.loc[in_cluster_0]
df1.head(25)  # show only the first 25 matching rows, all columns
# Author's reading of the run they observed: higher income / higher spending
# index --> VIP. Cluster numbering is assigned by KMeans and may not match
# on a different run.

In [None]:
# Rows labelled 1 (first 25 shown).
in_cluster_1 = df['類別'] == 1
df2 = df.loc[in_cluster_1]
df2.head(25)
# Author's reading of the run they observed: no clear income pattern / lower
# spending index --> ordinary customers, or customers not satisfied with the
# shop's goods. Cluster numbering may not match on a different run.

In [None]:
# Rows labelled 2 (first 25 shown).
in_cluster_2 = df['類別'] == 2
df3 = df.loc[in_cluster_2]
df3.head(25)
# Author's reading of the run they observed: low-to-middle income / higher
# spending index --> customers who love the shop (potential). Cluster
# numbering may not match on a different run.

In [None]:
# Evaluate clustering quality for different numbers of clusters.
from sklearn.metrics import calinski_harabasz_score

# By the time this cell runs, `df` contains the '類別' column written by the
# clustering cell. Exclude it so the previous cluster ids are not used as a
# feature when fitting and scoring (they would bias every score).
eval_features = df.drop(columns=['類別'], errors='ignore')

for n in range(2,15):
    # random_state keeps the printed scores repeatable; n_init is explicit
    # because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=n, n_init=10, random_state=0)  # split into n clusters
    km.fit(eval_features)
    metric = calinski_harabasz_score(eval_features, km.labels_)
    print(f'群組數量：{n}，評分：{metric}')
# A higher score means better-separated clusters.
# NOTE: after the loop `km` refers to the last model fitted (n = 14).

### K近鄰演算法
- 簡稱: KNN

In [None]:
#參考資料

# https://ithelp.ithome.com.tw/articles/10186473
# http://yann.lecun.com/exdb/mnist/
# https://stackoverflow.com/questions/62210186/from-tensorflow-examples-tutorials-mnist-not-working-in-google-colab
# https://www.codegrepper.com/code-examples/whatever/No+module+named+%27tensorflow.examples.tutorials%27+in+google+colab
# https://www.codegrepper.com/code-examples/shell/ModuleNotFoundError%3A+No+module+named+%27tensorflow.examples%27
# https://github.com/tensorflow/tensorflow
# https://github.com/tensorflow/tensorflow/issues/32790
# https://blog.csdn.net/weixin_41663570/article/details/102512468
# https://sweetornotspicymarathon.medium.com/tesorflow-keras-%E5%AD%B8%E7%BF%92%E7%AD%86%E8%A8%98-%E6%96%B0%E6%89%8B%E4%B8%80%E5%AE%9A%E8%A6%81%E7%8E%A9%E7%9A%84mnist%E6%89%8B%E5%AF%AB%E6%95%B8%E5%AD%97%E8%BE%A8%E8%AD%98-9327366cc838

# https://steam.oxxostudio.tw/category/python/example/image-conversion.html
# https://steam.oxxostudio.tw/category/python/example/matplotlib-save-image.html
# https://www.delftstack.com/zh-tw/howto/matplotlib/how-to-change-the-figure-size-in-matplotlib/

In [None]:
# !pip install --ignore-installed --upgrade tensorflow

# !pip install mnist
# import mnist
# train_images = mnist.train_images() 
# train_labels = mnist.train_labels()
# test_images = mnist.test_images()
# test_labels = mnist.test_labels()

# !git clone https://github.com/tensorflow/tensorflow.git

### mnist500.zip模擬創作

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Read every bitmap under mnist500/<digit>/<digit>_<n>.bmp for digits 0-9
# and n = 1..500, in digit-major order.
data = [
    plt.imread(f'mnist500/{digit}/{idx}.bmp'.replace(f'/{idx}.bmp', f'/{digit}_{idx}.bmp'))
    if False else plt.imread(f'mnist500/{digit}/{digit}_{idx}.bmp')
    for digit in range(10)
    for idx in range(1, 501)
]

# len(data)

# Stack the images into a single array with one entry per image.
x = np.array(data)
x

In [None]:
# Preview one sample. With the loading order used for `x` (500 files per
# digit folder, folders 0-9 in sequence), index 1005 lies in the digit-2 group.
plt.imshow(x[1005], cmap='gray') # original note: the images are already grayscale

In [None]:
# Target labels: 500 consecutive copies of each digit 0-9, in ascending
# order, matching the digit-major order in which the images were loaded.
y = [digit for digit in range(10) for _ in range(500)]
print(y)

In [None]:
# Convert the Python list of labels into a NumPy array before it is passed
# to train_test_split.
y = np.array(y)

In [None]:
# Hold out 20% of the samples for testing. random_state makes the split
# (and therefore the accuracy reported below) repeatable; the value 12 is
# the one used by the complete KNN script later in this notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=12)
len(x_test)

#### 實作K近鄰演算法
- 語法

    ```python
    from sklearn.neighbors import KNeighborsClassifier
    近鄰變數 = KNeighborsClassifier(n_neighbors=數值, algorithm=演算法, weights=權重計算方式)
    近鄰變數.fit(訓練資料, 訓練目標值)
    預測變數 = 近鄰變數.predict(預測資料)
    準確率變數 = 近鄰變數.score(預測資料, 預測目標值)
    ```
    
    

In [56]:
# Flatten each image into a single feature row. The row count is taken from
# the arrays themselves instead of the hard-coded 4000 / 1000, so the cell
# keeps working if the dataset size or test_size changes.
x_train_flat = x_train.reshape(len(x_train), -1)
x_test_flat = x_test.reshape(len(x_test), -1)

# Fit a 5-nearest-neighbour classifier on the training rows.
knn = KNeighborsClassifier(n_neighbors= 5)
knn.fit(x_train_flat, y_train)

# Predicted labels for the held-out images
pred = knn.predict(x_test_flat)

# Compare a slice of the predictions with the true labels
print(pred[100:130])
print(y_test[100:130])

# Accuracy on the held-out images
score = knn.score(x_test_flat, y_test)
score

[2 5 7 4 6 5 7 6 2 1 5 1 8 5 5 6 4 5 5 5 5 0 1 3 8 3 5 2 9 1]
[2 5 7 4 3 5 7 6 2 1 5 1 8 8 5 6 4 5 5 5 5 0 1 3 8 3 5 7 9 1]


0.929

In [57]:
# Save the model
# Writes the fitted `knn` classifier to 'mnist500.pkl' in the current
# working directory.
import joblib
joblib.dump(knn, 'mnist500.pkl')

['mnist500.pkl']

In [58]:
# Load the model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source (here, the
# file written by the save cell).
import joblib
knn2 = joblib.load('mnist500.pkl')

In [59]:
# Sanity-check the reloaded model on a single image.
# This file follows the same path pattern as the images loaded for
# training/testing, so it may have been part of the training split; treat
# the result as a smoke test rather than an independent evaluation.
img = plt.imread('mnist500/3/3_1.bmp')
sample_row = img.reshape(1, -1)  # one row containing all pixels of the image
pred2 = knn2.predict(sample_row)
pred2

array([3])

In [None]:
# Complete KNN script: load images, split, train, score, and save the model.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import joblib

# Dataset layout: mnist500/<digit>/<digit>_<n>.bmp with n = 1..IMAGES_PER_DIGIT
N_DIGITS = 10
IMAGES_PER_DIGIT = 500

# Load the images in digit-major order.
data = [
    plt.imread(f'mnist500/{i}/{i}_{j}.bmp')
    for i in range(N_DIGITS)
    for j in range(1, IMAGES_PER_DIGIT + 1)
]
x = np.array(data)

# Labels in the same digit-major order: IMAGES_PER_DIGIT zeros, then ones, ...
y = np.repeat(np.arange(N_DIGITS), IMAGES_PER_DIGIT)

# Fixed random_state keeps the split, and therefore the score, repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 12)

# Flatten each image into one feature row. The row count comes from the
# arrays themselves rather than the hard-coded 4000 / 1000, so changing the
# dataset size or test_size does not break the reshape.
knn = KNeighborsClassifier(n_neighbors= 5)
knn.fit(x_train.reshape(len(x_train), -1), y_train)
score = knn.score(x_test.reshape(len(x_test), -1), y_test)
print(score)

# Persist the fitted classifier to the current working directory.
joblib.dump(knn, 'mnist500.pkl')