# 一、 資料預處理

### 資料預處理

In [None]:
# !gdown 'google drive ID' --output filename.zip
# !unzip filename.zip

In [None]:
import pandas as pd

# Load the car-model table from the CSV file.
df = pd.read_csv('汽車車型資料檔.csv')

# Keep every column except the first and the last one
# (the original note says this removes the non-numeric columns).
df1 = df.iloc[:, slice(1, -1)]
df1

In [None]:
# Z-score standardization
# (run `!pip install scikit-learn` first if the package is missing)
from sklearn.preprocessing import StandardScaler

# Learn the per-column statistics first, then apply them in a second step.
std = StandardScaler()
std.fit(df1)
arr = std.transform(df1)
arr

In [None]:
# Min-max normalization
from sklearn.preprocessing import MinMaxScaler

# fit() returns the scaler itself, so it can be created and fitted in one expression.
std = MinMaxScaler().fit(df1)
arr = std.transform(df1)
arr

### 非數值資料轉換

In [None]:
# 1. Mapping-dictionary method
import pandas as pd

df = pd.read_csv('客戶聯絡狀況資料檔.csv')  # load the customer-contact table
# df

df['婚姻'].unique()  # list the categories (only displayed when run as the last line of a cell)
dict1 = {'單身':0, '已婚':1, '未知':2, '離婚':3}
# Assign the replaced column back instead of calling replace(..., inplace=True)
# on df['婚姻']: that form mutates a temporary column object, which pandas warns
# about and which leaves df unchanged once Copy-on-Write is enabled.
df['婚姻'] = df['婚姻'].replace(dict1)
df

In [None]:
# 2. Label-encoding method
from sklearn.preprocessing import LabelEncoder

# Pattern: encoded_column = encoder.fit_transform(dataframe_column)
# Here the two steps are written out separately: learn the classes, then map them.
label1 = LabelEncoder()
label1.fit(df['工作'])
df['工作'] = label1.transform(df['工作'])
df

In [None]:
# Look up which original category each encoded integer stands for.
# Syntax: label_object.classes_
# labelname = label1.classes_
label1.classes_

In [None]:
# 3. One-hot encoding method
# df['訂購'].unique()

# Label-encode first, then one-hot encode
# (one-hot encoding also works without the label-encoding step).
from sklearn.preprocessing import LabelEncoder

label1 = LabelEncoder()
label1.fit(df['訂購'])
df['訂購'] = label1.transform(df['訂購'])
df

In [None]:
from sklearn.preprocessing import OneHotEncoder
# The encoder returns a SciPy sparse matrix by default (smaller in memory but
# harder to read). The keyword that requested a dense result was renamed from
# `sparse` to `sparse_output`, and the old name was later removed, so convert
# explicitly with .toarray(), which works on every scikit-learn version.
onehot = OneHotEncoder()
arr = onehot.fit_transform(df[['訂購']]).toarray()  # input must be 2-D, hence the double brackets
arr

# 二、 機器學習

> 01. 監督式學習: K近鄰、 單純貝氏分類、 決策樹

- 分類: 目標值不是連續值，例如能預測用戶是否能得到貸款

- 回歸: 目標值是連續值，例如能預測能得到多少貸款金額

> 02. 非監督式學習: K-means、DBSCAN

### K-means 演算法

In [None]:
import pandas as pd
# Load the customer table used for the K-means example.
df = pd.read_csv('customer.csv')
df

In [None]:
# Map the gender strings to integers so the column can be used as a numeric feature.
dict1 = {"男" : 1, "女" : 2}
# Assign the result back; replace(..., inplace=True) on df['性別'] works on a
# temporary column object and does not update df under pandas Copy-on-Write.
df['性別'] = df['性別'].replace(dict1)
df

In [None]:
# Cluster the customers and attach the cluster id to every row
from sklearn.cluster import KMeans

# Fit on the feature columns only. After the first run df already contains the
# 類別 column written below, and re-running the cell would otherwise feed the old
# labels back into the clustering.
cluster_features = df.drop(columns='類別', errors='ignore')

# Syntax: KMeans(n_clusters=<number of clusters>)
# random_state fixes the initial centroids so the cluster numbering is the same
# on every run; n_init is spelled out because its default differs between
# scikit-learn versions.
# NOTE(review): the per-cluster interpretations in the following cells depend on
# this numbering - re-check them after changing the seed.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
km.fit(cluster_features)

# Cluster id assigned to each row
km.labels_

# Store the cluster id as a new column
df['類別'] = km.labels_
df

In [None]:
# Inspect the members of each cluster: cluster 0
in_cluster_0 = df['類別'] == 0
df1 = df.loc[in_cluster_0]
df1.head(25)  # show only the first 25 matching rows, all columns
# Author's reading: high income / high spending index --> VIP

In [None]:
# Cluster 1: first 25 members
in_cluster_1 = df['類別'] == 1
df2 = df.loc[in_cluster_1]
df2.head(25)
# Author's reading: no clear income pattern / low spending index
# --> ordinary customers, or people unhappy with this shop's products

In [None]:
# Cluster 2: first 25 members
in_cluster_2 = df['類別'] == 2
df3 = df.loc[in_cluster_2]
df3.head(25)
# Author's reading: lower-to-middle income / high spending index
# --> customers who love this shop (high potential)

In [None]:
# Evaluate the clustering quality for different numbers of clusters
from sklearn.metrics import calinski_harabasz_score

# Score the original features only: by this point df also holds the 類別 column
# (the labels of the earlier 3-cluster run), and including it would leak those
# labels into both the fit and the score.
score_features = df.drop(columns='類別', errors='ignore')
for n in range(2,15):
    # Fixed seed and explicit n_init so the printed scores are reproducible.
    km = KMeans(n_clusters=n, n_init=10, random_state=0)  # split into n clusters
    km.fit(score_features)
    metric = calinski_harabasz_score(score_features, km.labels_)
    print(f'群組數量：{n}，評分：{metric}')
# A higher score means better-separated clusters

### K近鄰演算法
- 簡稱: KNN

In [None]:
#參考資料

# https://ithelp.ithome.com.tw/articles/10186473
# http://yann.lecun.com/exdb/mnist/
# https://stackoverflow.com/questions/62210186/from-tensorflow-examples-tutorials-mnist-not-working-in-google-colab
# https://www.codegrepper.com/code-examples/whatever/No+module+named+%27tensorflow.examples.tutorials%27+in+google+colab
# https://www.codegrepper.com/code-examples/shell/ModuleNotFoundError%3A+No+module+named+%27tensorflow.examples%27
# https://github.com/tensorflow/tensorflow
# https://github.com/tensorflow/tensorflow/issues/32790
# https://blog.csdn.net/weixin_41663570/article/details/102512468
# https://sweetornotspicymarathon.medium.com/tesorflow-keras-%E5%AD%B8%E7%BF%92%E7%AD%86%E8%A8%98-%E6%96%B0%E6%89%8B%E4%B8%80%E5%AE%9A%E8%A6%81%E7%8E%A9%E7%9A%84mnist%E6%89%8B%E5%AF%AB%E6%95%B8%E5%AD%97%E8%BE%A8%E8%AD%98-9327366cc838

# https://steam.oxxostudio.tw/category/python/example/image-conversion.html
# https://steam.oxxostudio.tw/category/python/example/matplotlib-save-image.html
# https://www.delftstack.com/zh-tw/howto/matplotlib/how-to-change-the-figure-size-in-matplotlib/

In [None]:
# !pip install --ignore-installed --upgrade tensorflow

# !pip install mnist
# import mnist
# train_images = mnist.train_images() 
# train_labels = mnist.train_labels()
# test_images = mnist.test_images()
# test_labels = mnist.test_labels()

# !git clone https://github.com/tensorflow/tensorflow.git

### mnist500.zip模擬創作

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Read the bitmaps digit by digit: files 1..500 from each of the folders 0..9,
# so the images end up grouped by digit in ascending order.
data = [
    plt.imread(f'mnist500/{digit}/{digit}_{sample}.bmp')
    for digit in range(10)
    for sample in range(1, 501)
]

# len(data)

# Stack the individual images into one array.
x = np.array(data)
x

In [None]:
# Display sample 1005 of the stacked image array.
plt.imshow(x[1005], cmap='gray') # original note: the bitmap is already grayscale

In [None]:
# Target labels: each digit 0-9 repeated 500 times, grouped in ascending order
# so that they line up with the digit-by-digit image loading order.
y = [digit for digit in range(10) for _ in range(500)]
print(y)

In [None]:
# Convert the label list into a NumPy array.
y = np.array(y)

In [None]:
# Hold out 20% of the samples for testing. No random_state is passed, so the
# split (and every score derived from it) differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
len(x_test)

#### 實作K近鄰演算法
- 語法

    ```python
    from sklearn.neighbors import KNeighborsClassifier
    近鄰變數 = KNeighborsClassifier(n_neighbors=數值, algorithm=演算法, weights=權重計算方式)
    近鄰變數.fit(訓練資料, 訓練目標值)
    預測變數 = 近鄰變數.predict(預測資料)
    準確率變數 = 近鄰變數.score(預測資料, 預測目標值)
    ```
    
    

In [None]:
# Train a 5-nearest-neighbour classifier and evaluate it on the held-out images.
knn = KNeighborsClassifier(n_neighbors= 5)

# The classifier needs one flat feature row per image. Take the row count from
# the arrays themselves instead of hard-coding 4000/1000, so the cell keeps
# working when the dataset size or test_size changes.
x_train_flat = x_train.reshape(len(x_train), -1)
x_test_flat = x_test.reshape(len(x_test), -1)

knn.fit(x_train_flat, y_train)
# Predicted labels for the test images
pred = knn.predict(x_test_flat)
pred

# Compare a slice of the predictions with the true labels
print(pred[100:130])
print(y_test[100:130])

# Accuracy on the test split
score = knn.score(x_test_flat, y_test)
score

In [None]:
# Save the trained model to disk
import joblib
joblib.dump(knn, 'mnist500.pkl')

In [None]:
# Load the saved model back into a separate variable.
# NOTE: joblib files are pickle-based; only load files you created or trust.
import joblib
knn2 = joblib.load('mnist500.pkl')

In [None]:
# Check the reloaded model on one image taken from the digit-3 folder
img = plt.imread('mnist500/3/3_1.bmp')
# reshape(1, -1) turns the image into a single flat row, the 2-D shape predict() expects
pred2 = knn2.predict(img.reshape(1, -1))
pred2

In [None]:
# Complete KNN pipeline: load images, split, train, score and save the model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import joblib

# Images are read digit by digit (folders 0..9, files 1..500 in each).
data = []
for i in range(10):
    for j in range(1,501):
        data.append(plt.imread(f'mnist500/{i}/{i}_{j}.bmp'))
x = np.array(data)

# Labels: 500 of each digit, sorted so they match the loading order above.
y = [0,1,2,3,4,5,6,7,8,9] * 500
y = np.array(y)
y.sort()

# Fixed seed so the split and the printed score are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 12)

knn = KNeighborsClassifier(n_neighbors= 5)
# Flatten each image into one feature row; the row count comes from the arrays
# rather than hard-coded 4000/1000, so other dataset sizes or split ratios work.
knn.fit(x_train.reshape(len(x_train), -1), y_train)
score = knn.score(x_test.reshape(len(x_test), -1), y_test)
print(score)
joblib.dump(knn, 'mnist500.pkl')

### 交叉驗證與網格搜索
#### 實作機器學習參數調校

- 語法:

    ```python
    from sklearn.model_selection import GridSearchCV
    參數變數 = {'n_neighbors':[2,3,6,10], 'weights':['uniform','distance']} # {參數1:值1, 參數2:值2, ....}
    網格變數 = GridSearchCV(knn, param_grid=參數變數, cv=5)    # (演算法物件變數, param_grid=參數變數, cv=數值)
    網格變數.fit(x_train.reshape(4000, -1), y_train)    # (訓練特徵值, 訓練目標值)
    ```

- 結果:

    + best_score_ : 交叉驗證中最佳準確率
    + best_params_ : 最佳參數組合
    + cv_results_ : 每次交叉驗證準確率

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate settings: every n_neighbors value is tried with every weighting scheme.
param = {'n_neighbors':[2,3,6,10], 'weights':['uniform','distance']}
# cv=5: each candidate is scored with 5-fold cross-validation on the training data.
gc = GridSearchCV(knn, param_grid=param, cv=5)
# Flatten each image into one row; use len(x_train) instead of a hard-coded 4000
# so the search still runs when the training-set size changes.
gc.fit(x_train.reshape(len(x_train), -1), y_train)

In [None]:
# Parameter combination that achieved the best cross-validation score
gc.best_params_

In [None]:
# Mean cross-validation score of the best parameter combination
gc.best_score_

### 單純貝氏演算法

##### 文字處理上，較好的演算法

#### 英文文句的特徵處理
- 語法:
    ```python
    from sklearn.feature_extraction.text import CountVectorizer
    文句變數 = CountVectorizer()
    數值變數 = 文句變數.fit_transform(文句串列)
    ```

- 結果:
    + 數值變數.toarray() : 各文句中每個單詞的出現次數
    + 文句變數.get_feature_names_out() : 特徵單詞（詞彙表）

In [None]:
# Feature extraction for English sentences (word counts)
from sklearn.feature_extraction.text import CountVectorizer

sentences = ['code is easy, i like python code','code is too hard, i dislike python code']

# Build the vocabulary from the sentences, then count each word per sentence.
cv = CountVectorizer()
cv.fit(sentences)
data = cv.transform(sentences)
data

In [None]:
# Dense view of the sparse result: one row per sentence, one column per vocabulary word
data.toarray()

In [None]:
# Vocabulary words, in the same order as the columns of the count matrix
cv.get_feature_names_out()

### tf-idf 文句處理
- tf : 單詞頻率，表示單詞在一個文句中出現的次數
- idf : 逆文件頻率，由「文句總數 ÷ 含有該單詞的文句數」取對數而得；單詞出現在越多文句中，idf 越低

- 語法:
    ```Python
    from sklearn.feature_extraction.text import TfidfVectorizer
    文句變數 = TfidfVectorizer()
    數值變數 = 文句變數.fit_transform(文句串列)
    ```

- 結果:
    + 數值變數.toarray() : 各文句中每個單詞的 tf-idf 權重
    + 文句變數.get_feature_names_out() : 特徵單詞（詞彙表）

In [None]:
# tf-idf feature extraction for English sentences
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = ['code is easy, i like python code','code is too hard, i dislike python code']

# Learn the vocabulary and idf weights, then weight each sentence.
tf = TfidfVectorizer()
tf.fit(sentences)
data = tf.transform(sentences)
data.toarray()

In [None]:
# Classify English newsgroup posts: load the data
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# subset='all' fetches the training and test portions together.
news = fetch_20newsgroups(subset='all')
# Preview only the first two posts: displaying news.data in full would dump
# every raw post into the cell output.
news.data[:2]

In [81]:
# Integer class id of every post
news.target

array([10,  3, 17, ...,  3,  1,  7])

In [82]:
# Newsgroup name for each class id (list position = class id)
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [83]:
# Total number of posts
len(news.target)

18846

In [84]:
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the posts for testing.
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size = 0.2)

# Learn the vocabulary and idf weights from the training posts only, then apply
# the same fitted vectorizer to both splits so their columns line up.
tf = TfidfVectorizer().fit(x_train)
x_train = tf.transform(x_train)
x_test = tf.transform(x_test)

# Multinomial naive Bayes with alpha=1.0 smoothing; fit() returns the model itself.
mlt = MultinomialNB(alpha=1.0).fit(x_train, y_train)

score = mlt.score(x_test, y_test)
score

0.8604774535809019

In [85]:
# Complete code: classify English newsgroup posts
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size = 0.2)

# Fit the vectorizer on the training posts only and reuse it for the test posts.
# The training set may yield, say, 18000 vocabulary terms while the test set on
# its own has only 8000; calling fit_transform on x_test would therefore produce
# a feature count that does not match what the model was trained on.
tf = TfidfVectorizer()
tf.fit(x_train)
x_train, x_test = tf.transform(x_train), tf.transform(x_test)

mlt = MultinomialNB(alpha=1.0).fit(x_train, y_train)
score = mlt.score(x_test, y_test)
score

0.8503978779840848

#### 例句：今天台北天氣晴朗，風景區擠滿了人潮。、台北的天氣常常下雨。

In [87]:
# 中文文句特徵處理
import jieba

In [88]:
# Segment the example sentence into words, then join them with spaces so the
# result can be fed to the whitespace-based vectorizers.
t1 = [word for word in jieba.cut('今天台北天氣晴朗，風景區擠滿了人潮。、台北的天氣常常下雨。')]
c1 = ' '.join(t1)
c1

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Egg\AppData\Local\Temp\jieba.cache
Loading model cost 0.805 seconds.
Prefix dict has been built successfully.


'今天 台北 天氣 晴朗 ， 風景區 擠 滿 了 人潮 。 、 台北 的 天氣 常常 下雨 。'

In [89]:
# 判斷中文新聞類別
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba

In [90]:
# Read the headline dataset: one record per line, fields separated by '_!_'.
data = []
target = []
# The with-block closes the file even if a line fails to parse; the previous
# open()/close() pair leaked the handle on any exception inside the loop.
with open('toutiao_cat_data.txt', encoding = 'utf-8') as f:
    for line in f:
        linelist = line.split('_!_')
        # Field 1 is used as the class label, field 3 as the text to classify.
        target.append(linelist[1])
        # Segment the text with jieba and store it as space-separated words.
        t = list(jieba.cut(linelist[3]))
        data.append(' '.join(t))

In [91]:
# Number of records read from the file
len(target)

382688

In [92]:
# Hold out 20% of the segmented headlines for testing.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

# Fit the tf-idf vectorizer on the training texts only, then transform both splits.
tf = TfidfVectorizer().fit(x_train)
x_train, x_test = tf.transform(x_train), tf.transform(x_test)

# Multinomial naive Bayes with alpha=1.0 smoothing.
mlt = MultinomialNB(alpha=1.0).fit(x_train, y_train)
score = mlt.score(x_test, y_test)
score

0.8302673181948836

### 線性迴歸演算法

In [93]:
import pandas as pd
# Load the house-price table used for the linear-regression example.
hp = pd.read_csv('housePrice.csv')
hp

Unnamed: 0,犯罪率,豪宅比,公設比,臨公園,NO濃度,房間數,屋齡,賣場距離,捷運距離,繳稅率,師生比,低收入比,房價
0,2.64981,0.0,28.18,0,0.589,5.867,7.4,12.5002,5,102,21.0,23.038918,23.1
1,0.14511,0.0,6.95,0,0.213,4.283,81.3,14.6909,23,158,22.8,39.632374,15.8
2,3.93046,6.0,46.29,1,0.278,4.287,16.4,0.7680,21,637,7.0,37.513949,22.1
3,2.95667,0.0,15.28,0,0.355,1.154,76.4,9.1349,8,337,18.1,12.331483,16.6
4,1.24919,3.0,39.17,0,0.206,6.788,10.0,10.5811,17,222,17.9,12.984651,26.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.64935,0.0,13.13,0,0.420,4.671,29.6,2.7699,22,831,17.1,16.196702,23.0
496,3.81123,70.0,29.51,0,0.398,5.472,78.3,6.1768,9,928,19.5,8.527868,23.0
497,0.72696,0.0,56.88,0,0.628,3.126,91.4,13.3371,13,201,21.6,22.979185,18.1
498,9.52366,0.0,23.17,0,0.892,2.356,79.4,8.5226,23,907,18.9,6.263589,13.0


In [94]:
# Features: the first 12 columns of the table
x = hp.iloc[:, :12]
x

Unnamed: 0,犯罪率,豪宅比,公設比,臨公園,NO濃度,房間數,屋齡,賣場距離,捷運距離,繳稅率,師生比,低收入比
0,2.64981,0.0,28.18,0,0.589,5.867,7.4,12.5002,5,102,21.0,23.038918
1,0.14511,0.0,6.95,0,0.213,4.283,81.3,14.6909,23,158,22.8,39.632374
2,3.93046,6.0,46.29,1,0.278,4.287,16.4,0.7680,21,637,7.0,37.513949
3,2.95667,0.0,15.28,0,0.355,1.154,76.4,9.1349,8,337,18.1,12.331483
4,1.24919,3.0,39.17,0,0.206,6.788,10.0,10.5811,17,222,17.9,12.984651
...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.64935,0.0,13.13,0,0.420,4.671,29.6,2.7699,22,831,17.1,16.196702
496,3.81123,70.0,29.51,0,0.398,5.472,78.3,6.1768,9,928,19.5,8.527868
497,0.72696,0.0,56.88,0,0.628,3.126,91.4,13.3371,13,201,21.6,22.979185
498,9.52366,0.0,23.17,0,0.892,2.356,79.4,8.5226,23,907,18.9,6.263589


In [95]:
# Target: the column at position 12 (房價 in the displayed output)
y = hp.iloc[:, 12]
y

0      23.1
1      15.8
2      22.1
3      16.6
4      26.3
       ... 
495    23.0
496    23.0
497    18.1
498    13.0
499    19.3
Name: 房價, Length: 500, dtype: float64

### 實作梯度下降法線性回歸
- 語法:
    ```python
    from sklearn.linear_model import SGDRegressor
    梯度變數 = SGDRegressor()
    梯度變數.fit(訓練資料, 訓練目標值)
    預測變數 = 梯度變數.predict(預測資料)
    ```

- 結果:
    + 梯度變數.coef_ : 權重值
    + 梯度變數.intercept_ : 偏置值

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler    # standardization module

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Features: learn mean/std from the training rows, apply them to both splits.
std_x = StandardScaler().fit(x_train)
x_train = std_x.transform(x_train)
x_test = std_x.transform(x_test)

# Target: the scaler needs 2-D input, so each Series becomes a single column.
y_train_column = y_train.to_numpy().reshape(len(y_train), 1)
y_test_column = y_test.to_numpy().reshape(len(y_test), 1)
std_y = StandardScaler().fit(y_train_column)
y_train = std_y.transform(y_train_column)
y_test = std_y.transform(y_test_column)

In [97]:
# Train the gradient-descent regressor and compare predictions with true prices.
sgd = SGDRegressor()
# y_train is an (n, 1) column after scaling; the regressor wants a 1-D target,
# and passing the column triggers a DataConversionWarning, so flatten it.
sgd.fit(x_train, y_train.ravel())

# inverse_transform --> converts standardized values back to the original scale
y_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_real = std_y.inverse_transform(y_test)
# Both arrays are (n, 1) columns: pick column 0 to get plain scalars instead of
# calling float() on 1-element arrays (deprecated in recent NumPy). Slicing with
# [:10] also copes with test sets that have fewer than 10 rows.
for predicted, actual in zip(y_predict[:10, 0], y_real[:10, 0]):
    print(f'預測值：{predicted:.8f}，真實值：{actual:.1f}')

預測值：34.47213107，真實值：37.5
預測值：17.08025019，真實值：12.2
預測值：22.29347304，真實值：23.6
預測值：31.91278601，真實值：27.0
預測值：17.02030454，真實值：19.2
預測值：32.26635198，真實值：27.2
預測值：21.72660778，真實值：22.6
預測值：26.66668221，真實值：22.1
預測值：19.68594564，真實值：16.3
預測值：22.67003414，真實值：22.7


  y = column_or_1d(y, warn=True)


In [98]:
# Model formula: y = Wx + b
sgd.coef_ # W: one weight per feature

array([-0.22623164,  0.26626333, -0.02605722,  0.12029717, -0.22494569,
        0.25389394, -0.27185267, -0.23951715, -0.23154061, -0.0385527 ,
       -0.01572305, -0.03051434])

In [99]:
sgd.intercept_ # b: the bias (intercept) term

array([-0.00183578])

### 邏輯迴歸演算法

In [100]:
import pandas as pd
# Load the breast-cancer table used for the logistic-regression example.
df = pd.read_csv('breastCancer.csv')
df

Unnamed: 0,團塊厚度,細胞大小均勻性,細胞大小均勻性.1,邊緣粘附,上皮細胞大小,裸核,淡染色質,正常核仁,有絲分裂,種類
0,5,1,1,1,2,1,3,1,1,1
1,5,4,4,5,7,10,3,2,1,1
2,3,1,1,1,2,2,3,1,1,1
3,6,8,8,1,3,4,3,7,1,1
4,4,1,1,3,2,1,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...
675,1,1,1,1,2,1,2,1,1,1
676,4,1,4,1,2,1,1,1,1,1
677,1,1,2,1,2,1,2,1,1,1
678,5,1,1,1,2,1,1,1,1,1


In [101]:
# Features: the first 9 columns of the table
x = df.iloc[: , :9]
x

Unnamed: 0,團塊厚度,細胞大小均勻性,細胞大小均勻性.1,邊緣粘附,上皮細胞大小,裸核,淡染色質,正常核仁,有絲分裂
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
675,1,1,1,1,2,1,2,1,1
676,4,1,4,1,2,1,1,1,1
677,1,1,2,1,2,1,2,1,1
678,5,1,1,1,2,1,1,1,1


In [102]:
# Target: the column at position 9 (種類 in the displayed output)
y = df.iloc[: , 9]
y

0      1
1      1
2      1
3      1
4      1
      ..
675    1
676    1
677    1
678    1
679    1
Name: 種類, Length: 680, dtype: int64

### 實作邏輯迴歸演算法
- 語法:
    ```python
    from sklearn.linear_model import LogisticRegression
    迴歸變數 = LogisticRegression()
    迴歸變數.fit(訓練資料, 訓練目標值)
    預測變數 = 迴歸變數.predict(預測資料)
    準確率變數 = 迴歸變數.score(預測資料, 預測目標值)
    ```

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Standardize the features using statistics learned from the training rows only.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# Fit the classifier (fit() returns the model itself) and report test accuracy.
logis = LogisticRegression().fit(x_train, y_train)
score = logis.score(x_test, y_test)
score

0.9485294117647058

### 實作邏輯迴歸召回率調整

- 語法:
    ```python
    from sklearn.metrics import classification_report
    classification_report(目標值, 預測值, labels=類別值, target_names=類別顯示值)
    ```

In [105]:
# Predicted class label for every test row
pred = logis.predict(x_test)
pred

array([1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2,
       2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 2], dtype=int64)

In [106]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split;
# label 1 is reported as 良性 and label 2 as 惡性.
ret = classification_report(y_test, pred, labels=(1, 2),target_names=('良性', '惡性'))
print(ret)

              precision    recall  f1-score   support

          良性       0.95      0.98      0.96        88
          惡性       0.96      0.90      0.92        48

    accuracy                           0.95       136
   macro avg       0.95      0.94      0.94       136
weighted avg       0.95      0.95      0.95       136



In [117]:
# Raise recall by tuning the decision threshold: start from the class probabilities.
# Column 1 of predict_proba is the second class, i.e. label 2 (惡性) for this data.
pred_p = logis.predict_proba(x_test)[:, 1]
# pred_p[:, 1]    # show only the probability of the malignant class
pred_p

array([0.02193893, 0.05868292, 0.00253531, 0.9006384 , 0.00581342,
       0.03654369, 0.9399151 , 0.00373922, 0.07027727, 0.0033678 ,
       0.00226754, 0.00152201, 0.02228197, 0.63219868, 0.41555775,
       0.58929497, 0.01038869, 0.98239694, 0.00431041, 0.00257107,
       0.01907651, 0.0033678 , 0.99914211, 0.0024955 , 0.99643862,
       0.99946044, 0.00743541, 0.99956821, 0.00962127, 0.99979902,
       0.00263079, 0.02753097, 0.99971946, 0.99957842, 0.97296414,
       0.0078364 , 0.01348309, 0.99624726, 0.95088902, 0.10343093,
       0.03736482, 0.99999953, 0.05124798, 0.96084054, 0.00329124,
       0.003391  , 0.99195852, 0.00370596, 0.08993312, 0.85117809,
       0.99933817, 0.01509605, 0.02115757, 0.9900985 , 0.99428153,
       0.00220999, 0.98905847, 0.99887606, 0.00177088, 0.99960389,
       0.00152201, 0.2996219 , 0.00152201, 0.99488364, 0.00581342,
       0.00226754, 0.00194901, 0.99663376, 0.99991897, 0.79819333,
       0.99929678, 0.99991139, 0.02875734, 0.02960458, 0.01189

In [124]:
# Lower the decision threshold: predict label 2 whenever its probability exceeds 0.15.
pred = np.where(pred_p > 0.15, 2, 1) # np.where(condition, value if true, value if false)
ret = classification_report(y_test, pred, labels=(1, 2),target_names=('良性', '惡性'))
print(ret)

              precision    recall  f1-score   support

          良性       0.99      0.94      0.97        88
          惡性       0.90      0.98      0.94        48

    accuracy                           0.96       136
   macro avg       0.95      0.96      0.95       136
weighted avg       0.96      0.96      0.96       136



In [110]:
# Complete code: logistic regression with a lowered decision threshold to raise recall
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Probability above which a sample is reported as label 2 (惡性).
# Lower values catch more malignant cases at the cost of more false alarms.
MALIGNANT_THRESHOLD = 0.2

df = pd.read_csv('breastCancer.csv')
x = df.iloc[: , :9]   # first 9 columns are the features
y = df.iloc[: , 9]    # column at position 9 is the class label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Standardize using statistics learned from the training rows only.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

logis = LogisticRegression()
logis.fit(x_train, y_train)
score = logis.score(x_test, y_test)  # accuracy at the default threshold

# Column 1 of predict_proba is the second class, i.e. label 2 for this data.
pred_p = logis.predict_proba(x_test)[:, 1]

# The default-threshold predict() call was dropped: its result was overwritten
# immediately by the thresholded prediction below and never used.
pred = np.where(pred_p > MALIGNANT_THRESHOLD, 2, 1)
ret = classification_report(y_test, pred, labels=(1, 2),target_names=('良性', '惡性'))
print(ret)

              precision    recall  f1-score   support

          良性       0.99      0.98      0.98        89
          惡性       0.96      0.98      0.97        47

    accuracy                           0.98       136
   macro avg       0.97      0.98      0.98       136
weighted avg       0.98      0.98      0.98       136

