## [作業重點]
使用 Sklearn 中的線性迴歸模型，來訓練各種資料集，務必了解送進去模型訓練的**資料型態**為何，也請了解模型中各項參數的意義

## 作業
試著使用 sklearn datasets 的其他資料集 (wine, boston, ...)，來訓練自己的線性迴歸模型。

#### HINT: 注意 label 的型態，確定資料集的目標是分類還是迴歸，再使用正確的模型訓練！

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter('ignore')

# Datasets
from sklearn import datasets

# Preprocessing
from sklearn.model_selection import train_test_split

# Model
from sklearn.linear_model import LinearRegression, LogisticRegression

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

### Logistic regression

In [30]:
# Load the wine dataset (a classification problem); `wine` is a dict-like object.
wine = datasets.load_wine()
print(f"Keys in wine: {list(wine.keys())}")

# Wrap the features in a DataFrame so they are easier to inspect.
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
display(wine_df.head())

# Use every feature in the dataset; X has to be a 2-D matrix.
X, y = wine_df, wine.target
print("X shape: ", X.shape)
print("y shape: ", y.shape)

# Split into training and test sets (10% held out, fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# Build the classifier and train it in one step (`fit` returns the estimator).
log_reg = LogisticRegression().fit(x_train, y_train)

# Predict the class labels of the held-out samples.
y_pred = log_reg.predict(x_test)
print('y_pred: ', y_pred)

# Classification quality is measured with accuracy.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")

Keys in wine: ['data', 'target', 'target_names', 'DESCR', 'feature_names']


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


X shape:  (178, 13)
y shape:  (178,)
y_pred:  [0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0]
Accuracy: 1.00


In [31]:
# Load the breast cancer dataset (a classification problem);
# `breast_cancer` is a dict-like object.
breast_cancer = datasets.load_breast_cancer()
# The label now names the dataset whose keys are listed (it used to say "wine").
print(f"Keys in breast_cancer: {list(breast_cancer.keys())}")

# Wrap the features in a DataFrame so they are easier to inspect.
breast_cancer_df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
display(breast_cancer_df.head())

# Use every feature in the dataset; X has to be a 2-D matrix.
X = breast_cancer_df
y = breast_cancer.target
print("X shape: ", X.shape)
print("y shape: ", y.shape)

# Split into training and test sets (10% held out, fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Build the model.
log_reg = LogisticRegression()

# Train the model on the training split.
log_reg.fit(x_train, y_train)

# Predict the class labels of the held-out samples.
y_pred = log_reg.predict(x_test)
print('y_pred: ', y_pred)

# Classification quality is measured with accuracy.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")

Keys in wine: ['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


X shape:  (569, 30)
y shape:  (569,)
y_pred:  [0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1]
Accuracy: 0.96


### Linear regression

In [32]:
# Load the Boston housing dataset (a regression problem); `boston` is a
# dict-like object.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it before re-running.
boston = datasets.load_boston()
print(f"Keys in boston: {list(boston.keys())}")

# Wrap the features in a DataFrame so they are easier to inspect.
boston_df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
display(boston_df.head())

# Use every feature in the dataset; X has to be a 2-D matrix.
X, y = boston_df, boston.target
print("X shape: ", X.shape)
print("y shape: ", y.shape)

# Split into training and test sets (10% held out, fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# Build the regressor and train it in one step (`fit` returns the estimator).
lin_reg = LinearRegression().fit(x_train, y_train)

# Predict the target values of the held-out samples.
y_pred = lin_reg.predict(x_test)
print('y_pred:\n', y_pred)

# Regression quality is measured with MSE and R squared.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse:.2f}")
print(f"R square: {r2:.2f}")

Keys in boston: ['data', 'target', 'feature_names', 'DESCR', 'filename']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


X shape:  (506, 13)
y shape:  (506,)
y_pred:
 [25.01207787 23.70673643 29.0945173  12.31397471 21.61347812 19.13354868
 20.81580439 21.37329011 18.38618961 19.34579424  5.25912036 16.65767507
 17.52569896  5.77456709 39.90010353 32.4334732  22.86945378 36.53576421
 30.95591345 23.0906515  24.91430476 24.08929781 20.54441681 30.23258421
 22.3642316   8.72252642 17.58062573 17.65060042 36.10230383 20.91252213
 18.77553493 18.18471313 19.85999794 23.90528147 28.93272041 19.23050276
 12.01526727 24.24058855 17.68050031 16.09113614 26.38479882 21.06267915
 22.32605647 15.61632473 22.9796011  25.12377027 20.21458841 22.45911017
  9.8519346  24.41614999 20.21336125]
Mean squared error: 41.72
R square: 0.51
