In [4]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series,DataFrame

# 視覺化函式庫
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習函式庫
import sklearn


import requests, zipfile
import io

# 顯示到小數點後第3位
%precision 3

'%.3f'

In [10]:
# Download the UCI "Adult" (census income) dataset.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
res = response.content

# Parse the downloaded bytes into a DataFrame; the raw file has no header row.
adult = pd.read_csv(io.StringIO(res.decode('utf-8')),header=None)

# Assign the column labels.
# Note: string values keep the space that follows each comma in the raw file
# (e.g. ' >50K'), which later comparisons have to account for.
adult.columns = ['age','workclass','fnlwgt','education','education-num','marital-status',
                 'occupation','relationship','race','sex','capital-gain',
                 'capital-loss','hours-per-week','native-country','flg-50k']

# Print the shape of the data and the total number of missing (NaN) values.
# NOTE(review): the dataset reportedly encodes unknown values as '?', which
# isnull() would not count -- confirm before relying on the zero below.
print('資料的形式：{}'.format(adult.shape))
print('遺漏的數量：{}'.format(adult.isnull().sum().sum()))

# Show the first 5 rows.
adult.head()

資料的形式：(32561, 15)
遺漏的數量：0


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### 8-3-2 資料的整理

In [12]:
# Count the number of rows in each class of the raw income label 'flg-50k'.
adult.groupby('flg-50k').size()

flg-50k
 <=50K    24720
 >50K      7841
dtype: int64

In [15]:
# Add the binary target column 'fin_flg': 1 when 'flg-50k' is '>50K', else 0.
# The raw values carry a leading space (' >50K'), so strip the surrounding
# whitespace before comparing rather than hard-coding the padded literal;
# an exact match would silently label every row 0 if the padding ever changed.
adult['fin_flg'] = (adult['flg-50k'].str.strip() == '>50K').astype(int)

# Confirm the class counts match those of the original label.
adult.groupby('fin_flg').size()

fin_flg
0    24720
1     7841
dtype: int64

#### 8-3-3 模型建構與評估

In [24]:
# Imports: the logistic regression estimator and the train/test splitting helper.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Explanatory variables are five numeric columns; the target is fin_flg.
X = adult.loc[:, ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = adult.loc[:, 'fin_flg']

# Hold out half of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Create the classifier with default settings and fit it on the training half.
model = LogisticRegression().fit(X_train, y_train)

# Report the score on the training half and on the held-out half.
print('準確度（train）:{:.3f}'.format(model.score(X_train, y_train)))
print('準確度（test）:{:.3f}'.format(model.score(X_test, y_test)))

準確度（train）:0.797
準確度（test）:0.798


In [17]:
# Show the fitted coefficients of the current logistic regression model
# (the output has one value per explanatory variable).
model.coef_

array([[-1.185e-02, -4.379e-06, -2.774e-03,  3.274e-04,  7.532e-04]])

In [19]:
# Exponentiate the coefficients: for logistic regression, exp(coef) is the
# odds ratio associated with a one-unit increase in that variable.
np.exp(model.coef_)

array([[0.988, 1.   , 0.997, 1.   , 1.001]])

#### 8-3-4 藉由縮放來提高預測準確度

In [26]:
# Standardize every explanatory variable to improve prediction accuracy.

# Imports: the splitting helper and the standardization transformer.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features and target.
X = adult.loc[:, ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = adult.loc[:, 'fin_flg']

# 50/50 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Fit the scaler on the training half only, then apply it to both halves
# so no information from the test half leaks into the scaling statistics.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Fit a default logistic regression on the standardized training features.
model = LogisticRegression().fit(X_train_std, y_train)

# Report the score on the training half and on the held-out half.
print('準確度（train）:{:.3f}'.format(model.score(X_train_std, y_train)))
print('準確度（test）:{:.3f}'.format(model.score(X_test_std, y_test)))

準確度（train）:0.811
準確度（test）:0.810


#### 練習問題8-2

In [27]:
# Imports: the bundled breast cancer dataset, the estimator and the splitter.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# Hold out half of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.5,
    random_state=0,
)

# Fit a default logistic regression on the raw (unscaled) features.
# Note: this cell's captured output includes a warning that the solver hit
# its iteration limit; the warning suggests raising max_iter or scaling the data.
model = LogisticRegression().fit(X_train, y_train)

# Report the score on the training half and on the held-out half.
print('準確度（train）:{:.3f}'.format(model.score(X_train, y_train)))
print('準確度（test）:{:.3f}'.format(model.score(X_test, y_test)))

準確度（train）:0.968
準確度（test）:0.954


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 練習問題8-3

In [28]:
# Standardize the features: fit the scaler on the training half only and
# apply the same transformation to both halves.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Fit a default logistic regression on the standardized training features.
model = LogisticRegression().fit(X_train_std, y_train)

# Report the score on the training half and on the held-out half.
print('準確度（train）:{:.3f}'.format(model.score(X_train_std, y_train)))
print('準確度（test）:{:.3f}'.format(model.score(X_test_std, y_test)))

準確度（train）:0.989
準確度（test）:0.975
