In [16]:
# 取得中文字型for juypter Notebook
import wget
f_url = 'https://github.com/flyingpath/electron-hand-dicom/raw/master/TaipeiSansTCBeta-Regular.ttf'
wget.download(f_url)

# 共通事前處理
# 隱藏不必要的警告
import warnings
warnings.filterwarnings('ignore')

# 匯入必要的函式庫
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm

# 將字型新增到 matplotlib
fm.fontManager.addfont('./TaipeiSansTCBeta-Regular.ttf')

# 用來顯示資料框的函式
from IPython.display import display

# 調整顯示選項
# NumPy 的浮點數表示精度
np.set_printoptions(suppress=True, precision=4)
# pandas 中的浮點數表示精度
pd.options.display.float_format = '{:.4f}'.format
# 顯示資料框中的所有項目
pd.set_option("display.max_columns",None)
# 指定圖形的預設字體大小
plt.rcParams["font.size"] = 14
# 指定圖形的預設字型
plt.rcParams['font.family'] = 'Taipei Sans TC Beta'
# 隨機種子
random_seed = 123

In [17]:
# 用於顯示混淆矩陣之函式
def make_cm(matrix, columns):  #matrix是矩陣的意思
    # matrix numpy 陣列
    
    # columns 項目名稱列表
    n = len(columns)
    
    # 將 '標準答案' 重複 n 次以生成列表
    act = ['標準答案'] * n
    pred = ['預測結果'] * n
    
    # 生成資料框
    cm = pd.DataFrame(matrix, 
        columns=[pred, columns], index=[act, columns])
    return cm

In [18]:
#1.載入資料
df_all = pd.read_csv('bank/bank-full.csv', sep=';')

#將項目名稱改成中文，確認資料框內容使用display(df_all.head())
columns = [
    '年齡', '職業', '婚姻', '學歷', '違約', '平均餘額', '房屋貸款', '個人信貸', '聯絡方式', '最近一次通話日期', '最近一次通話月份', '最近一次通話秒數', 
    '通話次數_促銷期間', '上次促銷後_經過天數', '通話次數_促銷之前', '上次促銷結果', '本次促銷結果'
]
df_all.columns = columns

#確認資料
display(df_all.head())


Unnamed: 0,年齡,職業,婚姻,學歷,違約,平均餘額,房屋貸款,個人信貸,聯絡方式,最近一次通話日期,最近一次通話月份,最近一次通話秒數,通話次數_促銷期間,上次促銷後_經過天數,通話次數_促銷之前,上次促銷結果,本次促銷結果
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [19]:
#2.確認資料
df_all.info()

#本次促銷結果的成功分布
print(df_all['本次促銷結果'].value_counts())

#成功率
rate = df_all['本次促銷結果'].value_counts()['yes'] / len(df_all)
print(f'本次促銷結果成功率: {rate:.4f}')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   年齡          45211 non-null  int64 
 1   職業          45211 non-null  object
 2   婚姻          45211 non-null  object
 3   學歷          45211 non-null  object
 4   違約          45211 non-null  object
 5   平均餘額        45211 non-null  int64 
 6   房屋貸款        45211 non-null  object
 7   個人信貸        45211 non-null  object
 8   聯絡方式        45211 non-null  object
 9   最近一次通話日期    45211 non-null  int64 
 10  最近一次通話月份    45211 non-null  object
 11  最近一次通話秒數    45211 non-null  int64 
 12  通話次數_促銷期間   45211 non-null  int64 
 13  上次促銷後_經過天數  45211 non-null  int64 
 14  通話次數_促銷之前   45211 non-null  int64 
 15  上次促銷結果      45211 non-null  object
 16  本次促銷結果      45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
本次促銷結果
no     39922
yes     5289
Name: count, dtype: int64
本次促銷結果成功率: 0.1170


In [20]:
#3.預處理資料
#確認遺失值
print(df_all.isnull().sum())  #也可以用display()顯示耶

#確認需要被One-Hot的cloumns有哪些
#'職業','婚姻','學歷','聯絡方式','上次促銷結果'
#設置One-Hot函式
def enc(df, column:str):
    df_dummy = pd.get_dummies(df[column], prefix=column, dtype=int)  #要加上dtype才會顯示0、1
    df = pd.concat([df.drop([column],axis=1), df_dummy],axis=1)
    return df

df_all2 = df_all.copy()
df_all2 = enc(df_all2, '職業')
df_all2 = enc(df_all2, '婚姻')
df_all2 = enc(df_all2, '學歷')
df_all2 = enc(df_all2, '聯絡方式')
df_all2 = enc(df_all2, '上次促銷結果')

display(df_all2.head())

年齡            0
職業            0
婚姻            0
學歷            0
違約            0
平均餘額          0
房屋貸款          0
個人信貸          0
聯絡方式          0
最近一次通話日期      0
最近一次通話月份      0
最近一次通話秒數      0
通話次數_促銷期間     0
上次促銷後_經過天數    0
通話次數_促銷之前     0
上次促銷結果        0
本次促銷結果        0
dtype: int64


Unnamed: 0,年齡,違約,平均餘額,房屋貸款,個人信貸,最近一次通話日期,最近一次通話月份,最近一次通話秒數,通話次數_促銷期間,上次促銷後_經過天數,通話次數_促銷之前,本次促銷結果,職業_admin.,職業_blue-collar,職業_entrepreneur,職業_housemaid,職業_management,職業_retired,職業_self-employed,職業_services,職業_student,職業_technician,職業_unemployed,職業_unknown,婚姻_divorced,婚姻_married,婚姻_single,學歷_primary,學歷_secondary,學歷_tertiary,學歷_unknown,聯絡方式_cellular,聯絡方式_telephone,聯絡方式_unknown,上次促銷結果_failure,上次促銷結果_other,上次促銷結果_success,上次促銷結果_unknown
0,58,no,2143,yes,no,5,may,261,1,-1,0,no,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
1,44,no,29,yes,no,5,may,151,1,-1,0,no,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
2,33,no,2,yes,yes,5,may,76,1,-1,0,no,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
3,47,no,1506,yes,no,5,may,92,1,-1,0,no,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1
4,33,no,1,no,no,5,may,198,1,-1,0,no,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1


In [21]:
#以0、1替換yes、no(映射/編碼)
def enc_bin(df, column):
    df[column] = df[column].map(dict(yes=1, no=0))  #series.map()把舊值換成新值,通常是用dict字典
    return df

#需要更換的值'違約','房屋貸款','個人信貸','本次促銷結果'
df_all2 = enc_bin(df_all2, '違約')
df_all2 = enc_bin(df_all2, '房屋貸款')
df_all2 = enc_bin(df_all2, '個人信貸')
df_all2 = enc_bin(df_all2, '本次促銷結果')

display(df_all2.head())

Unnamed: 0,年齡,違約,平均餘額,房屋貸款,個人信貸,最近一次通話日期,最近一次通話月份,最近一次通話秒數,通話次數_促銷期間,上次促銷後_經過天數,通話次數_促銷之前,本次促銷結果,職業_admin.,職業_blue-collar,職業_entrepreneur,職業_housemaid,職業_management,職業_retired,職業_self-employed,職業_services,職業_student,職業_technician,職業_unemployed,職業_unknown,婚姻_divorced,婚姻_married,婚姻_single,學歷_primary,學歷_secondary,學歷_tertiary,學歷_unknown,聯絡方式_cellular,聯絡方式_telephone,聯絡方式_unknown,上次促銷結果_failure,上次促銷結果_other,上次促銷結果_success,上次促銷結果_unknown
0,58,0,2143,1,0,5,may,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,may,151,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,may,76,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,may,92,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,may,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1


In [22]:
#將月份改為數值使用1~12編碼即可

month_dict = dict(jan=1, feb=2, mar=3, aqr=4, may=5, jun=6, jul=7, aug=8, sep=9, cot=10, nov=11, dec=12)

def nce_month(df, column):
    df[column] = df[column].map(month_dict)
    return df

df_all2 = nce_month(df_all2, '最近一次通話月份')

display(df_all2.head())

Unnamed: 0,年齡,違約,平均餘額,房屋貸款,個人信貸,最近一次通話日期,最近一次通話月份,最近一次通話秒數,通話次數_促銷期間,上次促銷後_經過天數,通話次數_促銷之前,本次促銷結果,職業_admin.,職業_blue-collar,職業_entrepreneur,職業_housemaid,職業_management,職業_retired,職業_self-employed,職業_services,職業_student,職業_technician,職業_unemployed,職業_unknown,婚姻_divorced,婚姻_married,婚姻_single,學歷_primary,學歷_secondary,學歷_tertiary,學歷_unknown,聯絡方式_cellular,聯絡方式_telephone,聯絡方式_unknown,上次促銷結果_failure,上次促銷結果_other,上次促銷結果_success,上次促銷結果_unknown
0,58,0,2143,1,0,5,5.0,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,5.0,151,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,5.0,76,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,5.0,92,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,5.0,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1


concat()說明書
pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True)
objs：一個可迭代物件（list、tuple）包含要連接的 DataFrame 或 Series。
axis：連接軸，0 表示縱向（行連接，預設），1 表示橫向（列連接）。
join：如何處理索引的聯集。'outer'（預設）為聯集，'inner'為交集。
ignore_index：是否重新索引結果，若True，結果索引將會從0開始。
keys：為連接的資料添加層次化索引標籤。
verify_integrity：是否檢查索引有無重複。
sort：是否排序非連接軸的索引。

資料處理Label encoding或One hot encoding
get_dummies()說明書
pd.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, drop_first=False, dtype=None)
data：欲處理的DataFrame或Series。
columns：選擇指定的欄位進行轉換，不指定則會對所有類別型欄位處理。
prefix：轉換後欄位名前綴，預設用原本欄位名。
prefix_sep：前綴與欄位值間隔符號（預設為 _）。
drop_first：是否刪除第一個類別，避免虛擬變數陷阱（可用於避免多重共線性）。
dummy_na：是否當成一類別值對待空值。
dtype：轉換後的資料型態。

In [27]:
#4.分割(split)資料
x = df_all2.drop('本次促銷結果', axis=1)
y = df_all2['本次促銷結果'].values

#分割成訓練集與驗證集，使用6:4分配
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=random_seed, stratify=y)

In [None]:
#5.選擇演算法(algorithm)

#邏輯斯回歸
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=random_seed)

#決策樹
from sklearn.tree import DecisionTreeClassifier
algorithm2 = DecisionTreeClassifier(random_state=random_seed)

#隨機森林
from sklearn.ensemble import RandomForestClassifier
algorithm3 = RandomForestClassifier(random_state=random_seed)

#XGBoost
from xgboost import XGBClassifier
algorithm4 = XGBClassifier(random_state=random_seed)

#組成一個list
algorithms = [algorithm1,  algorithm2, algorithm3, algorithm4]

In [33]:
#交叉驗證選擇最佳演算法
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import cross_val_score

#loop做交叉驗證
for algorithm in algorithms:
    scores = cross_val_score(algorithm, x_train, y_train, cv=stratifiedkfold, scoring='roc_auc')
    score = scores.mean()
    name = algorithm.__class__.__name__
    print(f'平均分數: {score:.4f}, 個別分數:{scores} 演算法:{name}')

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1247, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\utils\validation.py", line 1105, in check_array
    _assert_all_finite(
    ~~~~~~~~~~~~~~~~~~^
        array,
        ^^^^^^
    ...<2 lines>...
        allow_nan=ensure_all_finite == "allow-nan",
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        input_name=input_name,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Gaudi\miniconda3\envs\exercises_2508\Lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
