#### 該文件包含1,338個案例，即目前已經登記過的保險計劃受益者、病人特點和計劃計入的總醫療費用特徵:

* age:表示主要受益者年齡
* sex:性別
* bmi:身體質量指數,理想BMI在18.5~24.9之間
* children:表示保險計劃中所包括的孩子/受撫養者的數量
* smoker:表示被保險人是否經常吸煙
* region:受益者在美國的居住地-東北(northeast), 東南(sotheast), 西南(southwest)和東北(northwest)
* charges:醫療費用

#### 目的:檢測與醫療費用相關之變量，找出在醫療費用上具高風險的人

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [16]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [17]:
df = pd.read_csv('insurance.csv')

In [18]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [20]:
df.info


<bound method DataFrame.info of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]>

In [21]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [22]:
df.describe() 

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


分類變數編碼

In [23]:
# OHE 方法一 文字轉數字表示
df_one_hot = pd.get_dummies(df, columns=['sex','smoker', 'region'], prefix='Cat') # prefix 前綴
df_one_hot = df_one_hot.astype(int)

df_one_hot

Unnamed: 0,age,bmi,children,charges,Cat_female,Cat_male,Cat_no,Cat_yes,Cat_northeast,Cat_northwest,Cat_southeast,Cat_southwest
0,19,27,0,16884,1,0,0,1,0,0,0,1
1,18,33,1,1725,0,1,1,0,0,0,1,0
2,28,33,3,4449,0,1,1,0,0,0,1,0
3,33,22,0,21984,0,1,1,0,0,1,0,0
4,32,28,0,3866,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30,3,10600,0,1,1,0,0,1,0,0
1334,18,31,0,2205,1,0,1,0,1,0,0,0
1335,18,36,0,1629,1,0,1,0,0,0,1,0
1336,21,25,0,2007,1,0,1,0,0,0,0,1


In [24]:
df_one_hot.columns

Index(['age', 'bmi', 'children', 'charges', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest'],
      dtype='object')

歸一化

In [25]:
from sklearn import preprocessing
# 建立MinMaxScaler物件
minmax = preprocessing.MinMaxScaler()
# 資料標準化
dataset_minmax = minmax.fit_transform(df_one_hot[['age', 'bmi', 'children', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest', 'charges']])
dataset_minmax

array([[0.02173913, 0.31578947, 0.        , ..., 0.        , 1.        ,
        0.25160817],
       [0.        , 0.47368421, 0.2       , ..., 1.        , 0.        ,
        0.00964102],
       [0.2173913 , 0.47368421, 0.6       , ..., 1.        , 0.        ,
        0.05312136],
       ...,
       [0.        , 0.55263158, 0.        , ..., 1.        , 0.        ,
        0.00810867],
       [0.06521739, 0.26315789, 0.        , ..., 0.        , 1.        ,
        0.01414228],
       [0.93478261, 0.36842105, 0.        , ..., 0.        , 0.        ,
        0.44725375]])

In [26]:
dataset_minmax= pd.DataFrame(dataset_minmax, columns=['age', 'bmi', 'children', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest', 'charges'])
dataset_minmax

Unnamed: 0,age,bmi,children,Cat_female,Cat_male,Cat_no,Cat_yes,Cat_northeast,Cat_northwest,Cat_southeast,Cat_southwest,charges
0,0.021739,0.315789,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.251608
1,0.000000,0.473684,0.2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.009641
2,0.217391,0.473684,0.6,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.053121
3,0.326087,0.184211,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.333014
4,0.304348,0.342105,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.043816
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.394737,0.6,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.151303
1334,0.000000,0.421053,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.017303
1335,0.000000,0.552632,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.008109
1336,0.065217,0.263158,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.014142


標準化

In [27]:
from sklearn.preprocessing import StandardScaler
# 建立StandardScaler物件
std = StandardScaler()
# 資料標準化
dataset_std = std.fit_transform(df_one_hot[['age', 'bmi', 'children', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest', 'charges']])
dataset_std

array([[-1.43876426, -0.51812191, -0.90861367, ..., -0.61132367,
         1.76548098,  0.29854818],
       [-1.50996545,  0.46265684, -0.07876719, ...,  1.63579466,
        -0.56641788, -0.95369393],
       [-0.79795355,  0.46265684,  1.58092576, ...,  1.63579466,
        -0.56641788, -0.728672  ],
       ...,
       [-1.50996545,  0.95304622, -0.90861367, ...,  1.63579466,
        -0.56641788, -0.96162422],
       [-1.29636188, -0.84504816, -0.90861367, ..., -0.61132367,
         1.76548098, -0.93039871],
       [ 1.55168573, -0.19119566, -0.90861367, ..., -0.61132367,
        -0.56641788,  1.31106429]])

設定特徵與標籤

In [28]:
X_minmax = dataset_minmax.iloc[:, 0:11].values

print("X_minmax: ", X_minmax)


X_minmax:  [[0.02173913 0.31578947 0.         ... 0.         0.         1.        ]
 [0.         0.47368421 0.2        ... 0.         1.         0.        ]
 [0.2173913  0.47368421 0.6        ... 0.         1.         0.        ]
 ...
 [0.         0.55263158 0.         ... 0.         1.         0.        ]
 [0.06521739 0.26315789 0.         ... 0.         0.         1.        ]
 [0.93478261 0.36842105 0.         ... 1.         0.         0.        ]]


In [29]:
y_minmax = dataset_minmax.iloc[:, 11].values


print("y_minmax: ", y_minmax)


y_minmax:  [0.25160817 0.00964102 0.05312136 ... 0.00810867 0.01414228 0.44725375]


In [30]:
y_minmax = dataset_minmax.iloc[:, 11].values.reshape(-1,1) # 轉換成1個column



分割資料

In [31]:
# 分割訓練和測試
from sklearn.model_selection import train_test_split
X_minmax_train, X_minmax_test, y_minmax_train, y_minmax_test = train_test_split(X_minmax, y_minmax, test_size = 0.2, random_state = 25)

In [32]:
print(f"Shape of X_train:{X_minmax_train.shape}")
print(f"Shape of X_test:{X_minmax_test.shape}")
print(f"Shape of y_train:{y_minmax_train.shape}")
print(f"Shape of y_test{y_minmax_test.shape}")

Shape of X_train:(1070, 11)
Shape of X_test:(268, 11)
Shape of y_train:(1070, 1)
Shape of y_test(268, 1)


## LinearRegression

In [33]:
from sklearn.linear_model import LinearRegression
regressor_linear_minmax = LinearRegression()
regressor_linear_minmax.fit(X_minmax_train, y_minmax_train)

In [34]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# R2 score，即決定係數，反映Y的全部變異能通過迴歸關係被所有X解釋的比例。
y_pred_linear_train = regressor_linear_minmax.predict(X_minmax_train)
r2_score_linear_train = r2_score(y_minmax_train, y_pred_linear_train)

y_pred_linear_test = regressor_linear_minmax.predict(X_minmax_test)
r2_score_linear_test = r2_score(y_minmax_test, y_pred_linear_test)

# RMSE一般指均方根誤差(標準誤差)，可衡量預測值和實際值之間的平均差異，藉此估計預測模型預測目標值的準確度
rmse_train = (np.sqrt(mean_squared_error(y_minmax_train, y_pred_linear_train)))
rmse_test = (np.sqrt(mean_squared_error(y_minmax_test, y_pred_linear_test)))

print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
print("RMSE: ", rmse_train)
print("RMSE: ", rmse_test)

R2_score (train):  0.7489933400542808
R2_score (test):  0.7541073286374163
RMSE:  0.09724411570767928
RMSE:  0.0940502690826589


In [35]:
import statsmodels.api as sm
from scipy import stats

In [36]:
X2 = sm.add_constant(X_minmax) 
est = sm.OLS(y_minmax, X2).fit() 
print(est.summary())
# P  < .05, P  < .01,P  < .001

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.1
Date:                Sat, 18 May 2024   Prob (F-statistic):               0.00
Time:                        12:34:11   Log-Likelihood:                 1230.2
No. Observations:                1338   AIC:                            -2442.
Df Residuals:                    1329   BIC:                            -2396.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.035e+12   7.76e+11      1.334      0.1

In [37]:
dataset_minmax.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1338 non-null   float64
 1   bmi            1338 non-null   float64
 2   children       1338 non-null   float64
 3   Cat_female     1338 non-null   float64
 4   Cat_male       1338 non-null   float64
 5   Cat_no         1338 non-null   float64
 6   Cat_yes        1338 non-null   float64
 7   Cat_northeast  1338 non-null   float64
 8   Cat_northwest  1338 non-null   float64
 9   Cat_southeast  1338 non-null   float64
 10  Cat_southwest  1338 non-null   float64
 11  charges        1338 non-null   float64
dtypes: float64(12)
memory usage: 125.6 KB


## PolynomialFeatures

In [38]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_minmax_train)
poly_reg.fit(X_poly, y_minmax_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_minmax_train)

In [39]:
y_pred_poly2_train = regressor_poly2.predict(poly_reg.fit_transform(X_minmax_train))
r2_score_poly2_train = r2_score(y_minmax_train, y_pred_poly2_train)

y_pred_poly2_test = regressor_poly2.predict(poly_reg.fit_transform(X_minmax_test))
r2_score_poly2_test = r2_score(y_minmax_test, y_pred_poly2_test)

rmse_train = (np.sqrt(mean_squared_error(y_minmax_train, y_pred_poly2_train)))
rmse_test = (np.sqrt(mean_squared_error(y_minmax_test, y_pred_poly2_test)))

print('R2_score (train): ', r2_score_poly2_train)
print('R2_score (test): ', r2_score_poly2_test)
print("RMSE: ", rmse_train)
print("RMSE: ", rmse_test)

R2_score (train):  0.8408336043725233
R2_score (test):  0.8711639435420089
RMSE:  0.07743662204928616
RMSE:  0.06807787033974158


## Lasso

In [47]:
lasso = Lasso(alpha = 0.01) # , positive=True
lasso.fit(X_minmax_train,y_minmax_train)

y_pred_lasso_train = lasso.predict(X_minmax_train)
r2_score_lasso_train = lasso.score(X_minmax_train, y_minmax_train)

y_pred_lasso_test = lasso.predict(X_minmax_test)
r2_score_lasso_test = lasso.score(X_minmax_test, y_minmax_test)

rmse_lasso_train = (np.sqrt(mean_squared_error(y_minmax_train, y_pred_lasso_train)))
rmse_lasso_test = (np.sqrt(mean_squared_error(y_minmax_test, y_pred_lasso_test)))

print("coef：")
for i, j in zip(dataset_minmax, lasso.coef_):
    print(i, j)

coef：
age 0.09148911718190574
bmi 0.0
children 0.0
Cat_female -0.0
Cat_male 0.0
Cat_no -0.3219055654970521
Cat_yes 9.404448895756041e-17
Cat_northeast 0.0
Cat_northwest -0.0
Cat_southeast 0.0
Cat_southwest -0.0


In [48]:
print("R2_score (train): ",r2_score_lasso_train)
print("R2_score (test):", r2_score_lasso_test)
print("train_RMSE: ", rmse_lasso_train)
print("test_RMSE: ", rmse_lasso_test)

R2_score (train):  0.6714821587671789
R2_score (test): 0.6986492204588906
train_RMSE:  0.11125004342934938
test_RMSE:  0.10411742278358074


## Ridge

In [51]:
ridgeReg = Ridge(alpha=2)

ridgeReg.fit(X_minmax_train, y_minmax_train)

y_pred_ridge_train = ridgeReg.predict(X_minmax_train)
r2_score_ridge_train = ridgeReg.score(X_minmax_train, y_minmax_train)

y_pred_ridge_test = ridgeReg.predict(X_minmax_test)
r2_score_ridge_test = ridgeReg.score(X_minmax_test, y_minmax_test)

rmse_ridge_train = (np.sqrt(mean_squared_error(y_minmax_train, y_pred_ridge_train)))
rmse_ridge_test = (np.sqrt(mean_squared_error(y_minmax_test, y_pred_ridge_test)))

print("coef：")
for i, j in zip(dataset_minmax.columns, ridgeReg.coef_[0]):
    print(i, j)

coef：
age 0.18512968236317645
bmi 0.19618090918421768
children 0.040261286485044935
Cat_female 8.219755614157352e-05
Cat_male -8.219755614145105e-05
Cat_no -0.19164507164195602
Cat_yes 0.19164507164195413
Cat_northeast 0.013117628199227823
Cat_northwest 0.006036835241908644
Cat_southeast -0.01146830220918175
Cat_southwest -0.0076861612319527334


In [52]:
print("R2_score (train): ",r2_score_ridge_train)
print("R2_score (test):", r2_score_ridge_test)
print("train_RMSE: ", rmse_ridge_train)
print("test_RMSE: ", rmse_ridge_test)

R2_score (train):  0.7487736070056966
R2_score (test): 0.7560238221546204
train_RMSE:  0.09728667049822103
test_RMSE:  0.09368303707604858
