#### 該文件包含1,338個案例，即目前已經登記過的保險計劃受益者、病人特點和計劃計入的總醫療費用特徵:

* age:表示主要受益者年齡
* sex:性別
* bmi:身體質量指數,理想BMI在18.5~24.9之間
* children:表示保險計劃中所包括的孩子/受撫養者的數量
* smoker:表示被保險人是否經常吸煙
* region:受益者在美國的居住地-東北(northeast), 東南(sotheast), 西南(southwest)和東北(northwest)
* charges:醫療費用

#### 目的:檢測與醫療費用相關之變量，找出在醫療費用上具高風險的人

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [83]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [84]:
df = pd.read_csv('insurance.csv')

In [85]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [86]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [87]:
df.info


<bound method DataFrame.info of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]>

In [88]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [89]:
df.describe() 

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


分類變數編碼

In [90]:
# OHE 方法一 文字轉數字表示
df_one_hot = pd.get_dummies(df, columns=['sex','smoker', 'region'], prefix='Cat') # prefix 前綴
df_one_hot = df_one_hot.astype(int)

df_one_hot

Unnamed: 0,age,bmi,children,charges,Cat_female,Cat_male,Cat_no,Cat_yes,Cat_northeast,Cat_northwest,Cat_southeast,Cat_southwest
0,19,27,0,16884,1,0,0,1,0,0,0,1
1,18,33,1,1725,0,1,1,0,0,0,1,0
2,28,33,3,4449,0,1,1,0,0,0,1,0
3,33,22,0,21984,0,1,1,0,0,1,0,0
4,32,28,0,3866,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30,3,10600,0,1,1,0,0,1,0,0
1334,18,31,0,2205,1,0,1,0,1,0,0,0
1335,18,36,0,1629,1,0,1,0,0,0,1,0
1336,21,25,0,2007,1,0,1,0,0,0,0,1


In [91]:
df_one_hot.columns

Index(['age', 'bmi', 'children', 'charges', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest'],
      dtype='object')

標準化

In [92]:
from sklearn.preprocessing import StandardScaler
# 建立StandardScaler物件
std = StandardScaler()
# 資料標準化

dataset_std = std.fit_transform(df_one_hot[['age', 'bmi', 'children', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest', 'charges']])
dataset_std

array([[-1.43876426, -0.51812191, -0.90861367, ..., -0.61132367,
         1.76548098,  0.29854818],
       [-1.50996545,  0.46265684, -0.07876719, ...,  1.63579466,
        -0.56641788, -0.95369393],
       [-0.79795355,  0.46265684,  1.58092576, ...,  1.63579466,
        -0.56641788, -0.728672  ],
       ...,
       [-1.50996545,  0.95304622, -0.90861367, ...,  1.63579466,
        -0.56641788, -0.96162422],
       [-1.29636188, -0.84504816, -0.90861367, ..., -0.61132367,
         1.76548098, -0.93039871],
       [ 1.55168573, -0.19119566, -0.90861367, ..., -0.61132367,
        -0.56641788,  1.31106429]])

In [93]:
dataset_std= pd.DataFrame(dataset_std, columns=['age', 'bmi', 'children', 'Cat_female', 'Cat_male', 'Cat_no',
       'Cat_yes', 'Cat_northeast', 'Cat_northwest', 'Cat_southeast',
       'Cat_southwest', 'charges'])
dataset_std

Unnamed: 0,age,bmi,children,Cat_female,Cat_male,Cat_no,Cat_yes,Cat_northeast,Cat_northwest,Cat_southeast,Cat_southwest,charges
0,-1.438764,-0.518122,-0.908614,1.010519,-1.010519,-1.970587,1.970587,-0.565267,-0.566418,-0.611324,1.765481,0.298548
1,-1.509965,0.462657,-0.078767,-0.989591,0.989591,0.507463,-0.507463,-0.565267,-0.566418,1.635795,-0.566418,-0.953694
2,-0.797954,0.462657,1.580926,-0.989591,0.989591,0.507463,-0.507463,-0.565267,-0.566418,1.635795,-0.566418,-0.728672
3,-0.441948,-1.335438,-0.908614,-0.989591,0.989591,0.507463,-0.507463,-0.565267,1.765481,-0.611324,-0.566418,0.719845
4,-0.513149,-0.354659,-0.908614,-0.989591,0.989591,0.507463,-0.507463,-0.565267,1.765481,-0.611324,-0.566418,-0.776832
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.768473,-0.027733,1.580926,-0.989591,0.989591,0.507463,-0.507463,-0.565267,1.765481,-0.611324,-0.566418,-0.220555
1334,-1.509965,0.135731,-0.908614,1.010519,-1.010519,0.507463,-0.507463,1.769076,-0.566418,-0.611324,-0.566418,-0.914042
1335,-1.509965,0.953046,-0.908614,1.010519,-1.010519,0.507463,-0.507463,-0.565267,-0.566418,1.635795,-0.566418,-0.961624
1336,-1.296362,-0.845048,-0.908614,1.010519,-1.010519,0.507463,-0.507463,-0.565267,-0.566418,-0.611324,1.765481,-0.930399


設定特徵與標籤

In [94]:
X_std = dataset_std.iloc[:, 0:11].values

print("X_std: ", X_std)


X_std:  [[-1.43876426 -0.51812191 -0.90861367 ... -0.56641788 -0.61132367
   1.76548098]
 [-1.50996545  0.46265684 -0.07876719 ... -0.56641788  1.63579466
  -0.56641788]
 [-0.79795355  0.46265684  1.58092576 ... -0.56641788  1.63579466
  -0.56641788]
 ...
 [-1.50996545  0.95304622 -0.90861367 ... -0.56641788  1.63579466
  -0.56641788]
 [-1.29636188 -0.84504816 -0.90861367 ... -0.56641788 -0.61132367
   1.76548098]
 [ 1.55168573 -0.19119566 -0.90861367 ...  1.76548098 -0.61132367
  -0.56641788]]


In [95]:
y_std = dataset_std.iloc[:, 11].values


print("y_minmax: ", y_std)


y_minmax:  [ 0.29854818 -0.95369393 -0.728672   ... -0.96162422 -0.93039871
  1.31106429]


In [96]:
y_std = dataset_std.iloc[:, 11].values.reshape(-1,1) # 轉換成1個column



分割資料

In [97]:
# 分割訓練和測試
from sklearn.model_selection import train_test_split
X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(X_std, y_std, test_size = 0.2, random_state = 25)

In [98]:
print(f"Shape of X_train:{X_std_train.shape}")
print(f"Shape of X_test:{X_std_test.shape}")
print(f"Shape of y_train:{y_std_train.shape}")
print(f"Shape of y_test{y_std_test.shape}")

Shape of X_train:(1070, 11)
Shape of X_test:(268, 11)
Shape of y_train:(1070, 1)
Shape of y_test(268, 1)


## LinearRegression

In [99]:
from sklearn.linear_model import LinearRegression
regressor_linear_std = LinearRegression()
regressor_linear_std.fit(X_std_train, y_std_train)

In [111]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# R2 score，即決定係數，反映Y的全部變異能通過迴歸關係被所有X解釋的比例。
y_pred_linear_train = regressor_linear_std.predict(X_std_train)
r2_score_linear_train = r2_score(y_std_train, y_pred_linear_train)

y_pred_linear_test = regressor_linear_std.predict(X_std_test)
r2_score_linear_test = r2_score(y_std_test, y_pred_linear_test)

# RMSE一般指均方根誤差(標準誤差)，可衡量預測值和實際值之間的平均差異，藉此估計預測模型預測目標值的準確度
rmse_train = (np.sqrt(mean_squared_error(y_std_train, y_pred_linear_train)))
rmse_test = (np.sqrt(mean_squared_error(y_std_test, y_pred_linear_test)))

print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
print("RMSE: ", rmse_train)
print("RMSE: ", rmse_test)

R2_score (train):  0.748973086093679
R2_score (test):  0.7550514853642599
RMSE:  0.5032835646826055
RMSE:  0.48579892641303585


In [101]:
import statsmodels.api as sm
from scipy import stats

In [102]:
X2 = sm.add_constant(X_std) 
est = sm.OLS(y_std, X2).fit() 
print(est.summary())
# P  < .05, P  < .01,P  < .001

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.7
Date:                Sat, 18 May 2024   Prob (F-statistic):               0.00
Time:                        12:36:03   Log-Likelihood:                -968.74
No. Observations:                1338   AIC:                             1955.
Df Residuals:                    1329   BIC:                             2002.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.782e-17      0.014   4.95e-15      1.0

In [103]:
dataset_std.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1338 non-null   float64
 1   bmi            1338 non-null   float64
 2   children       1338 non-null   float64
 3   Cat_female     1338 non-null   float64
 4   Cat_male       1338 non-null   float64
 5   Cat_no         1338 non-null   float64
 6   Cat_yes        1338 non-null   float64
 7   Cat_northeast  1338 non-null   float64
 8   Cat_northwest  1338 non-null   float64
 9   Cat_southeast  1338 non-null   float64
 10  Cat_southwest  1338 non-null   float64
 11  charges        1338 non-null   float64
dtypes: float64(12)
memory usage: 125.6 KB


## PolynomialFeatures

In [104]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_std_train)
poly_reg.fit(X_poly, y_std_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_std_train)

In [105]:
y_pred_poly2_train = regressor_poly2.predict(poly_reg.fit_transform(X_std_train))
r2_score_poly2_train = r2_score(y_std_train, y_pred_poly2_train)

y_pred_poly2_test = regressor_poly2.predict(poly_reg.fit_transform(X_std_test))
r2_score_poly2_test = r2_score(y_std_test, y_pred_poly2_test)

rmse_train = (np.sqrt(mean_squared_error(y_std_train, y_pred_poly2_train)))
rmse_test = (np.sqrt(mean_squared_error(y_std_test, y_pred_poly2_test)))

print('R2_score (train): ', r2_score_poly2_train)
print('R2_score (test): ', r2_score_poly2_test)
print("RMSE: ", rmse_train)
print("RMSE: ", rmse_test)

R2_score (train):  0.8408695934140449
R2_score (test):  0.8715576134652777
RMSE:  0.4007090868098443
RMSE:  0.351781769231398


## Lasso

In [109]:
lasso = Lasso(alpha = 0.1) # , positive=True
lasso.fit(X_std_train,y_std_train)

y_pred_lasso_train = lasso.predict(X_std_train)
r2_score_lasso_train = lasso.score(X_std_train, y_std_train)

y_pred_lasso_test = lasso.predict(X_std_test)
r2_score_lasso_test = lasso.score(X_std_test, y_std_test)

rmse_lasso_train = (np.sqrt(mean_squared_error(y_std_train, y_pred_lasso_train)))
rmse_lasso_test = (np.sqrt(mean_squared_error(y_std_test, y_pred_lasso_test)))

print("coef：")
for i, j in zip(dataset_std, lasso.coef_):
    print(i, j)

coef：
age 0.20784346428974568
bmi 0.07198136419964483
children 0.0
Cat_female -0.0
Cat_male 0.0
Cat_no -0.7018941395805428
Cat_yes 6.534344534394216e-16
Cat_northeast 0.0
Cat_northwest 0.0
Cat_southeast -0.0
Cat_southwest -0.0


In [110]:
print("R2_score (train): ",r2_score_lasso_train)
print("R2_score (test):", r2_score_lasso_test)
print("train_RMSE: ", rmse_lasso_train)
print("test_RMSE: ", rmse_lasso_test)

R2_score (train):  0.7149210918799861
R2_score (test): 0.7427684854132268
train_RMSE:  0.5363337763634524
test_RMSE:  0.49783019338902973


## Ridge

In [112]:
ridgeReg = Ridge(alpha=0.9)

ridgeReg.fit(X_std_train, y_std_train)

y_pred_ridge_train = ridgeReg.predict(X_std_train)
r2_score_ridge_train = ridgeReg.score(X_std_train, y_std_train)

y_pred_ridge_test = ridgeReg.predict(X_std_test)
r2_score_ridge_test = ridgeReg.score(X_std_test, y_std_test)

rmse_ridge_train = (np.sqrt(mean_squared_error(y_std_train, y_pred_ridge_train)))
rmse_ridge_test = (np.sqrt(mean_squared_error(y_std_test, y_pred_ridge_test)))

print("coef：")
for i, j in zip(dataset_std.columns, ridgeReg.coef_[0]):
    print(i, j)

coef：
age 0.29674988456717843
bmi 0.1760419920797299
children 0.05125926588118445
Cat_female 0.00062659393250381
Cat_male -0.0006265939324800147
Cat_no -0.40252362451535795
Cat_yes 0.4025236245151801
Cat_northeast 0.030938105419824048
Cat_northwest 0.01532147251463032
Cat_southeast -0.02884309630964721
Cat_southwest -0.016295882802594884


In [113]:
print("R2_score (train): ",r2_score_ridge_train)
print("R2_score (test):", r2_score_ridge_test)
print("train_RMSE: ", rmse_ridge_train)
print("test_RMSE: ", rmse_ridge_test)

R2_score (train):  0.7489736078408789
R2_score (test): 0.7551034036754423
train_RMSE:  0.5032830416571599
test_RMSE:  0.4857474396822549
