In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFpr, SelectKBest, f_regression

In [2]:
# Load the insurance dataset from the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [3]:
# Report the table dimensions: (rows, columns) first, then the total cell count.
print(df.shape, df.size, sep='\n')

(1338, 7)
9366


In [4]:
# List every column name, one per line, followed by the number of columns.
for column_name in df.columns:
    print(column_name)

print(df.shape[1])

age
sex
bmi
children
smoker
region
expenses
7


In [5]:
# Display the DataFrame (Jupyter renders the cell's last expression).
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [7]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


#### 原始資料狀態
- 1338筆，7個欄位
- str type
    - sex 性別, smoker 是否抽菸, region 居住地區
- int type
    - age, children
- float type
    - bmi, expenses

In [8]:
# Count missing values per column. Only a cell's last expression is displayed,
# so the two earlier isna() summaries were computed and silently thrown away;
# this single expression produces the same visible result.
df.isna().sum()



age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [9]:
# Flag fully duplicated rows once, then report whether any exist and how many.
duplicate_mask = df.duplicated()
print(duplicate_mask.any())
print(duplicate_mask.sum())

True
1


In [10]:
# Keep the first occurrence of each duplicated row, then confirm none remain.
# The index is left as-is (not reset), exactly as before.
df = df.drop_duplicates()
print(df.duplicated().sum())

0


In [11]:
# Frequency of each value in the `sex` column.
df['sex'].value_counts()


male      675
female    662
Name: sex, dtype: int64

In [12]:
# Frequency of each value in the `smoker` column.
df['smoker'].value_counts()


no     1063
yes     274
Name: smoker, dtype: int64

In [13]:
# Frequency of each value in the `region` column.
df['region'].value_counts() 

southeast    364
southwest    325
northwest    324
northeast    324
Name: region, dtype: int64

In [14]:
# Turn the two binary text columns into integer codes. LabelEncoder numbers the
# labels in sorted order, so female=0 / male=1 and no=0 / yes=1.
# The single encoder is refit for each column, so afterwards `le` only holds
# the mapping of the last column encoded (`smoker`).
le = LabelEncoder()
for binary_column in ['sex', 'smoker']:
    df[binary_column] = le.fit_transform(df[binary_column])

In [15]:
# One-hot encode `region`: one indicator column per region value, appended to df.
dummy = pd.get_dummies(df['region'])
df = pd.concat([df, dummy], axis='columns')

In [16]:
# Remove the original text column and one indicator ('southwest'), which then
# acts as the baseline region for the remaining three dummies.
df = df.drop(columns=['region', 'southwest'])

In [17]:
# Candidate predictors (every encoded column) and the regression target.
candidate_features = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'northeast', 'northwest', 'southeast',
]
X = df[candidate_features]
y = df['expenses']

In [18]:
# Univariate F-test per feature; keep the features whose p-value is below alpha.
selector = SelectFpr(score_func=f_regression, alpha=0.05)
X_new = selector.fit(X, y).transform(X)

In [19]:
# Boolean mask of the columns the selector kept, mapped back to column names.
mask = selector.get_support(indices=False)
new_features = X.columns[mask]
print(new_features)


Index(['age', 'sex', 'bmi', 'children', 'smoker', 'southeast'], dtype='object')


In [20]:
# Model #1 inputs: the six features that passed the SelectFpr filter.
# NOTE(review): the filter was fitted on all rows before this split, so the
# test rows influenced which features were chosen; consider selecting on the
# training split only.
selected_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'southeast']
X = df[selected_columns]
y = df['expenses']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=17
)

In [21]:
# Fit a linear regression (model #1) on the training split.
reg = linear_model.LinearRegression() 
reg.fit(X_train, y_train)

In [22]:
# Predict expenses for the held-out test rows.
y_pred=reg.predict(X_test)

In [23]:
# R^2 of model #1 on the test split.
r2 = metrics.r2_score(y_test, y_pred)
print(f'R2 Score：{r2:.4f}')

# Adjusted R^2 penalises R^2 for the number of predictors.
n = len(y_test)     # number of samples
k = len(X.columns)  # number of predictor variables
dof_ratio = (n - 1) / (n - k - 1)
adj_r2 = 1 - dof_ratio * (1 - r2)
print(f"Adjusted R^2 : {adj_r2:.4f}")

R2 Score：0.7415
Adjusted R^2 : 0.7355


In [24]:
def Calculate_pvalue(X, y):
    """Print the univariate F-test p-value of every feature in ``X``.

    Parameters
    ----------
    X : feature table, passed straight to ``f_regression``. When it has a
        ``columns`` attribute (e.g. a DataFrame) those names label the rows
        of the printed table; otherwise features are labelled ``x0``,
        ``x1``, ... by position.
    y : regression target, passed straight to ``f_regression``.

    Returns
    -------
    None. A header followed by one ``name<TAB>p-value`` line per feature is
    written to standard output, with p-values shown to four decimals.
    """
    # f_regression returns (F-scores, p-values); only the p-values are reported.
    _, pval = f_regression(X, y)

    # Fall back to positional labels so plain arrays / lists work as well.
    labels = getattr(X, 'columns', None)
    if labels is None:
        labels = ['x{}'.format(position) for position in range(len(pval))]

    print('\np value of features')
    print('==========  =======')
    for label, p in zip(labels, pval):
        print('{0:<12}\t{1:.4f}'.format(label, p))

In [25]:
# Print the per-feature p-values for the current X (model #1 features) against y.
Calculate_pvalue(X,y)


p value of features
age         	0.0000
sex         	0.0338
bmi         	0.0000
children    	0.0137
smoker      	0.0000
southeast   	0.0071


In [26]:
# Second selection pass: out of the six model #1 features, keep the five with
# the highest univariate F-scores.
X = df[['age', 'sex', 'bmi', 'children', 'smoker', 'southeast']]
y = df['expenses']
selector = SelectKBest(score_func=f_regression, k=5)
X_new = selector.fit(X, y).transform(X)

# Translate the selector's boolean mask back into column names.
mask = selector.get_support(indices=False)
new_features = X.columns[mask]
print(new_features)


Index(['age', 'bmi', 'children', 'smoker', 'southeast'], dtype='object')


In [27]:
# Model #2: same pipeline as model #1, restricted to the five SelectKBest features.
model2_columns = ['age', 'bmi', 'children', 'smoker', 'southeast']
X = df[model2_columns]
y = df['expenses']

# Same test fraction and seed as model #1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=17
)

reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [28]:
# Compute R^2 and adjusted R^2 for model #2, to compare against model #1.
r2 = metrics.r2_score(y_test, y_pred)
print(f'R2 Score：{r2:.4f}')

n = len(y_test)     # number of samples
k = len(X.columns)  # number of predictor variables
dof_ratio = (n - 1) / (n - k - 1)
adj_r2 = 1 - dof_ratio * (1 - r2)
print(f"Adjusted R^2 : {adj_r2:.4f}")

# Per-feature p-values for the model #2 feature set.
Calculate_pvalue(X, y)

R2 Score：0.7421
Adjusted R^2 : 0.7371

p value of features
age         	0.0000
bmi         	0.0000
children    	0.0137
smoker      	0.0000
southeast   	0.0071


In [29]:
# Coefficients and intercept of the fitted regression equation.
m, b = reg.coef_, reg.intercept_
print('係數', m)
print('截距', b)

係數 [  252.28912811   342.09854741   386.5201729  24363.34441724
  -662.75013349]
截距 -12205.331896611895


In [30]:
# Predict expenses for three example people with the current model.
# Every feature row follows the training column order:
# age, bmi, children, smoker, southeast.
feature_names = ['age', 'bmi', 'children', 'smoker', 'southeast']
examples = [
    ([50, 30, 3, 0, 0],
     '50歲, BMI=30, 有三個小孩，沒有吸菸，不是住在東南區的人，他的醫療費用預估： {:.2f}元'),
    # This message used to say BMI=24.2 while the model was fed 23.2; the text
    # now states the value that is actually predicted on.
    ([25, 23.2, 0, 1, 1],
     '25歲, BMI=23.2, 沒有小孩，有吸菸，是住在東南區的人，他的醫療費用預估： {:.2f}元'),
    ([40, 26.7, 2, 0, 0],
     '40歲, BMI=26.7, 有兩個小孩，沒有吸菸，不是住在東南區的人，他的醫療費用預估： {:.2f}元'),
]

for features, message in examples:
    # A one-row DataFrame with named columns keeps the inputs tied to the
    # columns the model was fitted on; a bare list relies on position alone.
    nExpense = reg.predict(pd.DataFrame([features], columns=feature_names))
    # predict() returns a length-1 array; .item() extracts the scalar
    # explicitly, because float() on a 1-D array is deprecated in recent NumPy.
    print(message.format(nExpense.item()))



50歲, BMI=30, 有三個小孩，沒有吸菸，不是住在東南區的人，他的醫療費用預估： 11831.64元
25歲, BMI=24.2, 沒有小孩，有吸菸，是住在東南區的人，他的醫療費用預估： 25739.18元
40歲, BMI=26.7, 有兩個小孩，沒有吸菸，不是住在東南區的人，他的醫療費用預估： 7793.30元




In [31]:
# Ad-hoc prediction: age 26, BMI 25, no children, smoker, living in the southeast.
# Named columns keep the row tied to the features the model was fitted on.
nExpense = reg.predict(
    pd.DataFrame(
        [[26, 25, 0, 1, 1]],
        columns=['age', 'bmi', 'children', 'smoker', 'southeast'],
    )
)
# .item() pulls the single prediction out as a Python float; float() on a
# 1-D array is deprecated in recent NumPy.
print(nExpense.item())

26607.243403291155


