In [1]:
### Extending multiple regression ###

import pandas as pd
import numpy as np

# read the insurance data
insurance = pd.read_csv("insurance.csv", sep = ",", encoding = "ISO-8859-1")

# check output
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [2]:
### Feature creation ###

# change sex and smoker to integer
insurance["sex"] = np.where(insurance["sex"] == "female", 0, 1)
insurance["smoker"] = np.where(insurance["smoker"] == "no", 0, 1)

# add a higher-order "age" term
insurance["age2"] = insurance["age"]^2

# create a 'bmi30' variable that is 1 if BMI >= 30 and 0 otherwise
insurance['bmi30'] = np.where(insurance['bmi'] >= 30, 1, 0)

# check output
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age2,bmi30
0,19,0,27.9,0,1,southwest,16884.924,17,0
1,18,1,33.77,1,0,southeast,1725.5523,16,1
2,28,1,33.0,3,0,southeast,4449.462,30,1
3,33,1,22.705,0,0,northwest,21984.47061,35,0
4,32,1,28.88,0,0,northwest,3866.8552,34,0


In [3]:
### Regression model ###

# Regression model
import statsmodels
import statsmodels.api as sm

insurance_model = sm.OLS.from_formula('charges ~ age + age2 + children + bmi + sex + bmi30*smoker + region', 
                                      data = insurance).fit()

# Print model summary
insurance_model.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.864
Model:,OLS,Adj. R-squared:,0.863
Method:,Least Squares,F-statistic:,765.4
Date:,"Wed, 11 Dec 2019",Prob (F-statistic):,0.0
Time:,14:00:39,Log-Likelihood:,-13143.0
No. Observations:,1338,AIC:,26310.0
Df Residuals:,1326,BIC:,26370.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4702.7515,961.381,-4.892,0.000,-6588.746,-2816.757
region[T.northwest],-269.3153,352.476,-0.764,0.445,-960.787,422.156
region[T.southeast],-825.3873,354.852,-2.326,0.020,-1521.520,-129.254
region[T.southwest],-1223.8432,353.736,-3.460,0.001,-1917.787,-529.900
age,213.8870,63.670,3.359,0.001,88.982,338.792
age2,48.5371,62.013,0.783,0.434,-73.116,170.191
children,517.9404,102.021,5.077,0.000,317.800,718.081
bmi,114.9841,34.565,3.327,0.001,47.176,182.792
sex,-492.4171,246.604,-1.997,0.046,-976.194,-8.640

0,1,2,3
Omnibus:,869.512,Durbin-Watson:,2.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7277.976
Skew:,3.081,Prob(JB):,0.0
Kurtosis:,12.622,Cond. No.,540.0
