In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

import math

In [3]:
# Load the insurance expenses data set; the path is relative to the notebook's working directory.
insurance = pd.read_csv('./data/expenses.csv')
# Preview the first rows to check that the file parsed as expected.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Preview the last rows as well, to check the end of the file.
insurance.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
# (rows, columns) of the loaded frame.
insurance.shape

(1338, 7)

In [6]:
# Column dtypes and non-null counts, to spot missing values and non-numeric columns.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
# Summary statistics; by default only the numeric columns are summarised.
insurance.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
# Hand-computed least-squares slope for the model  age = b0 + b1 * charges.
# charges is the predictor (x) and age the response (y), so:
#   b1 = (n*sum(x*y) - sum(x)*sum(y)) / (n*sum(x^2) - sum(x)^2)
n = len(insurance)
numerator = n * sum(insurance["charges"]*insurance['age']) - sum(insurance["charges"]) * sum(insurance["age"])
denominator = n * sum(insurance["charges"]**2) - sum(insurance["charges"]) ** 2

print(numerator)
print(denominator)
b1_ = numerator / denominator
print(b1_) # slope (coefficient on charges) -- not the intercept

91010239079.4524
262347308458475.62
0.00034690746253208656


In [9]:
# Intercept of the hand-computed fit, from the sample means:
#   b0 = mean(age) - b1 * mean(charges)

b0_ = np.mean(insurance['age']) - b1_ *  np.mean(insurance['charges'])

print(b0_) # intercept -- not the slope

34.60341689633183


In [10]:
# Report the hand-fitted line, which expresses age as a function of charges.
print('New regression model')

# `!s` applies str() to each coefficient, exactly as print() does for separate arguments.
print(f"age=  {b0_!s} + {b1_!s} * charges")

New regression model
age=  34.60341689633183 + 0.00034690746253208656 * charges


In [11]:
# Fit ordinary least squares with charges as the response and age as the predictor.
# NOTE(review): the hand-computed fit in the earlier cells models age as a function of
# charges (the opposite direction), so its coefficients are not expected to match these.
result = smf.ols(formula="charges ~ age", data=insurance).fit()

print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.089
Method:                 Least Squares   F-statistic:                     131.2
Date:                Sat, 03 Dec 2022   Prob (F-statistic):           4.89e-29
Time:                        12:29:25   Log-Likelihood:                -14415.
No. Observations:                1338   AIC:                         2.883e+04
Df Residuals:                    1336   BIC:                         2.884e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   3165.8850    937.149      3.378      0.0

In [12]:
# Fitted coefficients (intercept and the age slope) of the statsmodels fit.
result.params

Intercept    3165.885006
age           257.722619
dtype: float64

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Feature matrix: everything except the target and the three text-valued columns.
X = insurance.drop(columns=['charges', 'region', 'smoker', 'sex'])

# Target vector: the billed charges.
y = insurance['charges']

In [15]:
# Display the feature matrix (pandas truncates the rendering for long frames).
X

Unnamed: 0,age,bmi,children
0,19,27.900,0
1,18,33.770,1
2,28,33.000,3
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,3
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [16]:
# Display the target series (pandas truncates the rendering for long series).
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [18]:
# Split the data into a training set (75%) and a test set (25%).
# random_state=0 makes the shuffle, and therefore the split, reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=0
)

# Report the size of each partition, one per line.
print(
    f'X_train shape: {X_train.shape!s}',
    f'X_test shape: {X_test.shape!s}',
    f'y_train shape: {y_train.shape!s}',
    f'y_test shape: {y_test.shape!s}',
    sep='\n',
)


X_train shape: (1003, 3)
X_test shape: (335, 3)
y_train shape: (1003,)
y_test shape: (335,)


In [19]:
# Fit a plain linear regression on the training partition; fit() returns the estimator itself.
reg = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
reg

LinearRegression()

In [20]:
# Predict charges for the held-out test rows; the evaluation cells below reuse y_pred.
y_pred = reg.predict(X_test)
# Displays the whole prediction array (one value per test row).
y_pred

array([15756.09703444, 14430.22566727, 18669.18537412, 19502.82816064,
       11229.58889778,  8572.47462436,  6616.82538334, 18709.14997634,
       14052.37442945, 12803.4688431 ,  9816.29959658, 14780.32103673,
       13729.03938526,  9241.0047288 ,  9211.45244635, 16017.42827263,
       15054.20060562, 10850.61820512, 13113.35404751,  8363.00697692,
       14566.44865903, 18359.81676167, 15306.36062994, 13313.55756428,
        9722.27785603, 13600.58940215,  6051.33777986, 15322.82144805,
        9241.19056239, 14991.55410331, 14025.08979096, 20556.45155244,
       19422.94431829, 17886.49030212,  6163.11564499,  9685.40278179,
       17517.67199927, 12458.29479586, 14602.0969945 ,  9150.37780831,
        8545.18998586,  9473.22172683, 11164.46237623, 19734.14253239,
        9788.89104632, 10484.84406816, 15918.06598798, 12434.41162329,
        8536.30969693, 14959.2286841 , 10267.17201565,  9277.2227396 ,
       14232.91367033, 18909.3800348 , 19920.12740872, 11514.75794643,
      

In [33]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter; the fixed random_state keeps the fold assignment reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=99)

# Illustrate the folds on the first 15 rows only: each item printed is a
# (train_indices, test_indices) pair of positional index arrays.
for k in cv.split(X[:15]):
    print(k)

(array([ 1,  2,  3,  4,  5,  7,  8,  9, 11, 12, 13, 14]), array([ 0,  6, 10]))
(array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12]), array([ 7, 13, 14]))
(array([ 0,  1,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14]), array([2, 4, 5]))
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10, 13, 14]), array([ 8, 11, 12]))
(array([ 0,  2,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14]), array([1, 3, 9]))


In [34]:
from sklearn.model_selection import StratifiedKFold

# Rebinds `cv`, replacing the KFold splitter created in the previous cell.
# NOTE(review): stratified splitting balances discrete class labels across folds, but the
# target `y` here is continuous (float64 charges), so splitting on it will likely be
# rejected -- confirm whether plain KFold was intended for this regression task.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=99)

In [37]:
from sklearn.model_selection import cross_val_score


In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [22]:
#sex, smoker region
imp_ohe = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                       OneHotEncoder(sparse=False))
#Age, BMI, Childern
imp_std = make_pipeline(SimpleImputer(), 
                       StandardScaler())

In [23]:
# Route the three categorical columns through the impute + one-hot pipeline and the three
# numeric columns through the impute + scale pipeline; any other column is passed through unchanged.
# NOTE(review): the feature frame `X` built earlier drops 'sex', 'smoker' and 'region', so this
# transformer has to be fitted on a frame that still contains them -- confirm the intended input.
preprocessor = make_column_transformer((imp_ohe, ['sex', 'smoker', 'region']), 
                       (imp_std, ['age', 'bmi', 'children']), 
                       remainder = 'passthrough')

In [26]:
# Show the assembled (still unfitted) column transformer.
preprocessor

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False))]),
                                 ['sex', 'smoker', 'region']),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['age', 'bmi', 'children'])])

In [74]:
from sklearn.linear_model import ElasticNet
# Elastic-net regressor with penalty strength alpha=0.1; random_state is fixed for reproducibility.
# It is only constructed here -- no visible cell fits it.
Elsnet = ElasticNet(alpha = 0.1, random_state = 0)

In [72]:
from sklearn import set_config
# Render estimators as interactive HTML diagrams instead of plain-text reprs.
set_config(display='diagram')

# `pipe` was not defined in any cell of this notebook, so displaying it raised NameError
# on a fresh kernel. Build it from the column transformer and the ElasticNet estimator
# defined in the cells above -- presumably the intended composition; adjust the final
# step if a different estimator was meant.
pipe = make_pipeline(preprocessor, Elsnet)
pipe

In [27]:
# Preview the training features; the index shows the original row labels after shuffling.
X_train.head()

Unnamed: 0,age,bmi,children
1075,32,29.59,1
131,61,22.04,0
15,19,24.6,1
1223,20,24.42,0
1137,26,22.23,0


In [28]:
# NOTE: only the last expression of a cell is rendered, so the dtypes result on the
# next line is computed and discarded; only y_train.head() is shown.
X_train.dtypes
y_train.head()

1075     4562.84210
131     13616.35860
15       1837.23700
1223    26125.67477
1137     3176.28770
Name: charges, dtype: float64

In [42]:

# R-squared of the linear model on the held-out test set, rounded to 3 decimals.
from sklearn.metrics import r2_score
# Use the built-in round() rather than the .round() method: the method exists only on
# NumPy scalars, so it fails when r2_score returns a plain Python float.
round(r2_score(y_test, y_pred), 3)



0.158

In [77]:
# RMSE: `mse` holds the mean squared error, but the value displayed is its square root
# (root mean squared error), which is in the same units as the target.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
(math.sqrt(mse))


11516.411886639082

In [79]:
# MAPE: mean of |actual - predicted| / actual over the test rows, expressed as a percentage.
(np.mean(np.abs((y_test - y_pred)/y_test))*100)


115.60373821367821

In [48]:
# PR (presumably "prediction ratio" -- TODO confirm): total predicted charges divided by
# total actual charges on the test set; a value near 1 means the totals roughly agree.
sum(y_pred)/sum(y_test)

0.9804473931524404