# U.S. Medical Insurance Costs

## Import Library

In [None]:
!pip install pycaret
!pip install scipy

In [2]:
import pandas as pd
from pycaret.regression import setup, compare_models, evaluate_model, predict_model, save_model
import joblib

## Data Loading

In [3]:
df = pd.read_csv('insurance.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


* There is no missing data.
* There are seven columns.
* Some columns are numerical while some are categorical.

## Exploratory Data Analysis (EDA)

In [6]:
# Find out the average age of the patients
average_age = df['age'].mean()
print(f"Average Age of Patients: {average_age}")

Average Age of Patients: 39.20702541106129


In [7]:
# Analyze where a majority of the individuals are from
majority_region = df['region'].value_counts().idxmax()
print(f"Majority Region: {majority_region}")

Majority Region: southeast


In [8]:
# Look at the different costs between smokers vs. non-smokers
average_cost_smokers = df[df['smoker'] == 'yes']['charges'].mean()
average_cost_non_smokers = df[df['smoker'] == 'no']['charges'].mean()
print(f"Average Charges for Smokers: {average_cost_smokers}")
print(f"Average Charges for Non-Smokers: {average_cost_non_smokers}")

Average Charges for Smokers: 32050.23183153284
Average Charges for Non-Smokers: 8434.268297856204


In [9]:
# Figure out what the average age is for someone who has at least one child
average_age_with_children = df[df['children'] > 0]['age'].mean()
print(f"Average Age for Individuals with at Least One Child: {average_age_with_children}")

Average Age for Individuals with at Least One Child: 39.78010471204188


## Model Training

In [10]:
s = setup(df, target='charges')

Unnamed: 0,Description,Value
0,Session id,7841
1,Target,charges
2,Target type,Regression
3,Original data shape,"(1338, 7)"
4,Transformed data shape,"(1338, 10)"
5,Transformed train set shape,"(936, 10)"
6,Transformed test set shape,"(402, 10)"
7,Ordinal features,2
8,Numeric features,3
9,Categorical features,3


In [11]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2729.6166,23312793.1866,4809.6811,0.8326,0.4352,0.305,0.313
rf,Random Forest Regressor,2874.9143,25486795.6559,5032.6251,0.8172,0.4699,0.3349,0.355
lightgbm,Light Gradient Boosting Machine,3039.0291,25957253.9627,5079.5674,0.8142,0.5236,0.3611,0.36
ada,AdaBoost Regressor,4101.2116,28197360.0147,5288.6464,0.7975,0.5789,0.6292,0.146
et,Extra Trees Regressor,2932.9901,29162903.5004,5383.2165,0.7913,0.4788,0.3244,0.296
xgboost,Extreme Gradient Boosting,3244.6627,29861322.0,5444.2268,0.7855,0.5759,0.4144,0.199
ridge,Ridge Regression,4340.8878,39165423.6484,6249.4912,0.7201,0.5658,0.4248,0.091
br,Bayesian Ridge,4335.1987,39164682.7509,6249.3461,0.72,0.5687,0.4238,0.156
llar,Lasso Least Angle Regression,4328.1656,39159776.1423,6248.8431,0.72,0.573,0.4227,0.086
lar,Least Angle Regression,4328.0066,39162677.4878,6249.0787,0.72,0.5732,0.4226,0.084


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

## Model Evaluation

In [12]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Model Saving

In [13]:
save_model(best_model, 'gbr')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['age', 'bmi', 'children'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['sex', 'smoker', 'region'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['sex', 'smoker'],
                                     transfor...
                                                                          'data_type': dtype('O'),
                                                                          'mapping': female    0
 male      1
 NaN      -1
 dtype: int64},
                                                                         {'col': 'smoker',
                                                                          'data_type': dtype(