In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
# Load data
df = pd.read_csv("cleaned_insurance_data_2.csv")

# Drop unwanted column
df.drop(["Unnamed: 0", "sex"], inplace=True, axis=1)

In [3]:
# Re-order column
df = df[['charges', 'smoker', 'age', 'children', 'bmi', 'region']]

In [4]:
# Categorize BMI
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'under_weight'
    elif 18.5 <= bmi < 25:
        return 'normal_weight'
    elif 25 <= bmi < 30:
        return 'over_weight'
    else:
        return 'obese'

# Apply function to bmi column
df['bmi_category'] = df['bmi'].apply(categorize_bmi)

In [5]:
# Categorize age category
def categorize_age(age):
    if 18 < age < 26:
        return 'young_adult'
    elif 26 <= age < 36:
        return 'early_adulthood'
    elif 36 <= age < 46:
        return 'mid_adulthood'
    else:
        return 'late_adulthood'
    
# Apply function to age column
df['age_category'] = df['age'].apply(categorize_age)

In [6]:
# Create new columns
#
# "child_stat": indicates whether a client has children (regardless of their number)
# "not_smoker": indicates the client is not a smoker 
# "children_str": represents number of children as strings
df['child_stat'] = df['children'].apply(lambda x : 0 if x == 0 else 1)
df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)
df['children_str'] = df['children'].apply(lambda x: str(x))

In [7]:
# Feature Engineering
# Create interaction terms between variables
df['age_children'] = df['age'] * df['children']
df['smoker_child'] = df['children'] * (df['smoker'] == 1)
df['non_smoker_child'] = df['children'] * (df['not_smoker'] == 1)
df['smoker_age'] = df['smoker'] * df['age']
df['not_smoker_age'] = df['not_smoker'] * df['age']

In [8]:
# Define features and targets
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [9]:
# One-Hot Encode 
X = pd.get_dummies(X, columns=['region', 'age_category', 'children_str', 'bmi_category']).astype(int)

In [10]:
# Review columns
pd.set_option("display.max_columns", None)

# Check first four rows
X.head(4)

Unnamed: 0,smoker,age,children,bmi,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,region_northeast,region_northwest,region_southeast,region_southwest,age_category_early_adulthood,age_category_late_adulthood,age_category_mid_adulthood,age_category_young_adult,children_str_0,children_str_1,children_str_2,children_str_3,children_str_4,children_str_5,bmi_category_normal_weight,bmi_category_obese,bmi_category_over_weight,bmi_category_under_weight
0,1,19,0,27,0,0,0,0,0,19,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0
1,0,18,1,33,1,1,18,0,1,0,18,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,0,28,3,33,1,1,84,0,3,0,28,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,33,0,22,0,1,0,0,0,0,33,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0


In [11]:
# Confirm feature shape
X.shape

# Confirm target shape
y.shape

(1337, 1)

### Lasso

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV


# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lasso', Lasso())
])

# GridSearchCV

params = {"lasso__alpha": np.arange(1, 20),
     "lasso__selection": ["cyclic", "random"]}

grid_lasso = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lasso.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lasso.best_params_)
print("Best Score:", grid_lasso.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Params: {'lasso__alpha': np.int64(19), 'lasso__selection': 'random'}
Best Score: 0.8509192649702095


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [40]:
grid_lasso.score(x_test, y_test)

0.9272114568714849

### Linear SVR

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lsvr', LinearSVR())
])

# GridSearchCV
params = {'lsvr__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
              'lsvr__epsilon': np.arange(0.0, 15),
              'lsvr__random_state' : np.arange(35, 50),
             }

grid_svr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_svr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_svr.best_params_)
print("Best Score:", grid_svr.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Params: {'lsvr__epsilon': np.float64(0.0), 'lsvr__loss': 'squared_epsilon_insensitive', 'lsvr__random_state': np.int64(45)}
Best Score: 0.8491288488352451


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
grid_svr.score(x_test, y_test)

0.9266549314137009

### Linear Regression

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi',
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lr', LinearRegression())
])

# GridSearchCV

params = {'lr__fit_intercept' : [True]}

grid_lr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lr.best_params_)
print("Best Score:", grid_lr.best_score_)

Best Params: {'lr__fit_intercept': True}
Best Score: 0.8490763231502658


In [44]:
grid_lr.score(x_test, y_test)

0.9266593214087172