In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
df = pd.read_csv("cleaned_insurance_data_2.csv")

df.drop(["Unnamed: 0", "sex"], inplace=True, axis=1)

In [3]:
df = df[['charges', 'smoker', 'age', 'children', 'bmi', 'region']]

In [4]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'under_weight'
    elif 18.5 <= bmi < 25:
        return 'normal_weight'
    elif 25 <= bmi < 30:
        return 'over_weight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(categorize_bmi)

In [5]:
# Categorize age category
def categorize_age(age):
    if 18 < age < 26:
        return 'young_adult'
    elif 26 <= age < 36:
        return 'early_adulthood'
    elif 36 <= age < 46:
        return 'mid_adulthood'
    else:
        return 'late_adulthood'

df['age_category'] = df['age'].apply(categorize_age)

In [6]:
df['child_stat'] = df['children'].apply(lambda x : 0 if x == 0 else 1)
df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)

In [7]:
# Feature Engineer

df['age_children'] = df['age'] * df['children']

df['smoker_child'] = df['children'] * (df['smoker'] == 1)
df['non_smoker_child'] = df['children'] * (df['not_smoker'] == 1)

df['smoker_age'] = df['smoker'] * df['age']
df['not_smoker_age'] = df['not_smoker'] * df['age']

df['children_str'] = df['children'].apply(lambda x: str(x))

In [8]:
df.head(4)

Unnamed: 0,charges,smoker,age,children,bmi,region,bmi_category,age_category,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,children_str
0,16884.92,1,19,0,27.9,southwest,over_weight,young_adult,0,0,0,0,0,19,0,0
1,1725.55,0,18,1,33.77,southeast,obese,late_adulthood,1,1,18,0,1,0,18,1
2,4449.46,0,28,3,33.0,southeast,obese,early_adulthood,1,1,84,0,3,0,28,3
3,21984.47,0,33,0,22.7,northwest,normal_weight,early_adulthood,0,1,0,0,0,0,33,0


In [9]:
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [10]:
X.shape

(1337, 15)

In [11]:
X.head(2)

Unnamed: 0,smoker,age,children,bmi,region,bmi_category,age_category,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,children_str
0,1,19,0,27.9,southwest,over_weight,young_adult,0,0,0,0,0,19,0,0
1,0,18,1,33.77,southeast,obese,late_adulthood,1,1,18,0,1,0,18,1


In [12]:
X = pd.get_dummies(X, columns=['region', 'age_category', 'children_str', 'bmi_category']).astype(int)

In [13]:
pd.set_option("display.max_columns", None)

X.head(4)

Unnamed: 0,smoker,age,children,bmi,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,region_northeast,region_northwest,region_southeast,region_southwest,age_category_early_adulthood,age_category_late_adulthood,age_category_mid_adulthood,age_category_young_adult,children_str_0,children_str_1,children_str_2,children_str_3,children_str_4,children_str_5,bmi_category_normal_weight,bmi_category_obese,bmi_category_over_weight,bmi_category_under_weight
0,1,19,0,27,0,0,0,0,0,19,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0
1,0,18,1,33,1,1,18,0,1,0,18,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,0,28,3,33,1,1,84,0,3,0,28,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,33,0,22,0,1,0,0,0,0,33,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0


In [14]:
X.shape

(1337, 29)

In [15]:
y.shape

(1337, 1)

### Feature scaling and selection

In [15]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

minmax_scale = MinMaxScaler()

X_transformed = minmax_scale.fit_transform(X)

stand_scaler = StandardScaler()

X_scaled = stand_scaler.fit_transform(X_transformed)

In [16]:
from sklearn.feature_selection import f_regression

f_statistic, p_values = f_regression(X_scaled, y)

  y = column_or_1d(y, warn=True)


In [None]:
ver_features = ['smoker', 'age', 'children', 'bmi',	'child_stat', 'not_smoker', 
                'age_children',	'smoker_child',	'non_smoker_child', 'smoker_age',
                'not_smoker_age', 'region_northeast',	'region_northwest',	'region_southeast',
                'age_category_early_adulthood',	'age_category_late_adulthood', 'age_category_mid_adulthood', 'age_category_young_adult',
                'region_southwest',	'bmi_category_normal_weight', 'bmi_category_obese', 'bmi_category_over_weight', 'bmi_category_under_weight']

selected_features = pd.DataFrame({"features": ver_features,
                            "f_statistic": f_statistic,
                             "p-values": p_values})

selected_features.sort_values(by='f_statistic', ascending= False, inplace=True)

In [19]:
selected_features.reset_index(inplace=True, drop=True)

In [20]:
selected_features

Unnamed: 0,features,f_statistic,p-values
0,smoker_age,2205.359058,5.144833e-285
1,not_smoker,2175.736862,1.406722e-282
2,smoker,2175.736862,1.406722e-282
3,smoker_child,507.54062,1.625889e-95
4,not_smoker_age,332.768801,1.482141e-66
5,age,130.402967,6.975762e-29
6,non_smoker_child,81.225502,6.800093999999999e-19
7,bmi_category_obese,55.82607,1.425685e-13
8,bmi,53.467709,4.515294e-13
9,age_children,23.468005,1.418501e-06


In [18]:
X_model = X.copy()

In [111]:
# X_model_select = X_model[['smoker', 'age',
# 'smoker_child', 	
#  'bmi_category_obese', 'bmi_category_over_weight']]

# smoker_age
# not_smoker_age
# child_stat
# non_smoker_child


In [19]:
X_model_select = X_model[['smoker', 'age',	'children',	'bmi',	'child_stat',	'not_smoker',
'age_children', 'smoker_child', 'non_smoker_child', 'smoker_age', 'not_smoker_age', 	
'age_category_early_adulthood',	'age_category_late_adulthood', 'age_category_mid_adulthood', 'age_category_young_adult',
'children_str_0',	'children_str_1',	'children_str_2',	'children_str_3',	'children_str_4',
'children_str_5', 'bmi_category_normal_weight', 'bmi_category_obese', 'bmi_category_over_weight', 'bmi_category_under_weight' ]]

# smoker_age
# not_smoker_age
# child_stat
# non_smoker_child


### Linear SVR

In [55]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lsvr', LinearSVR())
])

# GridSearchCV

params = {'lsvr__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
              'lsvr__epsilon': np.arange(0.0, 15),
              'lsvr__random_state' : np.arange(35, 50),
             }

grid_svr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_svr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_svr.best_params_)
print("Best Score:", grid_svr.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Params: {'lsvr__epsilon': np.float64(0.0), 'lsvr__loss': 'squared_epsilon_insensitive', 'lsvr__random_state': np.int64(35)}
Best Score: 0.8491288461172981


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [56]:
grid_svr.score(x_test, y_test)

0.9266549346576858

0.9283916003960724, 

numerical_features = ['smoker', 'age',	'bmi', 'child_stat',
'smoker_child', 'children_str_0', 'children_str_1',	'children_str_2', 'children_str_3',
'bmi_category_obese', 'bmi_category_over_weight',]

### KNN

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi',
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('knn', KNeighborsRegressor())
])

# GridSearchCV

params = {'knn__n_neighbors' : np.arange(1, 20),
              'knn__weights': ['uniform', 'distance'],
              'knn__metric': ['sokalmichener', 'cityblock', 'nan_euclidean', 'chebyshev',
                         'euclidean', 'manhattan', 'cosine'],
              'knn__p': np.arange(2, 8)
              }

grid_knn = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_knn.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_knn.best_params_)
print("Best Score:", grid_knn.best_score_)



Best Params: {'knn__metric': 'cosine', 'knn__n_neighbors': np.int64(17), 'knn__p': np.int64(2), 'knn__weights': 'uniform'}
Best Score: 0.8372062991643301


In [51]:
grid_knn.score(x_test, y_test)

0.9193674653846005

### Lasso (1)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV


# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lasso', Lasso())
])

# GridSearchCV

params = {"lasso__alpha": np.arange(1, 20),
     "lasso__selection": ["cyclic", "random"]}

grid_lasso = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lasso.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lasso.best_params_)
print("Best Score:", grid_lasso.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Params: {'lasso__alpha': np.int64(19), 'lasso__selection': 'random'}
Best Score: 0.8509192649702095


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [40]:
grid_lasso.score(x_test, y_test)

0.9272114568714849

### Linear Regression

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi',
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lr', LinearRegression())
])

# GridSearchCV

params = {'lr__fit_intercept' : [True]}

grid_lr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lr.best_params_)
print("Best Score:", grid_lr.best_score_)

Best Params: {'lr__fit_intercept': True}
Best Score: 0.8490763231502658


In [37]:
grid_lr.score(x_test, y_test)

0.9266593214087172

### Bayesian ridge

In [99]:
from sklearn.linear_model import BayesianRidge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV


# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 'smoker_child',
                    'bmi_category_obese', 'bmi_category_over_weight',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('br', BayesianRidge())
])

# GridSearchCV

params = {'br__tol' : [0.001, 0.005,]}

grid_br = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_br.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_br.best_params_)
print("Best Score:", grid_br.best_score_)

Best Params: {'br__tol': 0.001}
Best Score: 0.8513525561033661


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [100]:
grid_br.score(x_test, y_test)

0.9261571359083709