## Import Libraries 

In [30]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

In [2]:
# Load data
df = pd.read_csv("cleaned_insurance_data_2.csv")

# Drop unwanted column
df.drop(["Unnamed: 0"], inplace=True, axis=1)

In [3]:
# Re-order column
df = df[['charges', 'sex', 'smoker', 'age', 'children', 'bmi', 'region']]

## Feature Selection

- With F-statistic

In [4]:
# Duplicate data
model_data = df.copy()

In [6]:
# One-Hot Encode "region" column
model_data = pd.get_dummies(model_data, columns=['region']).astype(int)

In [7]:
# Separate feature and target params
x_model_features = model_data[['smoker', 'sex', 'children', 'bmi', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']]
y_target_feature = model_data[['charges']]

In [8]:
# Transform (to normalize distribution as much as possible)
# Scale features (to have feature parameters on the same scale)

from sklearn.preprocessing import StandardScaler

x_model_features_log = np.log1p(x_model_features)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the input data (X)
x_scaled = scaler.fit_transform(x_model_features_log)

In [9]:
# Transform target 
y_log = np.log1p(y_target_feature)

In [10]:
# Calculate f_statistic 
from sklearn.feature_selection import f_regression

f_statistic, p_values = f_regression(x_scaled,y_log)

  y = column_or_1d(y, warn=True)


In [17]:
# Display f_statistic
features = pd.DataFrame({"Parameters": x_model_features.columns,
              "f_statistic": f_statistic,
              "p-value": p_values
              })

# Sort selected features
features.sort_values(by= ['f_statistic', 'p-value'], ascending=False).reset_index(drop=True)

Unnamed: 0,Parameters,f_statistic,p-value
0,smoker,1062.63565,5.873238e-172
1,age,532.436093,2.055911e-99
2,children,40.030499,3.40497e-10
3,bmi,25.451589,5.161291e-07
4,region_southwest,2.412675,0.1205936
5,region_northeast,2.403414,0.1213078
6,region_northwest,0.317817,0.5730172
7,region_southeast,0.299346,0.5843841
8,sex,0.065792,0.797604


## Checking for multicolinearity

In [31]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

variables = ols_data_reorder[['smoker','age','children']]

vif = pd.DataFrame()

vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]

vif["Features"] = variables.columns

NameError: name 'ols_data_reorder' is not defined

In [None]:
vif

## Checking OLS Assumption

In [None]:
# 
x_scaled_ols = pd.DataFrame(x_scaled[ : , :3], columns= ["smoker", "age", "children"])

In [None]:
ols_data = pd.concat([x_scaled_ols, y_log], axis = 1)

In [None]:
ols_data_reorder = ols_data[["charges_log", "smoker", "age", "children"]]

ols_data_reorder

In [None]:
# Create a figure with 3 subplots in 1 row
fig, axes = plt.subplots(1, 3, figsize = (16, 5), sharey=True) 

# Scatter plot: age vs bmi
sns.scatterplot(data=ols_data_reorder, x='smoker', y='charges_log', ax=axes[0], color='orange')
axes[0].set_title('Smoker vs Charges', fontsize=14, weight='bold')
axes[0].set_xlabel('Smoker', fontsize=12, weight='normal')
axes[0].set_ylabel('Charges (€)', fontsize=12, weight='normal')

# Scatter plot: children vs bmi
sns.scatterplot(data=ols_data_reorder, x='age', y='charges_log', ax=axes[1], color='green')
axes[1].set_title('Age vs Charges', fontsize=14, weight='bold')
axes[1].set_xlabel('Age', fontsize=12, weight='normal')

# Scatter plot: children vs age_log
sns.scatterplot(data=ols_data_reorder, x='children', y='charges_log', ax=axes[2], color='magenta')
axes[2].set_title('Children vs Charges', fontsize=14, weight='bold')
axes[2].set_xlabel('Children', fontsize=12, weight='normal')

# Adjust layout
plt.tight_layout()
plt.show()

## Feature Engineering

In [21]:
# Categorize BMI
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'under_weight'
    elif 18.5 <= bmi < 25:
        return 'normal_weight'
    elif 25 <= bmi < 30:
        return 'over_weight'
    else:
        return 'obese'

# Apply function to bmi column
df['bmi_category'] = df['bmi'].apply(categorize_bmi)

In [22]:
# Categorize age category
def categorize_age(age):
    if 18 < age < 26:
        return 'young_adult'
    elif 26 <= age < 36:
        return 'early_adulthood'
    elif 36 <= age < 46:
        return 'mid_adulthood'
    else:
        return 'late_adulthood'
    
# Apply function to age column
df['age_category'] = df['age'].apply(categorize_age)

In [23]:
# Create new columns
#
# "child_stat": indicates whether a client has children (regardless of their number)
# "not_smoker": indicates the client is not a smoker 
# "children_str": represents number of children as strings
df['child_stat'] = df['children'].apply(lambda x : 0 if x == 0 else 1)
df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)
df['children_str'] = df['children'].apply(lambda x: str(x))

In [24]:
# Define features and targets
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [25]:
# One-Hot Encode 
X = pd.get_dummies(X, columns=['region', 'age_category', 'children_str', 'bmi_category']).astype(int)

In [26]:
# Review columns
pd.set_option("display.max_columns", None)

# Check first four rows
X.head(4)

Unnamed: 0,sex,smoker,age,children,bmi,child_stat,not_smoker,region_northeast,region_northwest,region_southeast,region_southwest,age_category_early_adulthood,age_category_late_adulthood,age_category_mid_adulthood,age_category_young_adult,children_str_0,children_str_1,children_str_2,children_str_3,children_str_4,children_str_5,bmi_category_normal_weight,bmi_category_obese,bmi_category_over_weight,bmi_category_under_weight
0,0,1,19,0,27,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0
1,1,0,18,1,33,1,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,1,0,28,3,33,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
3,1,0,33,0,22,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0


In [27]:
# Confirm feature shape
X.shape

# Confirm target shape
y.shape

(1337, 1)

## Model Training

### Lasso 

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV


# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lasso', Lasso())
])

# GridSearchCV

params = {"lasso__alpha": np.arange(1, 20),
     "lasso__selection": ["cyclic", "random"]}

grid_lasso = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lasso.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lasso.best_params_)
print("Best Score:", grid_lasso.best_score_)

TypeError: Parameter grid for parameter 'lasso__alpha' needs to be a list or a numpy array, but got 19 (of type int) instead. Single values need to be wrapped in a list with one element.

In [16]:
# Test model 
grid_lasso.score(x_test, y_test)

0.9272074897896826

In [18]:
# Save best estimator
model = grid_lasso.best_estimator_

In [19]:
# Export model as .pkl file
import pickle

with open("model.pkl", 'wb') as file:
    pickle.dump(model, file)

### Linear SVR

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi', 
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lsvr', LinearSVR())
])

# GridSearchCV
params = {'lsvr__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
              'lsvr__epsilon': np.arange(0.0, 15),
              'lsvr__random_state' : np.arange(35, 50),
             }

grid_svr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_svr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_svr.best_params_)
print("Best Score:", grid_svr.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Params: {'lsvr__epsilon': np.float64(0.0), 'lsvr__loss': 'squared_epsilon_insensitive', 'lsvr__random_state': np.int64(45)}
Best Score: 0.8491288488352451


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
grid_svr.score(x_test, y_test)

0.9266549314137009

### Linear Regression

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age', 	'bmi',
                    'bmi_category_obese', 'bmi_category_over_weight',
                    'age_category_early_adulthood',	
                    'children_str_0',]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lr', LinearRegression())
])

# GridSearchCV

params = {'lr__fit_intercept' : [True]}

grid_lr = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_lr.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_lr.best_params_)
print("Best Score:", grid_lr.best_score_)

Best Params: {'lr__fit_intercept': True}
Best Score: 0.8490763231502658


In [44]:
grid_lr.score(x_test, y_test)

0.9266593214087172