In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
df_load = pd.read_csv("cleaned_insurance_data.csv")

df_load.drop("Unnamed: 0", inplace=True, axis=1)

In [3]:
df_load.drop_duplicates(inplace=True, keep='first')

In [4]:
df = df_load.drop(["sex", "region"], axis = 1)

In [5]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'under_weight'
    elif 18.5 <= bmi < 25:
        return 'normal_weight'
    elif 25 <= bmi < 30:
        return 'over_weight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(categorize_bmi)

In [6]:
df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)

df['age_children'] = df['age'] * df['children']

df['smoker_child'] = df['children'] * (df['smoker'] == 1)
df['non_smoker_child'] = df['children'] * (df['not_smoker'] == 1)

df['smoker_age'] = df['smoker'] * df['age']
df['not_smoker_age'] = df['not_smoker'] * df['age']

In [7]:
df['child_stat'] = df['children'].apply(lambda x : 0 if x == 0 else 1)

In [8]:
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [9]:
X = pd.get_dummies(X, columns=['bmi_category']).astype(int)

In [16]:
X

Unnamed: 0,age,bmi,children,smoker,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,child_stat,bmi_category_normal_weight,bmi_category_obese,bmi_category_over_weight,bmi_category_under_weight
0,19,27,0,1,0,0,0,0,19,0,0,0,0,1,0
1,18,33,1,0,1,18,0,1,0,18,1,0,1,0,0
2,28,33,3,0,1,84,0,3,0,28,1,0,1,0,0
3,33,22,0,0,1,0,0,0,0,33,0,1,0,0,0
4,32,28,0,0,1,0,0,0,0,32,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,50,30,3,0,1,150,0,3,0,50,1,0,1,0,0
1322,18,31,0,0,1,0,0,0,0,18,0,0,1,0,0
1323,18,36,0,0,1,0,0,0,0,18,0,0,1,0,0
1324,21,25,0,0,1,0,0,0,0,21,0,0,0,1,0


In [19]:
X_final = X[['smoker', 'age', 'smoker_child',
            'bmi_category_obese', 'bmi_category_over_weight',
            'smoker_age', 'not_smoker_age', 'non_smoker_child', 'child_stat']]

In [None]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer

log_transformer = FunctionTransformer(np.log1p, validate=True)

y_final = log_transformer.fit_transform(y)

In [6]:
# df = df[['charges', 'smoker', 'age', 'children', 'not_smoker', 'age_children_interaction', 'dependency_ratio',
#          'smoker_child_interaction', 'non_smoker_child_interaction', 'smoker_age_interaction', 'not_smoker_age_interaction']]

In [7]:
# X = df.drop("charges", axis=1).values
# y = df["charges"].values.reshape(-1, 1)

In [None]:
# from sklearn.preprocessing import PowerTransformer

# # Transform both train features and target
# pt = PowerTransformer(method='yeo-johnson')
# x_train_transformed = pt.fit_transform(x_train)
# y_train_transformed = pt.fit_transform(y_train.values.reshape(-1, 1))

In [None]:
# # Transform both test features and target
# x_test_transformed = pt.fit_transform(x_test)
# y_test_transformed = pt.fit_transform(y_test.values.reshape(-1, 1))

In [68]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.svm import LinearSVR

x_train, x_test, y_train, y_test = train_test_split(X_final, y, test_size=0.15, random_state=42, stratify=X_final['smoker'])


model_pipeline = make_pipeline(PolynomialFeatures(degree=2),
                               StandardScaler(),
                               ElasticNet())

params = {'elasticnet__selection': ['cyclic', 'random']
             }

grid = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid.fit(x_train, y_train)

In [69]:
grid.best_score_

np.float64(0.8475453927100883)

In [70]:
grid.score(x_test, y_test)

0.7505189730139534

In [21]:
# check = pd.DataFrame(x_train_transformed, columns= ['smoker', 'age', 'children', 'not_smoker', 'age_children_interaction', 'dependency_ratio',
#          'smoker_child_interaction', 'non_smoker_child_interaction', 'smoker_age_interaction', 'not_smoker_age_interaction'])

In [None]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

x_train_scaled = scaler.fit_transform(x_train_transformed)
x_test_scaled = scaler.fit_transform(x_test_transformed)