In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
df_load = pd.read_csv("cleaned_insurance_data.csv")

df_load.drop("Unnamed: 0", inplace=True, axis=1)

In [3]:
df = df_load.drop(["sex", "region", "bmi"], axis = 1)

In [4]:
df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)

df['age_children_interaction'] = df['age'] * df['children']
df['dependency_ratio'] = df['children'] / (df['age'] + 1)

df['smoker_child_interaction'] = df['children'] * (df['smoker'] == 1)
df['non_smoker_child_interaction'] = df['children'] * (df['not_smoker'] == 1)

df['smoker_age_interaction'] = df['smoker'] * df['age']
df['not_smoker_age_interaction'] = df['not_smoker'] * df['age']

In [5]:
df.head(4)

Unnamed: 0,age,children,smoker,charges,not_smoker,age_children_interaction,dependency_ratio,smoker_child_interaction,non_smoker_child_interaction,smoker_age_interaction,not_smoker_age_interaction
0,19,0,1,16884.92,0,0,0.0,0,0,19,0
1,18,1,0,1725.55,1,18,0.052632,0,1,0,18
2,28,3,0,4449.46,1,84,0.103448,0,3,0,28
3,33,0,0,21984.47,1,0,0.0,0,0,0,33


In [6]:
df = df[['charges', 'smoker', 'age', 'children', 'not_smoker', 'age_children_interaction', 'dependency_ratio',
         'smoker_child_interaction', 'non_smoker_child_interaction', 'smoker_age_interaction', 'not_smoker_age_interaction']]

In [7]:
df.head(3)

Unnamed: 0,charges,smoker,age,children,not_smoker,age_children_interaction,dependency_ratio,smoker_child_interaction,non_smoker_child_interaction,smoker_age_interaction,not_smoker_age_interaction
0,16884.92,1,19,0,0,0,0.0,0,0,19,0
1,1725.55,0,18,1,1,18,0.052632,0,1,0,18
2,4449.46,0,28,3,1,84,0.103448,0,3,0,28


In [8]:
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [9]:
from sklearn.model_selection import  train_test_split

x_train, y_train, x_test, y_test = train_test_split(X, y, test_size=0.2, stratify= X['smoker'], random_state=35)

In [10]:
from sklearn.preprocessing import PowerTransformer

# Transform both features and target
pt = PowerTransformer(method='yeo-johnson')
x_train_transformed = pt.fit_transform(x_train)
y_train_transformed = pt.fit_transform(y_train.values.reshape(-1, 1))


In [11]:
x_train_transformed.shape

(1060, 10)

In [13]:
x_train_transformed

array([[-0.49557383,  0.66183229, -1.0735895 , ..., -0.89624465,
        -0.49556673,  0.79879433],
       [-0.49557383,  1.40159341, -1.0735895 , ..., -0.89624465,
        -0.49556673,  1.25906385],
       [-0.49557383,  0.66183229, -1.0735895 , ..., -0.89624465,
        -0.49556673,  0.79879433],
       ...,
       [-0.49557383,  0.26508558,  0.29477096, ...,  0.66468152,
        -0.49556673,  0.5572777 ],
       [ 2.01786281,  0.33266523,  0.29477096, ..., -0.89624465,
         2.02488657, -1.7088022 ],
       [ 2.01786281, -0.67949943, -1.0735895 , ..., -0.89624465,
         2.01449095, -1.7088022 ]], shape=(1060, 10))

In [12]:
y_train.shape

(266, 10)