In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the insurance data; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('insurance.csv')

In [3]:
# Preview the first rows to check the column names and value formats.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# Pairwise correlation between the numeric columns. The text columns
# (sex, smoker, region) are filtered out explicitly: recent pandas versions
# no longer drop them silently and raise on non-numeric data instead.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [6]:
# Per-column flag: True when the column contains at least one missing value.
dataset.isna().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
def one_hot_encoder_one(data,feature,keep_first=True):
    """Replace one categorical column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified; a new frame is
        returned.
    feature : str
        Name of the column to encode.
    keep_first : bool, default True
        When False, the indicator of the first category level is left out,
        so the remaining indicators are not perfectly collinear.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``feature``, followed by float 0.0/1.0 columns named
        ``{feature}_{category}``.
    """
    # pd.get_dummies keeps the caller's index, so the indicator rows line up
    # with `data` even when its index is not 0..n-1, and it prefixes the full
    # category label, so labels that themselves contain "_" are not truncated.
    indicators = pd.get_dummies(
        data[feature],
        prefix=str(feature),
        dtype=float,
        # Drops only the first indicator column, never an unrelated column
        # of `data`.
        drop_first=not keep_first,
    )

    return pd.concat([data.drop(feature, axis=1), indicators], axis=1)

In [9]:
# Set the int/float columns aside so that only the non-numeric columns remain.
numeric_columns = dataset.select_dtypes(include=['int', 'float']).columns
encoded_set = dataset.drop(numeric_columns, axis=1)

# One-hot encode every remaining text column, one column at a time.
categorical_columns = encoded_set.select_dtypes(include='object').columns
for categorical_column in categorical_columns:
    encoded_set = one_hot_encoder_one(encoded_set, categorical_column)



In [10]:
# Display the one-hot encoded categorical columns.
encoded_set

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1333,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
def scale_feature(data,feature):
    """Standardize one numeric column and return it as ``scaled_<feature>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``; it is not modified.
    feature : str
        Name of the numeric column to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``feature`` is removed and its standardized
        values are appended as the last column, ``scaled_<feature>``.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(data[[feature]])

    # Reuse the caller's index: with a default 0..n-1 index pd.concat would
    # align on labels and produce NaN-padded rows whenever `data` has been
    # filtered, shuffled or otherwise carries a non-default index.
    scaled_df = pd.DataFrame(
        standardized,
        index=data.index,
        columns=[f'scaled_{feature}'],
    )

    return pd.concat([data.drop([feature], axis=1), scaled_df], axis=1)
    

In [13]:
# Set the text columns aside so that only the numeric columns remain.
text_columns = dataset.select_dtypes(include=['object']).columns
scaled_set = dataset.drop(text_columns, axis=1)

# Standardize every remaining int/float column, one column at a time.
numeric_columns = scaled_set.select_dtypes(include=['int', 'float']).columns
for numeric_column in numeric_columns:
    scaled_set = scale_feature(scaled_set, numeric_column)

In [14]:
# Put the encoded categorical columns and the scaled numeric columns side by
# side; with axis=1 the rows are matched on the index labels.
new_dataset = pd.concat([encoded_set, scaled_set], axis=1)

In [15]:
# Display the combined table of encoded and scaled columns.
new_dataset

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,scaled_age,scaled_bmi,scaled_children,scaled_charges
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.438764,-0.453320,-0.908614,0.298584
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-1.509965,0.509621,-0.078767,-0.953689
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.797954,0.383307,1.580926,-0.728675
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.441948,-1.305531,-0.908614,0.719843
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.513149,-0.292556,-0.908614,-0.776802
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.768473,0.050297,1.580926,-0.220551
1334,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.509965,0.206139,-0.908614,-0.914002
1335,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-1.509965,1.014878,-0.908614,-0.961596
1336,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.296362,-0.797813,-0.908614,-0.930362
