In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
# Seed NumPy's global random number generator so any NumPy-based randomness
# later in the notebook is repeatable across runs.
np.random.seed(42)

In [3]:
# Load the dataset from a CSV file located relative to the working directory,
# then preview the first rows (head() shows five by default).
med_cost = pd.read_csv('insurance.csv')
med_cost.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summarise column dtypes and non-null counts for the freshly loaded frame.
med_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Summary statistics; by default describe() covers only the numeric columns,
# so the object columns (sex, smoker, region) are omitted here.
med_cost.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# For an object column describe() reports the count, the number of distinct
# values, the most frequent value and how often it occurs.
med_cost['smoker'].describe()

count     1338
unique       2
top         no
freq      1064
Name: smoker, dtype: object

In [7]:
# Split predictors from the target before any encoding is applied.
# drop() returns a new frame, so `features_raw` keeps the original string
# columns even though `med_cost` is re-encoded further down.
features_raw = med_cost.drop(columns='charges')
charges_raw = med_cost['charges']

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-codes the labels of a single column; reused below for the two-valued
# `smoker` and `sex` columns.
labEnc = LabelEncoder()

# Request a dense ndarray (not a scipy sparse matrix) so the result can be
# wrapped directly in a DataFrame.  The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back for older installations.
try:
    ohEnc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohEnc = OneHotEncoder(sparse=False)

In [9]:
# Encode `smoker` as integers.  LabelEncoder assigns codes in sorted label
# order, so 'no' -> 0 and 'yes' -> 1.
# sep='\n' puts each label on its own line; with the default separator the
# first value_counts row is printed next to the label and the table is misaligned.
print('Before:', med_cost['smoker'].value_counts(), sep='\n')
med_cost['smoker'] = labEnc.fit_transform(med_cost['smoker'])
print('After:', med_cost['smoker'].value_counts(), sep='\n')

Before: no     1064
yes     274
Name: smoker, dtype: int64
After: 0    1064
1     274
Name: smoker, dtype: int64


In [10]:
# Encode `sex` as integers ('female' -> 0, 'male' -> 1, i.e. sorted label order).
# Refitting `labEnc` here replaces whatever classes it learned from a
# previously fitted column.
# sep='\n' replaces the old 'Before:\n' + default-space separator, which
# indented the first value_counts row by one character.
print('Before:', med_cost['sex'].value_counts(), sep='\n')
med_cost['sex'] = labEnc.fit_transform(med_cost['sex'])
print('After:', med_cost['sex'].value_counts(), sep='\n')

Before:
 male      676
female    662
Name: sex, dtype: int64
After:
 1    676
0    662
Name: sex, dtype: int64


In [11]:
# One-hot encode `region`.  The indicator columns keep their positional integer
# names (0..3); the category each one stands for is given, in the same order,
# by `ohEnc.categories_[0]` (sorted: northeast, northwest, southeast, southwest).
# The original `region` column is intentionally left in the frame.
print('Before:', med_cost['region'].value_counts(), sep='\n')

transformed_regions = pd.DataFrame(
    ohEnc.fit_transform(med_cost[['region']]),
    # Reuse the frame's own index so the join below lines rows up correctly
    # even if `med_cost` no longer has a default 0..n-1 index.
    index=med_cost.index,
)

# Remove indicator columns left behind by an earlier execution of this cell;
# without this, re-running the cell fails because join() refuses overlapping
# column names when no suffix is given.
med_cost = (
    med_cost
    .drop(columns=list(transformed_regions.columns), errors='ignore')
    .join(transformed_regions)
)

Before:
 southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [12]:
# Preview the frame after the encoding cells: `sex` and `smoker` are now
# integers, the one-hot indicator columns are labelled 0-3, and the original
# `region` column is still present.
med_cost.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,0,1,2,3
0,19,0,27.9,0,1,southwest,16884.924,0.0,0.0,0.0,1.0
1,18,1,33.77,1,0,southeast,1725.5523,0.0,0.0,1.0,0.0
2,28,1,33.0,3,0,southeast,4449.462,0.0,0.0,1.0,0.0
3,33,1,22.705,0,0,northwest,21984.47061,0.0,1.0,0.0,0.0
4,32,1,28.88,0,0,northwest,3866.8552,0.0,1.0,0.0,0.0


In [13]:
# Re-check dtypes after encoding; `region` is the only column still stored as object.
med_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
 7   0         1338 non-null   float64
 8   1         1338 non-null   float64
 9   2         1338 non-null   float64
 10  3         1338 non-null   float64
dtypes: float64(6), int64(4), object(1)
memory usage: 115.1+ KB


In [14]:
# Summary statistics again; `sex`, `smoker` and the one-hot indicator columns
# are numeric now and therefore included, so their means read as proportions.
med_cost.describe()

Unnamed: 0,age,sex,bmi,children,smoker,charges,0,1,2,3
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,13270.422265,0.242152,0.2429,0.272048,0.2429
std,14.04996,0.50016,6.098187,1.205493,0.403694,12110.011237,0.428546,0.428995,0.445181,0.428995
min,18.0,0.0,15.96,0.0,0.0,1121.8739,0.0,0.0,0.0,0.0
25%,27.0,0.0,26.29625,0.0,0.0,4740.28715,0.0,0.0,0.0,0.0
50%,39.0,1.0,30.4,1.0,0.0,9382.033,0.0,0.0,0.0,0.0
75%,51.0,1.0,34.69375,2.0,0.0,16639.912515,0.0,0.0,1.0,0.0
max,64.0,1.0,53.13,5.0,1.0,63770.42801,1.0,1.0,1.0,1.0
