In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the insurance CSV into the working frame `df`.
csv_path = '/content/insurance.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to check the columns and their raw values.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Per-column count of missing values (`isna` performs the same check as `isnull`).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Label Encoding with scikit-learn's `LabelEncoder`

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Encoder instance; it is fitted on the `sex` column in the next cell.
le = LabelEncoder()

In [10]:
# Fit the encoder on `sex` and overwrite the column with the integer codes.
# LabelEncoder works on a 1-D array of labels, so pass the Series `df['sex']`
# rather than the one-column DataFrame `df[['sex']]`: the 2-D form only works
# through an implicit reshape that emits a DataConversionWarning (hidden in
# this notebook by the blanket warnings filter).
df['sex'] = le.fit_transform(df['sex'])

In [11]:
# Check the result: `sex` should now hold integer codes instead of strings.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [13]:
# Labels the encoder learned during fitting.
le.classes_

array(['female', 'male'], dtype=object)

In [14]:
# Encode the learned labels themselves to display the code assigned to each one.
le.transform(le.classes_)

array([0, 1])

# decode classes


In [15]:
# Map the integer codes in `sex` back to their original string labels.
decoded_classes = le.inverse_transform(df['sex'])
decoded_classes

array(['female', 'male', 'male', ..., 'female', 'female', 'female'],
      dtype=object)

In [16]:
# The decoding above was stored in `decoded_classes` only, so `sex` in `df`
# still holds the integer codes.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [17]:
# df.sex = decoded_classes

# Label Encoding with Label mapping


In [18]:
# Distinct region names, in order of first appearance.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [19]:
# Integer code for each region name, numbered in the order listed.
region_names = ['southwest', 'southeast', 'northwest', 'northeast']
label_mapping = {name: code for code, name in enumerate(region_names)}

In [20]:
# Display the mapping to confirm its contents.
label_mapping

{'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}

In [21]:
# The region names are the dictionary keys.
label_mapping.keys()

dict_keys(['southwest', 'southeast', 'northwest', 'northeast'])

In [22]:
# The integer codes are the dictionary values.
label_mapping.values()

dict_values([0, 1, 2, 3])

In [25]:
# Swap each region name for its integer code from `label_mapping`. Indexing
# the dict directly means an unmapped name raises KeyError instead of
# silently becoming NaN.
df['region'] = df['region'].map(lambda name: label_mapping[name])

In [26]:
# Check the result: `region` should now hold the integer codes from `label_mapping`.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,0,16884.924
1,18,1,33.77,1,no,1,1725.5523
2,28,1,33.0,3,no,1,4449.462
3,33,1,22.705,0,no,2,21984.47061
4,32,1,28.88,0,no,2,3866.8552


# Alternative: apply the same mapping with `Series.replace`


In [27]:
# df['region'] = df['region'].replace(label_mapping)

# One-Hot Encoding with `pd.get_dummies`

In [28]:
# Reload the CSV into a separate frame, `df2`, so the examples below start
# from the original string columns.
csv_path = '/content/insurance.csv'
df2 = pd.read_csv(csv_path)

In [29]:
# Confirm the reloaded frame still has the original string categories.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [31]:
# One indicator column per region value; show only the first five rows.
region_indicators = pd.get_dummies(df2['region'])
region_indicators.head()

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [33]:
# One-hot encode `region`; each indicator column is named `region_<value>`.
dummy = pd.get_dummies(
    df2['region'],
    prefix='region',
    prefix_sep='_',
)
dummy

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [34]:
# Remove the original `region` column now that `dummy` holds its one-hot form.
# `df2` is rebound to the frame returned by `drop` instead of using
# `inplace=True`, so the frame shown by earlier cells is not mutated in place.
# A missing `region` column still raises KeyError.
df2 = df2.drop(columns='region')


In [35]:
# Place the indicator columns to the right of the remaining columns of `df2`.
frames = [df2, dummy]
new_df2 = pd.concat(frames, axis='columns')

In [36]:
# Check the result: the `region_*` indicator columns should follow `charges`.
new_df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,16884.924,0,0,0,1
1,18,male,33.77,1,no,1725.5523,0,0,1,0
2,28,male,33.0,3,no,4449.462,0,0,1,0
3,33,male,22.705,0,no,21984.47061,0,1,0,0
4,32,male,28.88,0,no,3866.8552,0,1,0,0


# Ordinal Encoder

In [37]:
# Distinct values of `sex`, used to choose the category order for the encoder.
df2['sex'].unique()

array(['female', 'male'], dtype=object)

In [38]:
from sklearn.preprocessing import OrdinalEncoder


In [39]:
# Fix the category order explicitly: 'female' comes first, 'male' second.
sex_categories = ['female', 'male']
ordinal = OrdinalEncoder(categories=[sex_categories])


In [41]:
# OrdinalEncoder takes 2-D input, hence the one-column frame `df2[['sex']]`.
# The codes it returns overwrite the `sex` column.
sex_codes = ordinal.fit_transform(df2[['sex']])
df2['sex'] = sex_codes

In [42]:
# Check the result: `sex` should now hold the encoder's numeric codes.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,0.0,27.9,0,yes,16884.924
1,18,1.0,33.77,1,no,1725.5523
2,28,1.0,33.0,3,no,4449.462
3,33,1.0,22.705,0,no,21984.47061
4,32,1.0,28.88,0,no,3866.8552
