In [2]:
# importing all required modules
import numpy as np
import pandas as pd

In [3]:
# Load the insurance dataset from the CSV file in the working directory.
df = pd.read_csv('insurance.csv')
# Summarise the frame: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df.nunique(axis=0)  # number of distinct values in each column (feature)

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [5]:
df.isna().sum()  # count of missing values per column (isna is the same check as isnull)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
df.head(n=5)  # preview the first five rows of the dataframe

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Label Encoding

In [7]:
# Label-encoding setup: bring in scikit-learn's LabelEncoder and build one encoder instance.
from sklearn.preprocessing import LabelEncoder # import the LabelEncoder class
lben = LabelEncoder() # encoder object; it is (re)fitted by each fit_transform call that uses it

In [8]:
# Label-encode a single column.
# assign() builds a new frame, so the original `df` stays untouched and the
# encoded 'sex' column keeps its original position.
df1 = df.assign(sex=lben.fit_transform(df['sex']))
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [9]:
# Label-encode every categorical (object-dtype) column.
df2 = df.copy()  # work on a copy so the original frame is preserved
cols = df2.select_dtypes(include='object')  # sub-frame holding only the object-dtype columns
for col in cols.columns:
    # lben is refit on each pass, so once the loop ends it only holds the
    # classes of the last column it was fitted on.
    df2[col] = lben.fit_transform(df2[col])
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One-Hot Encoding

In [10]:
# One-hot encode a single column.
# drop_first=True discards the first level, so only the remaining indicator column(s) are kept.
dummy = pd.get_dummies(df['smoker'], prefix='smoker', drop_first=True)
# Build the result from `df` without its 'smoker' column (df itself is not modified),
# then append the indicator column(s) on the right.
df3 = pd.concat([df.drop(columns='smoker'), dummy], axis=1)
df3.head()

Unnamed: 0,age,sex,bmi,children,region,charges,smoker_yes
0,19,female,27.9,0,southwest,16884.924,1
1,18,male,33.77,1,southeast,1725.5523,0
2,28,male,33.0,3,southeast,4449.462,0
3,33,male,22.705,0,northwest,21984.47061,0
4,32,male,28.88,0,northwest,3866.8552,0


In [11]:
# One-hot encode every categorical (object-dtype) column.
df4 = df.copy()  # work on a copy so the original frame is preserved
cols = df4.select_dtypes(include='object')  # sub-frame holding only the object-dtype columns
# A single get_dummies call replaces each listed column with its indicator columns
# (prefixed with the source column name, first level dropped) and appends them after
# the remaining columns. It reads from df4 itself rather than from `df`, and avoids
# the in-place drop and repeated concat inside a loop.
df4 = pd.get_dummies(df4, columns=list(cols.columns), drop_first=True)
df4.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Ordinal Encoding

In [12]:
from sklearn.preprocessing import OrdinalEncoder # importing Ordinal Encoder modules

In [13]:
# Ordinal-encode a single column.
df5 = df.copy()  # work on a copy so the original frame is preserved
unique = df5['region'].unique()  # distinct region values, in order of first appearance
# Passing `unique` as the category list makes the codes follow that order.
# Only the 'region' column is encoded here; fit_transform expects a 2-D input,
# hence the double brackets.
region_codes = OrdinalEncoder(categories=[unique]).fit_transform(df5[['region']])
df5['region'] = region_codes
df5.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552


In [14]:
# Ordinal-encode every categorical (object-dtype) column.
df6 = df.copy()  # work on a copy so the original frame is preserved
cols = df6.select_dtypes(include='object')  # sub-frame holding only the object-dtype columns
for col in cols.columns:
    unique_value = df6[col].unique()  # distinct values of this column, in order of first appearance
    # A fresh encoder per column, with the category order fixed to `unique_value`.
    encoder = OrdinalEncoder(categories=[unique_value])
    df6[col] = encoder.fit_transform(df6[[col]])  # 2-D input required, hence [[col]]
df6.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


# Replacing Method

In [15]:
# Manual encoding of selected columns with explicit value -> code mappings.
df7 = df.copy()  # work on a copy so the original frame is preserved
# Series.map with a dict is used instead of Series.replace: replacing strings with
# integers relies on pandas silently downcasting the object column, which newer
# pandas versions deprecate. map() yields an integer column directly.
# Note: any value missing from a mapping would become NaN, so each dict must list
# every category present in its column.
df7['sex'] = df7['sex'].map({'female': 0, 'male': 1})
df7['smoker'] = df7['smoker'].map({'no': 0, 'yes': 1})
df7['region'] = df7['region'].map({'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3})
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552
