In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
import warnings
warnings.filterwarnings("ignore")

In [4]:
dataset = pd.read_excel("insurance.xlsx")

In [5]:
dataset.head(6)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# <span style="color:blue">Handling the Null Value</span>

In [7]:
dataset.isnull().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

#### <span style="color:green">Handling the null value not required</span>.

## Making multiple copies

In [8]:
dataset2 = dataset.copy()
dataset3 = dataset.copy()
dataset4 = dataset.copy()
dataset5 = dataset.copy()
dataset6 = dataset.copy()
dataset7 = dataset.copy()
dataset8 = dataset.copy()
dataset9 = dataset.copy()
dataset10 = dataset.copy()
dataset11 = dataset.copy()
dataset12 = dataset.copy()
dataset13 = dataset.copy()

# <span style="color:blue">Replacing the values instead of Encoding</span>

In [9]:
dataset.head(6)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216


In [10]:
# need to change gender, smoker, region

In [11]:
gender1 = dataset.gender.unique()
gender1

array(['female', 'male'], dtype=object)

In [12]:
dataset.gender = dataset.gender.replace(['female', 'male'],[2,1])

In [13]:
#female = 2 ||| male = 1

In [14]:
dataset.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,2,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [15]:
dataset.smoker.unique()

array(['yes', 'no'], dtype=object)

In [16]:
dataset.smoker = dataset.smoker.replace(['yes', 'no'], [1,0])

In [17]:
#yes = 1 ||| no = 0

In [18]:
dataset.head(6)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,2,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552
5,31,2,25.74,0,0,southeast,3756.6216


In [19]:
dataset.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [20]:
dataset.region = dataset.region.replace(['southwest', 'southeast',
                 'northwest', 'northeast'], [1,2,3,4])

In [21]:
# southwest = 1 ||| southeast = 2 ||| northwest = 3 ||| northeast = 4

In [22]:
dataset.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,2,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


# <span style="color:blue">Label Encoding</span>

### manually

In [23]:
le = LabelEncoder()

In [24]:
dataset2.head(6)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216


In [25]:
dataset2.gender = le.fit_transform(dataset2.gender)
dataset2.smoker = le.fit_transform(dataset2.smoker)
dataset2.region = le.fit_transform(dataset2.region)

In [26]:
dataset2.head(10)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552
5,31,0,25.74,0,0,2,3756.6216
6,46,0,33.44,1,0,2,8240.5896
7,37,0,27.74,3,0,1,7281.5056
8,37,1,29.83,2,0,0,6406.4107
9,60,0,25.84,0,0,1,28923.13692


### using loop

In [27]:
dataset3.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [28]:
for col in dataset3.columns:
    if dataset3[col].dtype == np.number:
        continue
    else:
        dataset3[col] = le.fit_transform(dataset3[col])

In [29]:
dataset3.head(10)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552
5,13,0,25.74,0,0,2,3756.6216
6,28,0,33.44,1,0,2,8240.5896
7,19,0,27.74,3,0,1,7281.5056
8,19,1,29.83,2,0,0,6406.4107
9,42,0,25.84,0,0,1,28923.13692


# <span style="color:blue">One Hot Encoding</span>

In [30]:
dataset4.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [31]:
dummy = pd.get_dummies(dataset4['gender'], columns = dataset4.gender.unique)
dummy2 = pd.get_dummies(dataset4['smoker'], columns = dataset4.smoker.unique)
dummy3 = pd.get_dummies(dataset4['region'], columns = dataset4.region.unique)

In [32]:
dummy = dummy.replace({True:1, False:0})
dummy2 = dummy2.replace({True:1, False:0})
dummy3 = dummy3.replace({True:1, False:0})

In [33]:
dummy.head(10)

Unnamed: 0,female,male
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
5,1,0
6,1,0
7,1,0
8,0,1
9,1,0


In [34]:
dummy2.head(10)

Unnamed: 0,no,yes
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [35]:
dummy3.head(10)

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
5,0,0,1,0
6,0,0,1,0
7,0,1,0,0
8,1,0,0,0
9,0,1,0,0


In [None]:
dataset13 = pd.read_excel("insurance.xlsx")
dataset13.head()

In [40]:
for cols in dataset13.columns:
    if dataset13[cols].dtype != np.number:
        dataset13.drop(cols, axis = 1, inplace=True)
    #else:
      #  dataset4.drop(cols, axis = 1, inplace=True)

In [41]:
dataset13.head()

Unnamed: 0,bmi,charges
0,27.9,16884.924
1,33.77,1725.5523
2,33.0,4449.462
3,22.705,21984.47061
4,28.88,3866.8552


# <span style="color:blue">Ordinal Encoding</span>

### manually

In [42]:
dataset7.head(6)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216


In [51]:
gender = dataset7.gender.unique()
smoker = dataset7.smoker.unique()
region = dataset7.region.unique()
gender, smoker, region

(array(['female', 'male'], dtype=object),
 array(['yes', 'no'], dtype=object),
 array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object))

In [52]:
oeG = OrdinalEncoder(categories = [gender])
oeS = OrdinalEncoder(categories = [smoker])
oeR = OrdinalEncoder(categories = [region])

In [54]:
encoded_gender = oeG.fit_transform(dataset7[['gender']])
encoded_smoker = oeS.fit_transform(dataset7[['smoker']])
encoded_region = oeR.fit_transform(dataset7[['region']])

In [63]:
encoded_gender[0:6]

array([[0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.]])

In [59]:
encoded_smoker[0:6]

array([[0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [61]:
encoded_region[0:6]

array([[0.],
       [1.],
       [1.],
       [2.],
       [2.],
       [1.]])

In [64]:
dataset7 = pd.concat([dataset7.drop('gender', axis = 1), encoded_gender]
                     , axis = 1)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid