In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load seaborn's bundled "tips" example dataset into a DataFrame.
df = sns.load_dataset("tips")

In [3]:
# Preview the first five rows of the tips data.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### One-hot encoding

<font size=4>
One-hot encoding is a technique used in machine learning to convert categorical variables into a format that machine learning models can understand.

<font size=4>
Machine learning models typically work with numerical data. One-hot encoding addresses this by creating a new binary variable for each unique category from the original categorical variable.
<br>These binary variables for each unique category are called dummy variables.
<br> For each row, exactly one of the new dummy variables has a value of 1, indicating the category that row belongs to, and all the other dummy variables are 0.

<font size = 4>
    
**When to use One-hot encoding?**<br>
One-hot encoding should be used for a nominal categorical variable (a categorical variable with no inherent order or rank between its categories).

**Disadvantages of One-hot encoding-**<br>
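High dimensionality: A categorical variable with many categories leads to the creation of a large number of columns, which can impact a machine learning model's performance (Curse of Dimensionality).<br>
Dummy variable trap: If all n dummy variables of a categorical variable are kept, they always sum to 1, so each one can be predicted exactly from the others (perfect multicollinearity), which is a problem for linear models.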

**Note**
After creating dummy variables for the n unique categories of a categorical variable, drop one of the n dummy variables, leaving (n-1) dummy variables. No information is lost: the dropped category is the case where all the remaining dummy variables are 0.

In [4]:
# List each column's dtype to find the categorical columns.
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

We have 4 categorical variables, namely 'sex','smoker','day','time'

In [5]:
df["sex"].unique()  # 'sex' is nominal: its categories have no inherent order

['Female', 'Male']
Categories (2, object): ['Male', 'Female']

In [6]:
df["smoker"].unique()  # 'smoker' is nominal: its categories have no inherent order

['No', 'Yes']
Categories (2, object): ['Yes', 'No']

In [7]:
df["day"].unique()  # 'day' is treated as ordinal (Thur < Fri < Sat < Sun)

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [8]:
df["time"].unique()  # 'time' is also treated as ordinal here (Lunch before Dinner)

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

We can apply one-hot encoding to 'sex' and 'smoker' as they are nominal categorical variables

In [9]:
# One-hot encode the nominal columns; every other column passes through unchanged.
# dtype is pinned to "uint8" because pandas >= 2.0 creates bool dummies by
# default, which would display as True/False instead of 0/1.
# dtype="int" also works if a wider integer type is preferred.
df_dummy = pd.get_dummies(df, columns=["sex", "smoker"], dtype="uint8")
df_dummy.head()

Unnamed: 0,total_bill,tip,day,time,size,sex_Male,sex_Female,smoker_Yes,smoker_No
0,16.99,1.01,Sun,Dinner,2,0,1,0,1
1,10.34,1.66,Sun,Dinner,3,1,0,0,1
2,21.01,3.5,Sun,Dinner,3,1,0,0,1
3,23.68,3.31,Sun,Dinner,2,1,0,0,1
4,24.59,3.61,Sun,Dinner,4,0,1,0,1


In [10]:
# Check the dtypes of the newly created dummy columns.
df_dummy.dtypes 

total_bill     float64
tip            float64
day           category
time          category
size             int64
sex_Male         uint8
sex_Female       uint8
smoker_Yes       uint8
smoker_No        uint8
dtype: object

To avoid the dummy-variable trap, let us drop one of the dummy columns for each categorical variable.

In [11]:
# Keep n-1 dummies per variable: 'sex_Female' and 'smoker_No' are implied by
# 'sex_Male' == 0 and 'smoker_Yes' == 0 respectively.
redundant_dummies = ["sex_Female", "smoker_No"]
df_dummy.drop(columns=redundant_dummies, inplace=True)
df_dummy.head()

Unnamed: 0,total_bill,tip,day,time,size,sex_Male,smoker_Yes
0,16.99,1.01,Sun,Dinner,2,0,0
1,10.34,1.66,Sun,Dinner,3,1,0
2,21.01,3.5,Sun,Dinner,3,1,0
3,23.68,3.31,Sun,Dinner,2,1,0
4,24.59,3.61,Sun,Dinner,4,0,0


In [12]:
# Continue with the encoded frame under the name `df`. Take a copy so that the
# label-encoding assignments made to `df` later on do not also modify
# `df_dummy` through a shared reference.
df = df_dummy.copy()
df.head()

Unnamed: 0,total_bill,tip,day,time,size,sex_Male,smoker_Yes
0,16.99,1.01,Sun,Dinner,2,0,0
1,10.34,1.66,Sun,Dinner,3,1,0
2,21.01,3.5,Sun,Dinner,3,1,0
3,23.68,3.31,Sun,Dinner,2,1,0
4,24.59,3.61,Sun,Dinner,4,0,0


### Label Encoding

- Label encoding involves assigning numbers to each unique category in the categorical variable.
- Unlike one hot encoding, it does not introduce new columns.

In [13]:
from sklearn.preprocessing import LabelEncoder
# Shared encoder instance; the label-encoding cells call le.fit_transform on
# one column at a time.
le= LabelEncoder()

In [14]:
# Encode each column with its own encoder. Re-fitting one shared encoder on
# 'time' would overwrite the mapping learned for 'day', leaving no way to
# decode 'day' later with inverse_transform.
# NOTE: LabelEncoder numbers the classes in sorted (alphabetical) order, so
# 'day' becomes Fri=0, Sat=1, Sun=2, Thur=3 -- not weekday order.
day_encoder = LabelEncoder()
time_encoder = LabelEncoder()
df["day"] = day_encoder.fit_transform(df["day"])
df["time"] = time_encoder.fit_transform(df["time"])
df

Unnamed: 0,total_bill,tip,day,time,size,sex_Male,smoker_Yes
0,16.99,1.01,2,0,2,0,0
1,10.34,1.66,2,0,3,1,0
2,21.01,3.50,2,0,3,1,0
3,23.68,3.31,2,0,2,1,0
4,24.59,3.61,2,0,4,0,0
...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,3,1,0
240,27.18,2.00,1,0,2,0,1
241,22.67,2.00,1,0,2,1,1
242,17.82,1.75,1,0,2,1,0


In [15]:
df["day"].unique()  # integer codes now stored in 'day'

array([2, 1, 3, 0])

In [16]:
df["time"].unique()  # integer codes now stored in 'time'

array([0, 1])

In [17]:
# Confirm that every column is numeric after encoding.
df.dtypes

total_bill    float64
tip           float64
day             int32
time            int32
size            int64
sex_Male        uint8
smoker_Yes      uint8
dtype: object

### Task

<font size=3> Perform categorical encoding for the titanic dataset from the seaborn library.

In [18]:
# Load seaborn's bundled "titanic" example dataset for the encoding task.
a = sns.load_dataset(name="titanic")
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [19]:
# Count missing values per column.
a.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [20]:
# Inspect the 'adult_male' column.
a.adult_male

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: adult_male, Length: 891, dtype: bool

In [21]:
# Remove 'deck': it is missing for 688 of the 891 rows.
a.drop(columns="deck", inplace=True)
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [22]:
# Remove 'embark_town' (presumably because it repeats the information in
# 'embarked' -- confirm).
a.drop(columns="embark_town", inplace=True)
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,yes,True


In [23]:
# Remove 'embarked'.
# NOTE(review): only 2 values are missing in this column; dropping the whole
# column discards an otherwise usable feature -- confirm this is intended.
a.drop(columns="embarked", inplace=True)
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.2500,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,Second,man,True,no,True
887,1,1,female,19.0,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,26.0,0,0,30.0000,First,man,True,yes,True


In [24]:
# Summary statistics of the numeric columns.
a.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
# Mean age (missing values are skipped by default).
a.age.mean()

29.69911764705882

In [26]:
# Impute missing ages with the mean age. Only 'age' is targeted: calling
# fillna on the whole frame would write the mean age into the gaps of any
# other column that still had missing values.
a["age"] = a["age"].fillna(a["age"].mean())
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,22.000000,1,0,7.2500,Third,man,True,no,False
1,1,1,female,38.000000,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,26.000000,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,35.000000,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,35.000000,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,Second,man,True,no,True
887,1,1,female,19.000000,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,29.699118,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,26.000000,0,0,30.0000,First,man,True,yes,True


In [27]:
a.isnull().sum()            #   no missing values

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

In [28]:
# List the dtypes to find the non-numeric columns that still need encoding.
a.dtypes

survived         int64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
class         category
who             object
adult_male        bool
alive           object
alone             bool
dtype: object

In [32]:
# Distinct values of 'class'.
a["class"].unique()

['Third', 'First', 'Second']
Categories (3, object): ['First', 'Second', 'Third']

In [33]:
# Distinct values of 'who'.
a["who"].unique()

array(['man', 'woman', 'child'], dtype=object)

In [38]:
# Label-encode every remaining non-numeric column with the shared encoder.
# 'class' is ordinal, and its alphabetical codes (First=0, Second=1, Third=2)
# happen to match its rank order. All of these are label encodings; no
# one-hot encoding is performed in this cell.
# NOTE(review): 'who' is nominal with three categories (man/woman/child), so
# label encoding imposes an arbitrary order on it -- consider one-hot encoding.
for column in ["sex", "who", "class", "adult_male", "alive", "alone"]:
    a[column] = le.fit_transform(a[column])
a

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,1,22.000000,1,0,7.2500,2,1,1,0,0
1,1,1,0,38.000000,1,0,71.2833,0,2,0,1,0
2,1,3,0,26.000000,0,0,7.9250,2,2,0,1,1
3,1,1,0,35.000000,1,0,53.1000,0,2,0,1,0
4,0,3,1,35.000000,0,0,8.0500,2,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,1,1,1,0,1
887,1,1,0,19.000000,0,0,30.0000,0,2,0,1,1
888,0,3,0,29.699118,1,2,23.4500,2,2,0,0,0
889,1,1,1,26.000000,0,0,30.0000,0,1,1,1,1
