In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
df = pd.read_csv('Customer.csv')
df.tail(5)

Unnamed: 0,age,gender,review,education,purchased
45,61,Male,Poor,PG,Yes
46,64,Female,Poor,PG,No
47,38,Female,Good,PG,Yes
48,39,Female,Good,UG,Yes
49,25,Female,Good,UG,No


In [4]:
for col in  df.columns:
    print(col,df[col].nunique())

age 41
gender 2
review 3
education 3
purchased 2


##### In above dataset we can apply ordinal encoding in review & education feature. Label Encoder in  purchased feature. And one hot encoding in gender feature.

#### Label Encoding

In [5]:
le = LabelEncoder()
df['purchased'] = le.fit_transform(df['purchased']) #Apply Label Encoder directly on target column.

In [6]:
df.tail(5)

Unnamed: 0,age,gender,review,education,purchased
45,61,Male,Poor,PG,1
46,64,Female,Poor,PG,0
47,38,Female,Good,PG,1
48,39,Female,Good,UG,1
49,25,Female,Good,UG,0


#### Ordinal Encoding

In [7]:
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,0
1,68,Female,Poor,UG,0
2,70,Female,Good,PG,0
3,72,Female,Good,PG,0
4,16,Female,Average,UG,0


In [9]:
oe = OrdinalEncoder(categories = [['Poor','Average','Good'], ['School','UG','PG']])
oe_df = df.iloc[0:,2:4]
oe_df = oe.fit_transform(oe_df)

In [10]:
oe = OrdinalEncoder(categories = [['Poor','Average','Good'],['School','UG','PG']])
oe_df = df.iloc[0:,2:4]
oe_df = oe.fit_transform(oe_df)

#Apply ordinal Encoder technique on 2 columns. Create ordinal encoder object and get the dataframe of those columns
#on which Ordinal Encoding needs to apply. And then fit transform

In [11]:
oe_df = pd.DataFrame(oe_df,columns=['review','education']) #Convert resulted columns into DataFrame

In [12]:
oe_df.head()

Unnamed: 0,review,education
0,1.0,0.0
1,0.0,1.0
2,2.0,2.0
3,2.0,2.0
4,1.0,1.0


In [13]:
df.drop(['review','education'],axis=1,inplace=True) #drop categorical columns.

In [14]:
newdf = pd.concat([df,oe_df],axis=1) #concat main dataframe with new ordinal encoder dataframe, with axis = 1

In [15]:
newdf['review'] = newdf['review'].astype('int32')
newdf['education'] = newdf['education'].astype('int32')

In [16]:
newdf.sample(5)

Unnamed: 0,age,gender,purchased,review,education
45,61,Male,1,0,2
12,51,Male,0,0,0
6,18,Male,0,2,0
19,97,Male,1,0,2
7,60,Female,1,0,0


#### One Hot Encoding - Nominal Feature

In [17]:
car = pd.read_csv('Cars.csv')
car.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [18]:
for col in  car.columns:
    print(col,car[col].nunique())

brand 32
km_driven 921
fuel 4
owner 5
selling_price 677


In [19]:
car['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

# 1. One-Hot Encoding using Pandas

In [20]:
pd.get_dummies(car['fuel'])

Unnamed: 0,CNG,Diesel,LPG,Petrol
0,0,1,0,0
1,0,1,0,0
2,0,0,0,1
3,0,1,0,0
4,0,0,0,1
...,...,...,...,...
8123,0,0,0,1
8124,0,1,0,0
8125,0,1,0,0
8126,0,1,0,0


In [21]:
pd.get_dummies(car,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 2. One Hot Encoding using OneHotEncoder

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
oh = OneHotEncoder(drop='first',sparse=False,dtype=np.int32)

In [23]:
car.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [26]:
oh_df = car.iloc[:,2:4]

In [27]:
oh_df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [28]:
oh_df = oh.fit_transform(oh_df)

In [29]:
oh_df = pd.DataFrame(oh_df,columns=['diesel','lpg','petrol','4','3','2','1'])

In [30]:
oh_df

Unnamed: 0,diesel,lpg,petrol,4,3,2,1
0,1,0,0,0,0,0,0
1,1,0,0,0,1,0,0
2,0,0,1,0,0,0,1
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
8123,0,0,1,0,0,0,0
8124,1,0,0,1,0,0,0
8125,1,0,0,0,0,0,0
8126,1,0,0,0,0,0,0


## 3. One Hot Encoding with Top Categories

In [31]:
counts = car['brand'].value_counts()

In [32]:
repl

NameError: name 'repl' is not defined

In [33]:
pd.get_dummies(car['brand'].replace(repl,'uncommon'))

NameError: name 'repl' is not defined

## In real scenarios, we don't apply single-single technique like Ordinal Encoding & One Hot Encoding on features one-by-one. Instead of we will apply Column Transfer Technique!

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [37]:
cdf = pd.read_csv('covid_toy.csv')
cdf.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [38]:
cdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [39]:
df.select_dtypes(include=['int']).columns.tolist()

['age', 'purchased']

In [40]:
for col in cdf.columns:
    print(col,cdf[col].nunique())

age 55
gender 2
fever 7
cough 2
city 4
has_covid 2


In [41]:
le = LabelEncoder()
le.fit_transform(cdf['has_covid'])

array([0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1])

In [77]:

transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse=False,drop='first'),['city']),
    ('tnf2',SimpleImputer(),['fever']),
    ('tnf3',OrdinalEncoder(categories=[['Mild','Strong']]),['cough'])
    #('tnf4',LabelEncoder(),['gender'])
],remainder='passthrough')
    #('tnf1',LabelEncoder,['gender'])],remainder='passthrough')
#     ('tnf2',OneHotEncoder(sparse=False,drop='first'),['city']),
#     ('tnf3',SimpleImputer(),['fever'])

transformer.fit_transform(cdf)

array([[0.0, 1.0, 0.0, 103.0, 0.0, 60, 'Male', 'No'],
       [1.0, 0.0, 0.0, 100.0, 0.0, 27, 'Male', 'Yes'],
       [1.0, 0.0, 0.0, 101.0, 0.0, 42, 'Male', 'No'],
       [0.0, 1.0, 0.0, 98.0, 0.0, 31, 'Female', 'No'],
       [0.0, 0.0, 1.0, 101.0, 0.0, 65, 'Female', 'No'],
       [0.0, 0.0, 0.0, 100.84444444444445, 0.0, 84, 'Female', 'Yes'],
       [0.0, 0.0, 0.0, 101.0, 1.0, 14, 'Male', 'No'],
       [0.0, 0.0, 1.0, 100.84444444444445, 1.0, 20, 'Female', 'Yes'],
       [0.0, 0.0, 0.0, 100.0, 1.0, 19, 'Female', 'No'],
       [1.0, 0.0, 0.0, 101.0, 0.0, 64, 'Female', 'No'],
       [1.0, 0.0, 0.0, 100.84444444444445, 0.0, 75, 'Female', 'No'],
       [0.0, 0.0, 1.0, 98.0, 0.0, 65, 'Female', 'Yes'],
       [0.0, 1.0, 0.0, 99.0, 1.0, 25, 'Female', 'No'],
       [0.0, 0.0, 0.0, 102.0, 0.0, 64, 'Male', 'Yes'],
       [0.0, 0.0, 0.0, 104.0, 0.0, 51, 'Male', 'No'],
       [0.0, 1.0, 0.0, 103.0, 1.0, 70, 'Male', 'Yes'],
       [0.0, 1.0, 0.0, 103.0, 0.0, 69, 'Female', 'Yes'],
       [1.0, 0.0, 0

In [78]:

transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse=False,drop='first'),['city']),
    ('tnf2',SimpleImputer(),['fever']),
    ('tnf3',OrdinalEncoder(categories=[['Mild','Strong']]),['cough'])
],remainder='passthrough')
    #('tnf1',LabelEncoder,['gender'])],remainder='passthrough')
#     ('tnf2',OneHotEncoder(sparse=False,drop='first'),['city']),
#     ('tnf3',SimpleImputer(),['fever'])

transformer.fit_transform(cdf)

array([[0.0, 1.0, 0.0, 103.0, 0.0, 60, 'Male', 'No'],
       [1.0, 0.0, 0.0, 100.0, 0.0, 27, 'Male', 'Yes'],
       [1.0, 0.0, 0.0, 101.0, 0.0, 42, 'Male', 'No'],
       [0.0, 1.0, 0.0, 98.0, 0.0, 31, 'Female', 'No'],
       [0.0, 0.0, 1.0, 101.0, 0.0, 65, 'Female', 'No'],
       [0.0, 0.0, 0.0, 100.84444444444445, 0.0, 84, 'Female', 'Yes'],
       [0.0, 0.0, 0.0, 101.0, 1.0, 14, 'Male', 'No'],
       [0.0, 0.0, 1.0, 100.84444444444445, 1.0, 20, 'Female', 'Yes'],
       [0.0, 0.0, 0.0, 100.0, 1.0, 19, 'Female', 'No'],
       [1.0, 0.0, 0.0, 101.0, 0.0, 64, 'Female', 'No'],
       [1.0, 0.0, 0.0, 100.84444444444445, 0.0, 75, 'Female', 'No'],
       [0.0, 0.0, 1.0, 98.0, 0.0, 65, 'Female', 'Yes'],
       [0.0, 1.0, 0.0, 99.0, 1.0, 25, 'Female', 'No'],
       [0.0, 0.0, 0.0, 102.0, 0.0, 64, 'Male', 'Yes'],
       [0.0, 0.0, 0.0, 104.0, 0.0, 51, 'Male', 'No'],
       [0.0, 1.0, 0.0, 103.0, 1.0, 70, 'Male', 'Yes'],
       [0.0, 1.0, 0.0, 103.0, 0.0, 69, 'Female', 'Yes'],
       [1.0, 0.0, 0

In [79]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [80]:
transformer.fit_transform(cdf)

array([[103.0, 0.0, 1.0, 0.0, 1.0, 0.0, 60, 'No'],
       [100.0, 0.0, 1.0, 1.0, 0.0, 0.0, 27, 'Yes'],
       [101.0, 0.0, 1.0, 1.0, 0.0, 0.0, 42, 'No'],
       [98.0, 0.0, 0.0, 0.0, 1.0, 0.0, 31, 'No'],
       [101.0, 0.0, 0.0, 0.0, 0.0, 1.0, 65, 'No'],
       [100.84444444444445, 0.0, 0.0, 0.0, 0.0, 0.0, 84, 'Yes'],
       [101.0, 1.0, 1.0, 0.0, 0.0, 0.0, 14, 'No'],
       [100.84444444444445, 1.0, 0.0, 0.0, 0.0, 1.0, 20, 'Yes'],
       [100.0, 1.0, 0.0, 0.0, 0.0, 0.0, 19, 'No'],
       [101.0, 0.0, 0.0, 1.0, 0.0, 0.0, 64, 'No'],
       [100.84444444444445, 0.0, 0.0, 1.0, 0.0, 0.0, 75, 'No'],
       [98.0, 0.0, 0.0, 0.0, 0.0, 1.0, 65, 'Yes'],
       [99.0, 1.0, 0.0, 0.0, 1.0, 0.0, 25, 'No'],
       [102.0, 0.0, 1.0, 0.0, 0.0, 0.0, 64, 'Yes'],
       [104.0, 0.0, 1.0, 0.0, 0.0, 0.0, 51, 'No'],
       [103.0, 1.0, 1.0, 0.0, 1.0, 0.0, 70, 'Yes'],
       [103.0, 0.0, 0.0, 0.0, 1.0, 0.0, 69, 'Yes'],
       [98.0, 1.0, 0.0, 1.0, 0.0, 0.0, 40, 'No'],
       [98.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6

In [81]:
cdf

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [82]:
cdf.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [83]:
for col in cdf.columns:
    print(col, cdf[col].nunique())

age 55
gender 2
fever 7
cough 2
city 4
has_covid 2


In [84]:
cdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [85]:
cdf['has_covid'].value_counts()

No     55
Yes    45
Name: has_covid, dtype: int64

##### We will perform Column Transformer Technique. 
1. Gender - Label Encoder
2. Fever - SimpleImputer
3. Cough - Ordinal Encoder
4. City - OneHot Encoder
5. Has_Covid - Label Encoder

In [86]:
from sklearn.compose import ColumnTransformer

In [87]:
transformer = ColumnTransformer(transformers=[
    ('tf1',SimpleImputer(),['fever']),
    #('tf2',LabelEncoder(),['gender','has_covid']),
    ('tf3',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tf4',OneHotEncoder(sparse=False,drop='first'),['city'])
],remainder='passthrough')
transformer.fit_transform(cdf)

# transformer = ColumnTransformer(transformers=[
#     ('tnf1',SimpleImputer(),['fever']),
#     ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
#     ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
# ],remainder='passthrough')
# transformer.fit_transform(cdf)

array([[103.0, 0.0, 0.0, 1.0, 0.0, 60, 'Male', 'No'],
       [100.0, 0.0, 1.0, 0.0, 0.0, 27, 'Male', 'Yes'],
       [101.0, 0.0, 1.0, 0.0, 0.0, 42, 'Male', 'No'],
       [98.0, 0.0, 0.0, 1.0, 0.0, 31, 'Female', 'No'],
       [101.0, 0.0, 0.0, 0.0, 1.0, 65, 'Female', 'No'],
       [100.84444444444445, 0.0, 0.0, 0.0, 0.0, 84, 'Female', 'Yes'],
       [101.0, 1.0, 0.0, 0.0, 0.0, 14, 'Male', 'No'],
       [100.84444444444445, 1.0, 0.0, 0.0, 1.0, 20, 'Female', 'Yes'],
       [100.0, 1.0, 0.0, 0.0, 0.0, 19, 'Female', 'No'],
       [101.0, 0.0, 1.0, 0.0, 0.0, 64, 'Female', 'No'],
       [100.84444444444445, 0.0, 1.0, 0.0, 0.0, 75, 'Female', 'No'],
       [98.0, 0.0, 0.0, 0.0, 1.0, 65, 'Female', 'Yes'],
       [99.0, 1.0, 0.0, 1.0, 0.0, 25, 'Female', 'No'],
       [102.0, 0.0, 0.0, 0.0, 0.0, 64, 'Male', 'Yes'],
       [104.0, 0.0, 0.0, 0.0, 0.0, 51, 'Male', 'No'],
       [103.0, 1.0, 0.0, 1.0, 0.0, 70, 'Male', 'Yes'],
       [103.0, 0.0, 0.0, 1.0, 0.0, 69, 'Female', 'Yes'],
       [98.0, 1.0, 

In [88]:
cdf.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [90]:
transformer2 = ColumnTransformer(transformers=[
    ('tnf1',LabelEncoder(),['gender'])
],remainder='passthrough')
transformer2.fit_transform(cdf)

TypeError: fit_transform() takes 2 positional arguments but 3 were given