# OneHot Encoding

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
data = {
    "Animal" : ['Dog' , 'Cat' , 'Rabbit' , 'Snake','Goldfish'] ,
    "Class"  : ['Mammal' , 'Mammal' , 'Mammal' , 'Reptile' , 'Fish']
}

df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,Animal,Class
0,Dog,Mammal
1,Cat,Mammal
2,Rabbit,Mammal
3,Snake,Reptile
4,Goldfish,Fish


In [4]:
encoder = OneHotEncoder(sparse = False)

In [5]:
enco_data = encoder.fit_transform(df[['Animal' , 'Class']])



In [6]:
enco_data

array([[0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0.]])

In [7]:
enco_df = pd.DataFrame(enco_data , columns = encoder.get_feature_names_out(['Animal' , 'Class']))

In [8]:
enco_df

Unnamed: 0,Animal_Cat,Animal_Dog,Animal_Goldfish,Animal_Rabbit,Animal_Snake,Class_Fish,Class_Mammal,Class_Reptile
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Example 2

In [9]:
df = pd.read_csv("D:\\csv files\\cars.csv" , usecols= ['name' , 'km_driven' , 'fuel' , 'selling_price' , 'owner'])

In [10]:
df

Unnamed: 0,name,selling_price,km_driven,fuel,owner
0,Maruti 800 AC BSII,50000,100000,Petrol,Fourth & Above Owner
1,Maruti Gypsy E MG410W ST,95000,100000,Petrol,Second Owner
2,Mahindra Jeep CL 500 MDI,250000,35000,Diesel,Second Owner
3,Mahindra Jeep MM 540,200000,60000,Diesel,First Owner
4,Mahindra Jeep CL 500 MDI,150000,120000,Diesel,Third Owner
...,...,...,...,...,...
4335,Hyundai Venue SX Opt Turbo BSIV,1050000,1100,Petrol,First Owner
4336,Hyundai Grand i10 1.2 Kappa Magna BSIV,545000,5000,Petrol,First Owner
4337,Ford Figo Aspire 1.5 TDCi Titanium,530000,45000,Diesel,First Owner
4338,Tata Harrier XE,426000,1000,Diesel,First Owner


In [11]:
df.isnull().sum()

name             0
selling_price    0
km_driven        0
fuel             0
owner            0
dtype: int64

In [12]:
df['fuel'].value_counts()

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

# Get Dummies Method

In [13]:
pd.get_dummies(df , columns = ['fuel' , 'owner'])

Unnamed: 0,name,selling_price,km_driven,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC BSII,50000,100000,False,False,False,False,True,False,True,False,False,False
1,Maruti Gypsy E MG410W ST,95000,100000,False,False,False,False,True,False,False,True,False,False
2,Mahindra Jeep CL 500 MDI,250000,35000,False,True,False,False,False,False,False,True,False,False
3,Mahindra Jeep MM 540,200000,60000,False,True,False,False,False,True,False,False,False,False
4,Mahindra Jeep CL 500 MDI,150000,120000,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai Venue SX Opt Turbo BSIV,1050000,1100,False,False,False,False,True,True,False,False,False,False
4336,Hyundai Grand i10 1.2 Kappa Magna BSIV,545000,5000,False,False,False,False,True,True,False,False,False,False
4337,Ford Figo Aspire 1.5 TDCi Titanium,530000,45000,False,True,False,False,False,True,False,False,False,False
4338,Tata Harrier XE,426000,1000,False,True,False,False,False,True,False,False,False,False


In [14]:
pd.get_dummies(df , columns = ['fuel' , 'owner'] , drop_first= True)

Unnamed: 0,name,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC BSII,50000,100000,False,False,False,True,True,False,False,False
1,Maruti Gypsy E MG410W ST,95000,100000,False,False,False,True,False,True,False,False
2,Mahindra Jeep CL 500 MDI,250000,35000,True,False,False,False,False,True,False,False
3,Mahindra Jeep MM 540,200000,60000,True,False,False,False,False,False,False,False
4,Mahindra Jeep CL 500 MDI,150000,120000,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai Venue SX Opt Turbo BSIV,1050000,1100,False,False,False,True,False,False,False,False
4336,Hyundai Grand i10 1.2 Kappa Magna BSIV,545000,5000,False,False,False,True,False,False,False,False
4337,Ford Figo Aspire 1.5 TDCi Titanium,530000,45000,True,False,False,False,False,False,False,False
4338,Tata Harrier XE,426000,1000,True,False,False,False,False,False,False,False


# OneHot

In [15]:
x_train , x_test ,y_train , y_test = train_test_split(df.iloc[ : ,  0:5 ] ,df.iloc[: , -3], test_size= 0.2)



In [16]:
y_train

1709    108731
3658     37000
1118     90000
216      90000
726     350000
         ...  
1093     70000
375      60000
1326     60000
3559    120000
2061     60000
Name: km_driven, Length: 3472, dtype: int64

In [17]:
ohe = OneHotEncoder(drop= 'first'  , sparse = False)

In [18]:
ohe

In [19]:
df.head()

Unnamed: 0,name,selling_price,km_driven,fuel,owner
0,Maruti 800 AC BSII,50000,100000,Petrol,Fourth & Above Owner
1,Maruti Gypsy E MG410W ST,95000,100000,Petrol,Second Owner
2,Mahindra Jeep CL 500 MDI,250000,35000,Diesel,Second Owner
3,Mahindra Jeep MM 540,200000,60000,Diesel,First Owner
4,Mahindra Jeep CL 500 MDI,150000,120000,Diesel,Third Owner


In [20]:
x_train_new = ohe.fit_transform(x_train[['fuel' , 'owner'  ]])



In [21]:
x_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [22]:
x_test_new = ohe.fit_transform(x_test[['fuel','owner' ]])



In [23]:
x_test_new

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])