In [1]:
import pandas as pd
import numpy as np

In [5]:
# Read the car listings CSV into a DataFrame (path is relative to this notebook).
df = pd.read_csv(filepath_or_buffer='../../../data/cars.csv')

In [6]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1404,Ford,80000,Diesel,Third Owner,170000
3231,Maruti,35000,Diesel,First Owner,675000
5986,Maruti,71000,Petrol,First Owner,229999
6452,Ford,110000,Diesel,First Owner,220000
7829,Hyundai,60000,Petrol,First Owner,600000


# One Hot Encoding using Pandas


In [None]:
# Full one-hot encoding: one indicator column per category of 'fuel' and 'owner'.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# K-1 OneHotEncoding

In [9]:
# K-1 encoding: drop_first=True omits the first category of each encoded column.
categorical_columns = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_columns, drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [None]:
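# pd.get_dummies returns a new DataFrame (the original df is left unchanged) and does not remember
# which categories/columns it produced, so the same encoding cannot be re-applied to new data.
# For that reason we only use it for data analysis, not for building model inputs.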

# One Hot Encoding using Sklearn

In [10]:
from sklearn.model_selection import train_test_split

# Inputs are the first four columns; the target is the last column.
features = df.iloc[:, 0:4]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Show the first five rows of the held-out features.
X_test.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
3558,Hyundai,40000,Diesel,First Owner
233,Mahindra,70000,Diesel,First Owner
7952,Maruti,5000,Petrol,First Owner
572,Maruti,120000,Petrol,Third Owner
6960,Lexus,20000,Petrol,First Owner


In [12]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Encoder that drops the first category of every feature (k-1 columns per feature)
# and emits 32-bit integer indicators.
ohe = OneHotEncoder(
    dtype=np.int32,
    drop='first',
)

In [23]:
# Learn the categories from the training rows and encode them in a single call.
# Selecting several columns at once takes a list of names, hence the double brackets.
train_categoricals = X_train[['fuel', 'owner']]
encoded_train = ohe.fit_transform(train_categoricals)

# Convert the encoder's result to a regular NumPy array.
X_train_new = encoded_train.toarray()

In [24]:
# Encode the test rows with the encoder that was already fitted on the training rows.
# Use transform(), not fit_transform(): refitting here would re-learn the categories
# from the test split, so the resulting columns could differ in number or meaning
# from X_train_new, and the encoder fitted on the training data would be overwritten.
X_test_new = ohe.transform(X_test[['fuel','owner']]).toarray()

In [25]:
# (rows, encoded columns) of the dense training encoding.
np.shape(X_train_new)

(6502, 7)

In [26]:
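# Alternative: have the encoder return a dense NumPy array directly, so .toarray() is not needed.
# (scikit-learn >= 1.2 names this parameter `sparse_output`; older releases call it `sparse`.)
# ohe = OneHotEncoder(sparse_output=False)
# X_train_new = ohe.fit_transform(X_train[['fuel','owner']])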

In [30]:
# Put the untouched columns next to the encoded ones, column-wise.
passthrough_values = X_train[['brand', 'km_driven']].values
np.concatenate((passthrough_values, X_train_new), axis=1)

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

# One Hot Encoding brand column

In [31]:
# Number of rows per brand, most frequent first.
brand_column = df.loc[:, 'brand']
counts = brand_column.value_counts()

In [None]:
# Cut-off for rare brands: brands with this many cars or fewer are merged together
# (the comparison applied to `counts` later is `<=`, i.e. inclusive).
threshold = 100

# Number of distinct brands. Kept as the last expression of the cell so the notebook
# actually displays it; as a non-final statement its value was silently discarded.
df['brand'].nunique()

In [36]:
# Brand names whose row count is at most `threshold` (inclusive comparison).
is_rare = counts.le(threshold)
repl = counts.loc[is_rare].index

In [37]:
# Collapse every rare brand into a single 'uncommon' label, then one-hot encode the result.
brand_grouped = df['brand'].replace(to_replace=repl, value='uncommon')
pd.get_dummies(brand_grouped)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
