In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the used-car listings into a DataFrame. The path is relative, so the
# notebook must be run from the directory that contains cars.csv.
df = pd.read_csv('cars.csv')

In [None]:
# Quick look at the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
# Random preview of ten rows (a random draw is less biased by file order than
# head()). random_state pins the draw so the displayed rows are the same after
# Restart Kernel -> Run All.
df.sample(10, random_state=42)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
832,Maruti,15000,Petrol,First Owner,241000
1048,Maruti,25000,Petrol,First Owner,215000
4573,Maruti,35000,Diesel,First Owner,850000
402,Maruti,35000,CNG,First Owner,409999
7970,Renault,18000,Diesel,First Owner,1050000
4527,Mercedes-Benz,110000,Diesel,Third Owner,1700000
3025,Maruti,63000,Petrol,Third Owner,80000
3149,Tata,35000,Diesel,First Owner,484999
807,Maruti,110000,Diesel,Second Owner,320000
2782,Mahindra,50000,Diesel,First Owner,700000


In [None]:
# Frequency of each brand, most common first. This shows how many rare
# categories a naive one-hot encoding of `brand` would have to carry.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [None]:
# Number of distinct brands, i.e. how many columns a full one-hot encoding of
# `brand` would create.
df['brand'].nunique()

32

In [None]:
# Category frequencies for `fuel` (low-cardinality nominal feature).
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [None]:
# Category frequencies for `owner` (low-cardinality feature).
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

## 1. OneHotEncoding using Pandas

In [None]:
# One-hot encode only `fuel` and `owner`; every other column is passed through
# unchanged and the new indicator columns are named "<column>_<category>".
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. K-1 OneHotEncoding

In [None]:
# K-1 encoding: drop the first level of each encoded column so the indicator
# columns of one feature are no longer perfectly collinear.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. OneHotEncoding using Sklearn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# - Features and target are selected by column name instead of position, so the
#   split does not silently change if the CSV's column order changes.
# - random_state fixes the shuffle; without it every re-run yields a different
#   split and the outputs shown further down cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: training features keep their original (shuffled) row index.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6124,Maruti,70000,Diesel,First Owner
5866,Maruti,67000,Petrol,First Owner
6518,Tata,2560,Petrol,First Owner
5738,Maruti,70000,Petrol,First Owner
4193,Ford,68609,Diesel,First Owner


In [None]:
# Sanity check: held-out features have the same columns as X_train.
X_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
1676,Mercedes-Benz,110000,Diesel,Third Owner
7493,Maruti,60000,Petrol,Second Owner
609,Mahindra,50000,Diesel,Second Owner
137,BMW,29500,Diesel,First Owner
4066,Honda,30000,Diesel,First Owner


In [None]:
# Target values (selling_price); the index should match X_train's index.
y_train.head()

6124    520000
5866    175000
6518    520000
5738    135000
4193    515000
Name: selling_price, dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Encoder with default settings: one column per category, sparse float64 output.
ohe = OneHotEncoder()

In [None]:
# Learn the categories from the training split and encode it. The result is a
# SciPy sparse matrix, which is why the repr shows only shape and storage format.
ohe.fit_transform(X_train[['fuel','owner']])

<6502x9 sparse matrix of type '<class 'numpy.float64'>'
	with 13004 stored elements in Compressed Sparse Row format>

In [None]:
# Same encoding, converted to a dense ndarray so the 0/1 values can be inspected.
# Note: this refits the encoder on the same training data.
ohe.fit_transform(X_train[['fuel','owner']]).toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Fit on the training split only and keep the dense encoded matrix.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()

In [None]:
# transform (not fit_transform): reuse the categories learned from the training
# split so the test columns line up with X_train_new.
X_test_new = ohe.transform(X_test[['fuel','owner']]).toarray()

In [None]:
# Display the encoded training matrix (NumPy elides the middle rows/columns).
X_train_new

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# The columns that were not one-hot encoded, as a raw ndarray. Mixing strings
# and integers forces dtype=object.
X_train[['brand','km_driven']].values

array([['Maruti', 70000],
       ['Maruti', 67000],
       ['Tata', 2560],
       ...,
       ['Renault', 34000],
       ['Tata', 50000],
       ['Hyundai', 15000]], dtype=object)

In [None]:
# Glue the untouched columns and the one-hot block side by side (column-wise).
# The result is an object array because `brand` is a string column.
np.concatenate(
    (X_train[['brand', 'km_driven']].values, X_train_new),
    axis=1,
)

array([['Maruti', 70000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 67000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Tata', 2560, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Renault', 34000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Tata', 50000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 15000, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [None]:
# Shape check on the combined matrix: 2 passthrough columns plus the one-hot
# columns produced by the encoder.
np.concatenate(
    (X_train[['brand', 'km_driven']].values, X_train_new),
    axis=1,
).shape

(6502, 11)

In [None]:
# Rebind `ohe` to a K-1 encoder: drop='first' removes the first category of each
# feature to avoid perfectly collinear indicator columns.
ohe = OneHotEncoder(drop='first')

In [None]:
# Refit with the K-1 encoder; this overwrites the earlier full one-hot X_train_new.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()

In [None]:
# One column fewer per encoded feature than the full one-hot encoding.
X_train_new.shape

(6502, 7)

In [None]:
# Same K-1 encoder, but emit int32 indicators instead of the default float64.
ohe = OneHotEncoder(drop='first',dtype=np.int32)

In [None]:
# Refit with the current encoder and re-encode BOTH splits. Previously only the
# training matrix was refreshed, leaving X_test_new with the column count and
# dtype of the earlier default encoder, i.e. out of sync with X_train_new.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()
# transform only: the test split must reuse the categories learned from train.
X_test_new = ohe.transform(X_test[['fuel','owner']]).toarray()

## 4. OneHotEncoding with Top Categories

In [None]:
# Revisit the brand frequencies: many brands have very few rows, so a full
# one-hot encoding would add many near-empty columns.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [None]:
# Keep the per-brand row counts (a Series indexed by brand name) for thresholding.
counts = df['brand'].value_counts()

In [None]:
# Brands with at most this many rows are treated as rare and merged into a
# single bucket before encoding. (The stray `df['brand'].nunique()` that used to
# precede this line was a no-op: it was not the cell's last expression, so its
# value was computed and discarded.)
threshold = 100

In [None]:
# Names of the brands whose row count does not exceed the threshold.
counts.loc[counts.le(threshold)].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [None]:
# Brands to be replaced: every brand with at most `threshold` rows.
repl = counts.loc[counts.le(threshold)].index

In [None]:
# Merge every brand listed in `repl` into one 'uncommon' bucket, one-hot encode
# the result and preview five rows. random_state pins the preview so the same
# rows are shown after Restart Kernel -> Run All.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5, random_state=42)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
3085,0,0,0,0,0,0,1,0,0,0,0,0,0
256,0,0,0,0,0,0,1,0,0,0,0,0,0
4513,0,0,0,0,0,0,1,0,0,0,0,0,0
1346,0,0,0,0,0,0,0,1,0,0,0,0,0
4986,0,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Full encoded frame: one indicator column per frequent brand plus a single
# 'uncommon' column that absorbs all rare brands.
df['brand'].replace(repl, 'uncommon').pipe(pd.get_dummies)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
