# Import the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Load the Dataset

In [3]:
# Read the car listings from the relative path "cars.csv" and preview the first five rows
cars = pd.read_csv(filepath_or_buffer="cars.csv")
cars.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# (number of rows, number of columns) of the loaded frame
(len(cars.index), len(cars.columns))

(8128, 5)

In [5]:
# How many listings fall into each ownership category
cars.loc[:, "owner"].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [6]:
# How many listings fall into each fuel category
cars.loc[:, "fuel"].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

# OneHotEncoding Using Pandas

In [7]:
# Expand every fuel / owner category into its own 0/1 indicator column (K columns per feature)
pd.get_dummies(
    data=cars,
    columns=["fuel", "owner"],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# (K-1) OneHotEncoding (Dropping the First Category of Each Feature)

In [7]:
# Same expansion, but drop_first=True removes the first category of each feature,
# leaving K-1 indicator columns per feature
pd.get_dummies(
    data=cars,
    columns=["owner", "fuel"],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,fuel_Diesel,fuel_LPG,fuel_Petrol
0,Maruti,145500,450000,False,False,False,False,True,False,False
1,Skoda,120000,370000,False,True,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,True
3,Hyundai,127000,225000,False,False,False,False,True,False,False
4,Maruti,120000,130000,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,False,False,False,True
8124,Hyundai,119000,135000,True,False,False,False,True,False,False
8125,Maruti,120000,382000,False,False,False,False,True,False,False
8126,Tata,25000,290000,False,False,False,False,True,False,False


# Splitting the Dataset into Features and Target

In [8]:
# Target is the selling price; every remaining column is a feature
y = cars.loc[:, "selling_price"]
X = cars.drop(columns="selling_price")

In [9]:
# Preview the first five feature rows
X.iloc[:5]

Unnamed: 0,brand,km_driven,fuel,owner
0,Maruti,145500,Diesel,First Owner
1,Skoda,120000,Diesel,Second Owner
2,Honda,140000,Petrol,Third Owner
3,Hyundai,127000,Diesel,First Owner
4,Maruti,120000,Petrol,First Owner


# Split the Dataset into Training and Testing Data

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Shapes of the full feature frame, the training split and the testing split, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

(8128, 4) (6502, 4) (1626, 4)


# OneHotEncoding Using Sklearn

In [12]:
# Configure the one-hot encoder: drop the first category of each feature,
# return a dense (non-sparse) array, and store the indicators as int32
ohe = OneHotEncoder(
    drop="first",
    sparse_output=False,
    dtype=np.int32,
)

In [13]:
# Learn the fuel / owner categories from the training split and encode it in one step
X_train_new = ohe.fit_transform(
    X_train.loc[:, ["fuel", "owner"]]
)

In [14]:
# Encode the test split with the categories already learned from the training split.
# Use transform(), not fit(): fit() would re-learn the categories from the test data
# and return the encoder object itself instead of the encoded array.
X_test_new = ohe.transform(X_test[["fuel", "owner"]])

In [15]:
# (rows, encoded columns) of the one-hot encoded training array
np.shape(X_train_new)

(6502, 7)

In [16]:
# Put the untouched brand / km_driven columns side by side with the encoded columns
np.concatenate(
    [X_train[["brand", "km_driven"]].to_numpy(), X_train_new],
    axis=1,
)

array([['Tata', 2560, 0, ..., 0, 0, 0],
       ['Honda', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 150000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

# OneHotEncoding with Top Categories

In [17]:
# Number of listings per brand
counts = cars.loc[:, "brand"].value_counts()

In [18]:
# Brands whose listing count is at or below this value are later grouped as "uncommon"
threshold = 100
# Kept as the last expression so the cell actually displays the number of distinct brands
cars["brand"].nunique()

In [19]:
# Brand labels whose listing count does not exceed the threshold
repl = counts.loc[counts.le(threshold)].index

In [21]:
# Collapse the rare brands into a single "uncommon" bucket, then one-hot encode.
# The fixed random_state keeps the five previewed rows identical across re-runs.
pd.get_dummies(
    cars["brand"].replace(repl, "uncommon"),
    dtype=int,
).sample(5, random_state=42)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
6677,0,0,0,0,1,0,0,0,0,0,0,0,0
7379,0,0,0,0,0,0,0,0,0,0,0,0,1
651,0,0,0,0,0,0,0,0,0,0,1,0,0
2060,0,0,0,0,0,0,0,0,0,0,0,1,0
6426,0,0,0,0,0,0,0,0,0,1,0,0,0
