In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the car listings into a DataFrame.
# NOTE(review): '/content/cars.csv' looks like a Colab-style absolute path;
# the file must exist at exactly that location for this cell to run.
df = pd.read_csv(filepath_or_buffer='/content/cars.csv')

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# Row count per car brand, most frequent first.
df.loc[:, 'brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [5]:
# Number of distinct brands (missing values are not counted).
df['brand'].nunique(dropna=True)

32

In [6]:
 df['fuel'].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [7]:
# Row count per ownership category, most frequent first.
df.loc[:, 'owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


<h1>OneHotEncoding using Pandas</h1>
<h3>Why Avoid Pandas get_dummies() for ML Training Data</h3>

- 🚫 **No state tracking:** `pd.get_dummies()` derives the dummy columns from whatever data it is given and does not save the encoding logic, so the same encoding cannot be re-applied to new data later.
- 🔀 **Column mismatch risk:** if the train and test datasets contain different categories (e.g., a new or missing value), the encoded frames end up with inconsistent feature columns, and the model crashes at prediction time.
- 🛠️ **Manual alignment required:** you must manually make sure the columns match between the datasets, which is error-prone and impractical.

In [8]:
# One-hot encode 'fuel' and 'owner' with pandas: every category of the two
# columns becomes its own boolean column, the other columns pass through.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=False,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


<h1>K-1 OneHotEncoding - Removing Multicollinearity</h1>

In [9]:
# Same encoding, but drop_first=True removes the first category of each
# encoded column (fuel_CNG and owner_First Owner here), leaving k-1 dummies
# per column so the indicators are not perfectly collinear.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


<h1>OneHotEncoding using Sklearn</h1>

In [10]:
# Hold out 20% of the rows for testing. The first four columns
# (brand, km_driven, fuel, owner) are the features and the last column
# (selling_price) is the target; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

In [11]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner


In [12]:
# drop='first' removes the first category of every encoded feature, so each
# feature yields k-1 columns and the dummies are not perfectly collinear.
# sparse_output=False makes the encoder return a dense NumPy array directly;
# .toarray() is only needed when the output is left as a sparse matrix.
# dtype=np.int32 stores the indicators as 0/1 integers.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

In [13]:
# Learn the categories of 'fuel' and 'owner' from the training rows only and
# encode those rows in the same step.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [14]:
# (rows, encoded columns) of the encoded training matrix.
np.shape(X_train_new)

(6502, 7)

In [15]:
# Encode the test rows with the already-fitted encoder (transform only, no
# refit), so the test matrix gets the same columns as the training matrix.
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [16]:
# (rows, encoded columns) of the encoded test matrix.
np.shape(X_test_new)

(1626, 7)

In [17]:
# Place the untouched 'brand' and 'km_driven' columns side by side with the
# encoded fuel/owner columns to form the training matrix.
# Note: 'brand' is still a raw string column at this point.
X_train_fortraining = np.concatenate(
    (X_train[['brand', 'km_driven']].to_numpy(), X_train_new),
    axis=1,
)

In [18]:
# (rows, total columns): the 2 raw columns plus the encoded ones.
np.shape(X_train_fortraining)

(6502, 9)

<h1>OneHotEncoding (brand column) Top Categories</h1>

In [19]:
# Cut-off and per-brand row counts over the full dataset; the following cell
# uses them to decide which brands are collapsed into 'others'.
threshold = 100
counts = df.loc[:, 'brand'].value_counts()

In [41]:
# Brands that occur `threshold` times or fewer are treated as rare.
replace = counts.loc[counts <= threshold].index
# Copy of the brand column in which every rare brand is relabelled 'others'.
replaced = df['brand'].replace(to_replace=replace, value='others')

In [39]:
# Show the brands at or below the threshold, i.e. those relabelled 'others'.
replace

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [36]:
# One-hot encode the grouped brand column. `replaced` (built in the earlier
# cell) already has the rare brands relabelled as 'others', so reuse it
# instead of repeating the same replace() call here.
# drop_first=True leaves k-1 dummy columns for the k remaining brand labels.
pd.get_dummies(replaced, drop_first=True)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,True,False,False,False
