In [1]:
# One Hot Encoding (OHE) - Main Concepts

# 1. Why Handle Categorical Data?
# Many ML algorithms work only with numerical input, so categorical features (like color or brand) must be converted into numeric form.

# 2. Types of Categorical Data
# - Nominal: Categories without any inherent order (e.g., color: red, blue, yellow).
# - Ordinal: Categories with a meaningful order (e.g., ratings: poor, average, good, excellent).
# Use ordinal encoding for ordinal data and one-hot encoding for nominal data.

# 3. One Hot Encoding Explained
# - OHE converts each category in a column to a new column (binary feature).
# - If a column 'Color' has values ['red', 'blue', 'yellow'], OHE creates 'Color_red', 'Color_blue', 'Color_yellow'.
# - Each row gets 1 in the matching category column and 0 elsewhere.
# Example:
# Colors: red, blue, yellow
# OHE columns: [Color_red, Color_blue, Color_yellow]
# Data point 'blue' => [0, 1, 0]
# Data point 'red' => [1, 0, 0]
# This lets algorithms treat each category equally, without implying a false numeric order between categories.

# 4. Dummy Variable Trap (Multicollinearity Issue)
# - If a category column creates 3 binary columns, their sum per row is always 1, making them mathematically dependent.
# - In regression algorithms, this causes multicollinearity.
# - To avoid this, drop one of the dummy columns (say, the first) after OHE.
# - For n categories, n-1 columns are kept.
# Example:
# If you drop 'Color_red' and keep ['Color_blue', 'Color_yellow']: [0, 0] means 'red'; [1, 0] means 'blue'; [0, 1] means 'yellow'.
# This removes the dependency without losing any information, because the dropped category is implied by all zeros.

# 5. High Cardinality (Many Categories)
# - If a categorical column has many unique values (like a 'Brand' column with 50 brands), OHE adds 50 columns.
# - This increases data dimensionality and can slow computation.
# - Solution: Group less frequent categories into a new 'Other' category or use only the top K frequent categories and merge the rest.
# - Example: If 'Maruti', 'Mahindra', 'BMW' are frequent, keep them and mark rare brands as 'Other' before OHE.

# 6. OHE in Python - Pandas
# - Use pandas 'get_dummies' function:
#   pd.get_dummies(data, columns=['Color'], drop_first=True)
# - drop_first=True drops the first category's column of each encoded feature, which avoids the dummy variable trap.

# 7. OHE in Python - sklearn
# - Use sklearn's OneHotEncoder for more complex ML pipelines.
#   from sklearn.preprocessing import OneHotEncoder
#   encoder = OneHotEncoder(drop='first', sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
#   X_encoded = encoder.fit_transform(X[['Color']])
# - Need to select appropriate columns and join encoded features back to the main DataFrame.
# - Use 'ColumnTransformer' if different columns need different preprocessing steps.

# 8. Practical Example Workflow
# - Identify categorical columns ('Color', 'Fuel', 'Owner').
# - Perform value counts to identify high cardinality.
# - Decide which categories to keep and which to group.
# - Apply OHE either via pandas or sklearn.
# - Always check the feature shape after transformation to confirm the expected column count.

# 9. Multi-Column OHE
# - Sometimes, only specific columns in the DataFrame need encoding.
# - Extract those columns, apply OHE, merge with numerical columns back.
# - Use sklearn's ColumnTransformer for multiple encodings in ML pipelines.

# 10. Key Takeaways
# - Always convert categorical to numeric before ML modeling.
# - Use OHE for nominal columns, ordinal encoding for ordered columns.
# - Avoid dummy variable trap by dropping one OHE column.
# - Reduce dimensions for high cardinality by grouping rare categories.
# - Use proper Python tools for encoding (pandas/sklearn).
# - ColumnTransformer allows flexible encoding for ML pipelines.


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the car listings dataset (the path is relative to the notebook's working directory).
df = pd.read_csv('cars.csv')

In [4]:
# Peek at five randomly chosen rows to get a feel for the columns.
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1193,Mahindra,120000,Diesel,Third Owner,370000
4676,Audi,30000,Diesel,First Owner,2375000
133,Jaguar,45000,Diesel,First Owner,3200000
3687,Force,13000,Diesel,First Owner,1050000
667,Hyundai,98000,Petrol,Third Owner,275000


In [5]:
# Row count per brand. With this many distinct values, one-hot encoding the
# column directly would add one column per brand.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [6]:
# Number of distinct brands, i.e. how many columns a plain one-hot encoding of 'brand' would create.
df['brand'].nunique()

32

In [7]:
# Fuel types and how often each occurs (only a handful of categories).
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [8]:
# Ownership categories and how often each occurs.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# 1. OneHotEncoding Using Pandas

In [9]:
# One-hot encode 'fuel' and 'owner', keeping one indicator column per category
# (all k columns); the remaining columns pass through unchanged.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K - 1 OneHotEncoding

In [10]:
# Same encoding, but drop the first category of each feature so that only
# k - 1 indicator columns remain per feature.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. OneHotEncoding Using Sklearn

In [11]:
from sklearn.model_selection import train_test_split

# Pick features and target by column name rather than by position, so the
# split stays correct even if the CSV's column order changes.
feature_cols = ['brand', 'km_driven', 'fuel', 'owner']
target_col = 'selling_price'

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.2,
    random_state=2,
)

In [12]:
# Show both feature splits (pandas truncates the long frames in the output).
(X_train, X_test)

(         brand  km_driven    fuel         owner
 5571   Hyundai      35000  Diesel   First Owner
 2038      Jeep      60000  Diesel   First Owner
 2957   Hyundai      25000  Petrol   First Owner
 7618  Mahindra     130000  Diesel  Second Owner
 6684   Hyundai     155000  Diesel   First Owner
 ...        ...        ...     ...           ...
 3606      Ford      35000  Diesel   First Owner
 5704    Maruti     120000  Petrol   First Owner
 6637      Tata      15000  Petrol   First Owner
 2575    Maruti      32500  Diesel  Second Owner
 7336     Isuzu     121000  Diesel   First Owner
 
 [6502 rows x 4 columns],
          brand  km_driven    fuel         owner
 606    Hyundai      80000  Petrol   First Owner
 7575  Mahindra      70000  Diesel  Second Owner
 7705    Toyota      68089  Petrol   First Owner
 4305   Hyundai      70000  Petrol  Second Owner
 2685  Mahindra      97000  Diesel  Second Owner
 ...        ...        ...     ...           ...
 1537    Maruti      50000  Diesel   Firs

In [13]:
# Show the matching target splits.
(y_train, y_test)

(5571    1150000
 2038    1689999
 2957     580000
 7618     150000
 6684     320000
          ...   
 3606     620000
 5704     335000
 6637     450000
 2575     651000
 7336    1160000
 Name: selling_price, Length: 6502, dtype: int64,
 606      200000
 7575     750000
 7705    2000000
 4305      52000
 2685     509999
          ...   
 1537     650000
 6153     220000
 7954     320000
 2310     150000
 712      715000
 Name: selling_price, Length: 1626, dtype: int64)

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Encoder configured for k - 1 encoding: drop='first' omits the first category
# of every feature, and the result is a dense int32 array rather than a
# sparse matrix.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

In [22]:
# Learn the categories from the training split only, and encode it in one step.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [23]:
# Expect 7 encoded columns: fuel has 4 categories and owner has 5, and
# drop='first' removes one column per feature -> (4 - 1) + (5 - 1) = 7.
X_train_new.shape

(6502, 7)

In [24]:
# Put the untouched columns and the one-hot block side by side.
# 'brand' is still text, so the combined array comes out with dtype=object.
np.hstack([X_train[['brand', 'km_driven']].values, X_train_new])

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

In [25]:
# Encode the test split with the encoder already fitted on X_train: transform
# only, no refit, so train and test end up with the same encoded columns.
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [26]:
# Inspect the encoded test features.
X_test_new

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [29]:
# Same side-by-side assembly for the test split: raw 'brand' and 'km_driven'
# followed by the encoded fuel/owner columns (dtype=object because of 'brand').
np.hstack([X_test[['brand', 'km_driven']].values, X_test_new])

array([['Hyundai', 80000, 0, ..., 0, 0, 0],
       ['Mahindra', 70000, 1, ..., 1, 0, 0],
       ['Toyota', 68089, 0, ..., 0, 0, 0],
       ...,
       ['Maruti', 40000, 0, ..., 0, 0, 0],
       ['Hyundai', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 50000, 1, ..., 0, 0, 0]], dtype=object)

# 4. OneHotEncoding with Top Categories

In [30]:
# Per-brand row counts, kept in a variable so rare brands can be filtered out.
counts = df['brand'].value_counts()

In [34]:
# Brands with at most `threshold` rows count as rare (the filter used in the
# following cells is `counts <= threshold`).
threshold = 100

In [35]:
# Brands whose row count is at or below the threshold -- these are the rare ones.
counts[counts <= threshold].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [36]:
# Labels of the rare brands, i.e. the values that will be replaced.
repl = counts.loc[counts <= threshold].index

In [38]:
# Collapse every rare brand into a single 'uncommon' bucket, one-hot encode
# the result, and show five random rows of it.
(
    pd.get_dummies(df['brand'].replace(to_replace=repl, value='uncommon'))
    .sample(n=5)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
7547,False,False,False,False,True,False,False,False,False,False,False,False,False
5808,False,False,False,False,False,False,False,False,False,False,False,True,False
1583,False,False,False,False,False,False,True,False,False,False,False,False,False
2513,False,False,False,False,False,False,True,False,False,False,False,False,False
4933,False,False,False,False,False,False,True,False,False,False,False,False,False
