In [None]:
import numpy as np 
import pandas as pd

In [2]:
# Read the used-car listings CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='/content/cars.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# Listings per brand, most frequent first (shows how long the tail of rare brands is).
df["brand"].value_counts(ascending=False)

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

# One-Hot Encoding using Pandas

## One Hot Encoding is a technique used to convert categorical variables into numerical values, where each category is represented as a binary feature (also known as a dummy variable). The purpose of One Hot Encoding is to capture the relationship between the categories and allow machine learning algorithms to utilize this information.

## In Pandas, One Hot Encoding can be easily performed using the get_dummies function. This function takes a categorical feature as input and creates a new binary feature for each category in the feature, where each feature is either 0 or 1, depending on whether the observation belongs to that category or not.

## For example, suppose you have a categorical feature "color" with the categories "red", "green", and "blue". Using one-hot encoding in Pandas, you could create three new binary features, "color_red", "color_green", and "color_blue", where each feature is either 0 or 1, depending on the value of the original "color" feature.

## One Hot Encoding has several advantages, such as the ability to capture complex relationships between categories, handling categorical features with a large number of categories, and the ability to work well with many machine learning algorithms. However, it can also lead to an increase in the number of features, which can be computationally expensive and may affect the performance of some algorithms.

In [5]:
# One-hot encode the 'fuel' and 'owner' columns; all other columns pass through unchanged.
# The dtype is pinned because pandas 2.0 changed the get_dummies default from uint8 to
# bool, which would render the indicator columns as True/False instead of 0/1.
pd.get_dummies(df, columns=['fuel', 'owner'], dtype='uint8')

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# K-1 Hot Encoding

## K-1 hot encoding is a variation of one-hot encoding, where instead of creating a binary feature for each category in a categorical feature, k-1 binary features are created, with the one dropped category being represented by the absence of a 1 in all the features. With `drop_first=True`, Pandas drops the first category of each feature (for example, `fuel_CNG` below).

## The purpose of K-1 hot encoding is to reduce the dimensionality of the data, as compared to one-hot encoding, which can create a large number of new features for a categorical feature with many categories. By using K-1 hot encoding, one feature is dropped, reducing the number of new features created.

## For example, suppose you have a categorical feature "color" with the categories "red", "green", and "blue". Using one-hot encoding, you would create three new binary features, "color_red", "color_green", and "color_blue", where each feature is either 0 or 1. However, using K-1 hot encoding, you would create only two new binary features, "color_red" and "color_green", where the absence of a 1 in both features represents the "blue" category.

## It is important to note that K-1 hot encoding assumes that there is no meaningful ordinal relationship between the categories, as in one-hot encoding. The choice between one-hot encoding and K-1 hot encoding will depend on the specific problem and the nature of the data. In some cases, K-1 hot encoding may be a good choice to reduce the dimensionality of the data, but in others, one-hot encoding may be more appropriate to capture the relationships between categories.

In [6]:
# K-1 encoding: drop_first=True omits the first category of each encoded column, so that
# category is represented by all of its sibling indicator columns being 0.
# The dtype is pinned because pandas 2.0 changed the get_dummies default from uint8 to
# bool, which would render the indicator columns as True/False instead of 0/1.
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True, dtype='uint8')
 

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


# One-Hot Encoding using Sklearn

## While Pandas provides a convenient method for one-hot encoding, scikit-learn's OneHotEncoder class is often preferred when building machine learning models due to its integration with the rest of the scikit-learn library, flexibility, and consistent interface for encoding categorical data.

In [8]:
from sklearn.model_selection import train_test_split

# Inputs are the first four columns (by position); the target is the last column.
# 20% of the rows are held out for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Encoder configured to drop the first category of each feature (k-1 encoding).
ohe = OneHotEncoder(drop="first")

In [23]:
# Learn the categories from the training split only and encode it;
# .toarray() turns the encoder's output into a dense NumPy array.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']]).toarray()

In [24]:
# Encode the test split with the already-fitted encoder (no refitting on test data).
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']]).toarray()

In [25]:
# Put the untouched columns next to the encoded ones and check the resulting shape.
np.hstack([X_train[['brand', 'km_driven']].to_numpy(), X_train_new]).shape

(6502, 9)

# One Hot Encoding with Top Categories

In [28]:
# Number of listings per brand, used below to decide which brands are rare.
counts = df.loc[:, 'brand'].value_counts()

In [29]:
# Brands with at most this many listings are treated as rare in the following cells.
threshold = 100
# The cardinality check is kept as the final expression so the notebook displays it;
# as the first statement of the cell its value was computed and silently discarded.
df['brand'].nunique()

In [30]:
# Labels of the brands whose listing count does not exceed the threshold.
repl = counts.loc[counts.le(threshold)].index

In [31]:
# Collapse every rare brand into a single 'uncommon' label, then one-hot encode the result.
# The dtype is pinned because pandas 2.0 changed the get_dummies default from uint8 to
# bool, which would render the indicator columns as True/False instead of 0/1.
pd.get_dummies(df['brand'].replace(repl, 'uncommon'), dtype='uint8')

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
