In [88]:
# onehot encoding 

In [89]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [90]:

# Toy employee records: one numeric id plus two categorical features
# (Gender and Remarks) that the following cells will one-hot encode.
data = {
    'Employee id': [10, 20, 15, 25, 30],
    'Gender': ['M', 'F', 'F', 'M', 'F'],
    'Remarks': ['Good', 'Nice', 'Good', 'Great', 'Nice'],
}

# Each dict key becomes a column; the list positions become the rows.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Employee id,Gender,Remarks
0,10,M,Good
1,20,F,Nice
2,15,F,Good
3,25,M,Great
4,30,F,Nice


In [91]:
# One-hot encode both categorical columns with pd.get_dummies().
# drop_first=False keeps an indicator column for every category, so the
# result has 3 Remarks_* columns and 2 Gender_* columns next to the id.
df_pandas_encoded = pd.get_dummies(
    data=df,
    columns=['Remarks', 'Gender'],
    drop_first=False,
)
df_pandas_encoded

Unnamed: 0,Employee id,Remarks_Good,Remarks_Great,Remarks_Nice,Gender_F,Gender_M
0,10,True,False,False,False,True
1,20,False,False,True,True,False
2,15,True,False,False,True,False
3,25,False,True,False,False,True
4,30,False,False,True,True,False


In [92]:
# Alternative: pass a single column (a Series) instead of the whole frame.
# No prefix is added in this form, so the indicator columns are named after
# the raw category values.
df_pandas_enc = pd.get_dummies(df['Remarks'])
df_pandas_enc

Unnamed: 0,Good,Great,Nice
0,True,False,False
1,False,False,True
2,True,False,False
3,False,True,False
4,False,False,True


We can observe that the encoded data has 3 `Remarks` columns and 2 `Gender` columns, one per unique label. However, a feature with n unique labels can be fully described by just n-1 columns. For example, if we keep only the `Gender_F` column and drop the `Gender_M` column, we still convey the entire information: when the value is 1 it means female, and when the value is 0 it means male. This way we can encode the categorical data and reduce the number of parameters as well.

In [93]:
# scikit-learn encoder configured to return a dense array of int32 indicators
# (sparse_output=False) rather than a sparse matrix.
# np.int32 requires numpy to be imported before this cell runs; the import
# belongs in the notebook's import cell so a fresh Restart & Run All works.
ohe = OneHotEncoder(sparse_output=False, dtype=np.int32)

# Re-display the sample frame that is about to be encoded.
df

Unnamed: 0,Employee id,Gender,Remarks
0,10,M,Good
1,20,F,Nice
2,15,F,Good
3,25,M,Great
4,30,F,Nice


In [94]:
# Learn the categories of the two categorical columns and encode them in one
# call; the result is a plain array with one column per learned category.
encoded_df = ohe.fit_transform(df.loc[:, ['Remarks', 'Gender']])
encoded_df.shape


(5, 5)

In [95]:
# Shape of the original frame, for comparison with the encoded array: the
# row counts must match before the two can be stacked side by side.
df.shape

(5, 3)

In [96]:
# Put the numeric id column in front of the encoded indicator columns.
# The double brackets keep 'Employee id' two-dimensional (n_rows x 1), which
# np.hstack needs in order to concatenate it column-wise with encoded_df.
# numpy is imported once, with the other dependencies, in the import cell.
df_ohe = np.hstack((df[['Employee id']].values, encoded_df))
df_ohe

array([[10,  1,  0,  0,  0,  1],
       [20,  0,  0,  1,  1,  0],
       [15,  1,  0,  0,  1,  0],
       [25,  0,  1,  0,  0,  1],
       [30,  0,  0,  1,  1,  0]], dtype=int64)

In [97]:
# New dataset: load the cars data from cars.csv (path is relative to the
# notebook's working directory).
# NOTE(review): this rebinds `df`, so the sample employee frame built earlier
# is no longer reachable under that name; re-running earlier cells after this
# one would operate on the cars data instead.
df=pd.read_csv('cars.csv')
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [98]:
# Number of distinct brands; encoding this column as-is would produce one
# indicator column per brand.
df['brand'].nunique()

32

In [99]:
# Brands that occur in at most this many rows are grouped into a single
# 'others' category before encoding.
threshold=100 
 

In [100]:
# Rows per brand, most frequent first; used to decide which brands are rare.
counts = df.brand.value_counts()
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [101]:
# Labels of every brand whose row count is at or below the threshold; these
# are the categories that will be replaced.
repl_cat = counts.loc[counts.le(threshold)].index
repl_cat

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [102]:
# Collapse every rare brand into the single label 'others' first, then
# one-hot encode: one indicator column per remaining brand plus one shared
# 'others' column.
df_dummies = pd.get_dummies(
    df['brand'].replace(to_replace=repl_cat, value='others')
)
df_dummies

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False


In [103]:
# Cast the boolean indicator columns to integers (False -> 0, True -> 1).
# Re-running this cell is harmless: casting integers to int is a no-op.
df_dummies=df_dummies.astype(int)
df_dummies

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


In [104]:
# Now apply scikit-learn's OneHotEncoder to the 'fuel' column.
# A fresh encoder is created (dense output, int32 indicators), replacing the
# one fitted on the sample data earlier.
ohe=OneHotEncoder(sparse_output=False,dtype=np.int32)

In [105]:
# Fit the encoder on the fuel column and encode it. The list selector keeps
# the input two-dimensional, as the encoder expects.
# Note: this rebinds df_ohe, which previously held the stacked sample array.
df_ohe = ohe.fit_transform(df.loc[:, ['fuel']])
df_ohe

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

In [106]:
# One row per car and one column per fuel category learned by the encoder.
df_ohe.shape

(8128, 4)

In [107]:
# Shape of the source frame; its row count must equal that of the encoded
# array for the column-wise stack in the next cell to work.
df.shape

(8128, 5)

In [108]:
# Place the remaining (non-fuel) columns next to the fuel indicators.
# Strings and integers end up in the same array, so NumPy stores the result
# with dtype=object and the column names are not carried over.
df_ohe_encoded = np.hstack(
    (df[['brand', 'km_driven', 'owner', 'selling_price']].values, df_ohe)
)
df_ohe_encoded

array([['Maruti', 145500, 'First Owner', ..., 1, 0, 0],
       ['Skoda', 120000, 'Second Owner', ..., 1, 0, 0],
       ['Honda', 140000, 'Third Owner', ..., 0, 0, 1],
       ...,
       ['Maruti', 120000, 'First Owner', ..., 1, 0, 0],
       ['Tata', 25000, 'First Owner', ..., 1, 0, 0],
       ['Tata', 25000, 'First Owner', ..., 1, 0, 0]], dtype=object)