In [1]:
import pandas as pd

In [2]:
# Load the car-sales CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('sklearn_data/car-sales-extended.csv')

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 5)

In [4]:
# Preview the first two rows to see the column names and value types.
df.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943


***Split into features (X) and target (y)***

In [5]:
# Target: the column the model should predict.
y = df['Price']
# Features: every column except the target.
X = df.drop(columns=['Price'])

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode. Doors holds a door count but is treated as a
# category here rather than as a continuous measurement.
categorical = ['Make', 'Colour', 'Doors']
onehot = OneHotEncoder()

transformer = ColumnTransformer(
    transformers=[('one_hot', onehot, categorical)],
    # Columns not listed above are appended unchanged after the encoded ones.
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) the result
    # switches between sparse and dense depending on the data's density, and a
    # sparse result cannot be wrapped directly with pd.DataFrame(...).
    sparse_threshold=0,
)

In [8]:
# Display the transformer's repr to confirm how it is configured.
transformer

ColumnTransformer(remainder='passthrough',
                  transformers=[('one_hot', OneHotEncoder(),
                                 ['Make', 'Colour', 'Doors'])])

In [9]:
# Fit the encoder on X and encode X in a single step; columns that are not
# one-hot encoded are passed through unchanged.
transformedX = transformer.fit_transform(X)

In [11]:
# Wrap the transformed array in a DataFrame for a readable preview. Columns are
# positional (0, 1, ...): the one-hot indicator columns come first and the
# passed-through column comes last.
pd.DataFrame(transformedX).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


### Alternative way: one-hot encoding with `pd.get_dummies`

In [12]:
# Re-check the original columns before encoding them with pandas.
df.head(1)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323


In [13]:
# Convert the categorical columns to pandas' category dtype in one assignment.
categorical = ['Make', 'Colour', 'Doors']
df[categorical] = df[categorical].astype('category')

In [14]:
# Check the per-column dtypes and non-null counts after the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Make           1000 non-null   category
 1   Colour         1000 non-null   category
 2   Odometer (KM)  1000 non-null   int64   
 3   Doors          1000 non-null   category
 4   Price          1000 non-null   int64   
dtypes: category(3), int64(2)
memory usage: 19.2 KB


In [15]:
# Identify categorical features: list the columns whose dtype is `category`.
df.select_dtypes(['category']).columns

Index(['Make', 'Colour', 'Doors'], dtype='object')

In [21]:
# One-hot encode the categorical columns with pandas.
# `columns=` forces every listed column to be encoded regardless of its dtype;
# without it, get_dummies only encodes object/string/category columns, so a
# numeric Doors column would be passed through un-encoded.
# `drop_first=True` omits the first level of each feature, since it is implied
# when all of that feature's remaining indicator columns are 0.
to_encode = ['Make', 'Doors', 'Colour']
dummies = pd.get_dummies(df[to_encode], columns=to_encode, drop_first=True)

In [23]:
# Preview the indicator columns; names follow the `<column>_<level>` pattern.
dummies.head()

Unnamed: 0,Make_Honda,Make_Nissan,Make_Toyota,Doors_4,Doors_5,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,1,0,0,1,0,0,0,0,1
1,0,0,0,0,1,1,0,0,0
2,1,0,0,1,0,0,0,0,1
3,0,0,1,1,0,0,0,0,1
4,0,1,0,0,0,1,0,0,0


In [24]:
# Append the indicator columns to the right of the existing columns; rows are
# aligned on the index. NOTE: this rebinds `df`, so run the cell only once.
df = pd.concat(objs=[df, dummies], axis='columns')

In [26]:
# Remove the original categorical columns, leaving only numeric columns.
# The labels are passed directly via `columns=` instead of passing a sub-frame,
# which only worked because pandas iterates a DataFrame by its column names.
df = df.drop(columns=['Make', 'Doors', 'Colour'])

In [27]:
# Inspect the resulting frame after the encoding steps above.
df.head()

Unnamed: 0,Odometer (KM),Price,Make_Honda,Make_Nissan,Make_Toyota,Doors_4,Doors_5,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,35431,15323,1,0,0,1,0,0,0,0,1
1,192714,19943,0,0,0,0,1,1,0,0,0
2,84714,28343,1,0,0,1,0,0,0,0,1
3,154365,13434,0,0,1,1,0,0,0,0,1
4,181577,14043,0,1,0,0,0,1,0,0,0
