# Column Transformer

    ColumnTransformer allows you to apply different transformations to different columns of your dataset.
    It's particularly useful when working with datasets containing heterogeneous data types (e.g., numeric and categorical features).
    By encapsulating preprocessing steps within a ColumnTransformer, you can create more efficient and flexible machine learning pipelines.

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

#### Column names are stored in variables

In [2]:
# Toy dataset: one numeric column (sales) and two categorical columns
# (city, size), twelve rows each.
data = {
    'sales': [
        100000, 222000, 1000000, 522000, 111111, 222222,
        1111111, 20000, 75000, 90000, 1000000, 10000,
    ],
    'city': [
        'Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Jacksonville',
        'Miami', 'Miami', 'Orlando', 'Orlando', 'Orlando', 'Orlando',
    ],
    'size': [
        'Small', 'Medium', 'Large', 'Large', 'Small', 'Medium',
        'Large', 'Small', 'Medium', 'Medium', 'Medium', 'Small',
    ],
}

df = pd.DataFrame(data=data)
df  # last expression in the cell, so the notebook renders the frame

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small
5,222222,Jacksonville,Medium
6,1111111,Miami,Large
7,20000,Miami,Small
8,75000,Orlando,Medium
9,90000,Orlando,Medium


In [3]:
# Rebuild the toy dataset so this cell can be run on its own.
data = {'sales': [100000,222000,1000000,522000,111111,222222,1111111,20000,75000,90000,1000000,10000],
      'city': ['Tampa','Tampa','Orlando','Jacksonville','Miami','Jacksonville','Miami','Miami','Orlando','Orlando','Orlando','Orlando'],
      'size': ['Small', 'Medium','Large','Large','Small','Medium','Large','Small','Medium','Medium','Medium','Small']}

df = pd.DataFrame(data)

# Ordinal order, lowest to highest, spelled out explicitly.
# df['size'].unique() returns values in order of first appearance, so the
# resulting codes would silently change if the rows were reordered; the
# order only happened to be Small < Medium < Large for this particular data.
size = ['Small', 'Medium', 'Large']

# Column lists kept in variables so they can be reused or edited in one place.
OHE_column = ['city']
ORE_column = ['size']

OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ORE = OrdinalEncoder(categories=[size])

# One-hot encode 'city', ordinal-encode 'size', and keep every other
# column ('sales') unchanged via remainder='passthrough'.
CT = make_column_transformer(
    (OHE, OHE_column),
    (ORE, ORE_column),
    remainder='passthrough',
)

# Ask the transformer to return a pandas DataFrame with named columns.
CT.set_output(transform='pandas')

CT.fit_transform(df)

Unnamed: 0,onehotencoder__city_Jacksonville,onehotencoder__city_Miami,onehotencoder__city_Orlando,onehotencoder__city_Tampa,ordinalencoder__size,remainder__sales
0,0.0,0.0,0.0,1.0,0.0,100000
1,0.0,0.0,0.0,1.0,1.0,222000
2,0.0,0.0,1.0,0.0,2.0,1000000
3,1.0,0.0,0.0,0.0,2.0,522000
4,0.0,1.0,0.0,0.0,0.0,111111
5,1.0,0.0,0.0,0.0,1.0,222222
6,0.0,1.0,0.0,0.0,2.0,1111111
7,0.0,1.0,0.0,0.0,0.0,20000
8,0.0,0.0,1.0,0.0,1.0,75000
9,0.0,0.0,1.0,0.0,1.0,90000


#### Without storing the column names in variables

In [4]:
# Rebuild the toy dataset so this cell can be run on its own.
data = {'sales': [100000,222000,1000000,522000,111111,222222,1111111,20000,75000,90000,1000000,10000],
      'city': ['Tampa','Tampa','Orlando','Jacksonville','Miami','Jacksonville','Miami','Miami','Orlando','Orlando','Orlando','Orlando'],
      'size': ['Small', 'Medium','Large','Large','Small','Medium','Large','Small','Medium','Medium','Medium','Small']}

df = pd.DataFrame(data)

# Ordinal order, lowest to highest, spelled out explicitly.
# df['size'].unique() returns values in order of first appearance, so the
# resulting codes would silently change if the rows were reordered; the
# order only happened to be Small < Medium < Large for this particular data.
size = ['Small', 'Medium', 'Large']

OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ORE = OrdinalEncoder(categories=[size])

# Column lists are passed inline here instead of through variables.
# 'sales' is not listed, so remainder='passthrough' keeps it unchanged.
CT = make_column_transformer(
    (OHE, ['city']),
    (ORE, ['size']),
    remainder='passthrough',
)

# Ask the transformer to return a pandas DataFrame with named columns.
CT.set_output(transform='pandas')

CT.fit_transform(df)


Unnamed: 0,onehotencoder__city_Jacksonville,onehotencoder__city_Miami,onehotencoder__city_Orlando,onehotencoder__city_Tampa,ordinalencoder__size,remainder__sales
0,0.0,0.0,0.0,1.0,0.0,100000
1,0.0,0.0,0.0,1.0,1.0,222000
2,0.0,0.0,1.0,0.0,2.0,1000000
3,1.0,0.0,0.0,0.0,2.0,522000
4,0.0,1.0,0.0,0.0,0.0,111111
5,1.0,0.0,0.0,0.0,1.0,222222
6,0.0,1.0,0.0,0.0,2.0,1111111
7,0.0,1.0,0.0,0.0,0.0,20000
8,0.0,0.0,1.0,0.0,1.0,75000
9,0.0,0.0,1.0,0.0,1.0,90000


## Now using `remainder='drop'` to discard the columns that are not transformed

In [5]:
# Rebuild the toy dataset so this cell can be run on its own.
data = {'sales': [100000,222000,1000000,522000,111111,222222,1111111,20000,75000,90000,1000000,10000],
      'city': ['Tampa','Tampa','Orlando','Jacksonville','Miami','Jacksonville','Miami','Miami','Orlando','Orlando','Orlando','Orlando'],
      'size': ['Small', 'Medium','Large','Large','Small','Medium','Large','Small','Medium','Medium','Medium','Small']}

df = pd.DataFrame(data)

# Ordinal order, lowest to highest, spelled out explicitly.
# df['size'].unique() returns values in order of first appearance, so the
# resulting codes would silently change if the rows were reordered; the
# order only happened to be Small < Medium < Large for this particular data.
size = ['Small', 'Medium', 'Large']

OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ORE = OrdinalEncoder(categories=[size])

# remainder='drop' discards every column not listed in a transformer,
# so 'sales' does not appear in the output.
CT = make_column_transformer(
    (OHE, ['city']),
    (ORE, ['size']),
    remainder='drop',
)

# Ask the transformer to return a pandas DataFrame with named columns.
CT.set_output(transform='pandas')

CT.fit_transform(df)


Unnamed: 0,onehotencoder__city_Jacksonville,onehotencoder__city_Miami,onehotencoder__city_Orlando,onehotencoder__city_Tampa,ordinalencoder__size
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,0.0,2.0
3,1.0,0.0,0.0,0.0,2.0
4,0.0,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,2.0
7,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,1.0
9,0.0,0.0,1.0,0.0,1.0


### Pass one column through unchanged and drop the remaining column

In [6]:
# Rebuild the toy dataset so this cell can be run on its own.
data = {'sales': [100000,222000,1000000,522000,111111,222222,1111111,20000,75000,90000,1000000,10000],
      'city': ['Tampa','Tampa','Orlando','Jacksonville','Miami','Jacksonville','Miami','Miami','Orlando','Orlando','Orlando','Orlando'],
      'size': ['Small', 'Medium','Large','Large','Small','Medium','Large','Small','Medium','Medium','Medium','Small']}

df = pd.DataFrame(data)

# Ordinal order, lowest to highest, spelled out explicitly rather than
# taken from df['size'].unique(), whose order follows first appearance
# in the data.
size = ['Small', 'Medium', 'Large']

OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# NOTE: ORE is not used by the transformer below; 'size' is passed
# through unchanged instead of being ordinal-encoded.
ORE = OrdinalEncoder(categories=[size])

# One-hot encode 'city', keep 'size' as-is via the 'passthrough'
# transformer, and drop everything else ('sales') via remainder='drop'.
CT = make_column_transformer(
    (OHE, ['city']),
    ('passthrough', ['size']),
    remainder='drop',
)

# Ask the transformer to return a pandas DataFrame with named columns.
CT.set_output(transform='pandas')

CT.fit_transform(df)


Unnamed: 0,onehotencoder__city_Jacksonville,onehotencoder__city_Miami,onehotencoder__city_Orlando,onehotencoder__city_Tampa,passthrough__size
0,0.0,0.0,0.0,1.0,Small
1,0.0,0.0,0.0,1.0,Medium
2,0.0,0.0,1.0,0.0,Large
3,1.0,0.0,0.0,0.0,Large
4,0.0,1.0,0.0,0.0,Small
5,1.0,0.0,0.0,0.0,Medium
6,0.0,1.0,0.0,0.0,Large
7,0.0,1.0,0.0,0.0,Small
8,0.0,0.0,1.0,0.0,Medium
9,0.0,0.0,1.0,0.0,Medium
