# Column Transformation:
**In machine learning, transforming each column one by one is a tedious and time-consuming task. To simplify this workflow, scikit-learn provides the `ColumnTransformer` class, which can apply different transformations to different columns in a single step.**

In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [28]:
# Load the CSV and keep only the three categorical features plus the Sales target.
# NOTE(review): the path is an absolute, machine-specific location; consider a
# configurable relative data directory so the notebook runs elsewhere.
selected_columns = ['Ship Mode', 'State', 'Category', 'Sales']
df = pd.read_csv("C:\\Users\\Maryam\\100 Days of ML\\train.csv")[selected_columns]
df.head()

Unnamed: 0,Ship Mode,State,Category,Sales
0,Second Class,Kentucky,Furniture,261.96
1,Second Class,Kentucky,Furniture,731.94
2,Second Class,California,Office Supplies,14.62
3,Standard Class,Florida,Furniture,957.5775
4,Standard Class,Florida,Office Supplies,22.368


In [29]:
# Show the (rows, columns) size of the frame.
df.shape

(9800, 4)

In [30]:
# Count missing values in each column (isna is the canonical name for isnull).
df.isna().sum()

Ship Mode    0
State        0
Category     0
Sales        0
dtype: int64

In [None]:
# Encoding plan:
# Ship Mode ---> OrdinalEncoder
# State ---> OneHotEncoder (OHE)
# Category ---> OneHotEncoder (OHE)

In [32]:
# Number of rows for each distinct State value.
state_counts = df['State'].value_counts()

In [43]:
# Labels of the states that appear in at most 150 rows.
others = state_counts.loc[state_counts.le(150)].index

In [44]:
# Collapse every state listed in `others` into a single 'Others' label.
is_rare_state = df['State'].isin(others)
df['State'] = df['State'].mask(is_rare_state, 'Others')

In [45]:
# Row count for each Category value.
df['Category'].value_counts()

Category
Office Supplies    5909
Furniture          2078
Technology         1813
Name: count, dtype: int64

In [46]:
# Row count for each State value as currently stored in df.
df['State'].value_counts()

State
California        1946
Others            1902
New York          1097
Texas              973
Pennsylvania       582
Washington         504
Illinois           483
Ohio               454
Florida            373
Michigan           253
North Carolina     247
Virginia           224
Arizona            223
Tennessee          183
Colorado           179
Georgia            177
Name: count, dtype: int64

In [53]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Sales'),  # features: Ship Mode, State, Category
    df['Sales'],               # target
    test_size=0.3,
    random_state=2,
)

In [56]:
# Build one transformer that encodes each feature column with its own encoder:
#   - 'Ship Mode'          -> integer codes from OrdinalEncoder, in the explicit order given
#   - 'State', 'Category'  -> dense one-hot columns, dropping the first level of each
# Columns not listed in any step are passed through unchanged.
# NOTE(review): the category list places 'Same Day' after 'Standard Class';
# confirm this is the intended ranking of ship modes.
transformer = ColumnTransformer(transformers=[
    ('tf1', OrdinalEncoder(categories=[['First Class', 'Second Class', 'Standard Class', 'Same Day']], dtype=int), ['Ship Mode']),
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
    ('tf2', OneHotEncoder(drop='first', sparse_output=False, dtype=int), ['State', 'Category'])
], remainder='passthrough')

In [59]:
# Fit the encoders on the training features and replace X_train with the encoded result.
# NOTE(review): X_train is overwritten, so re-running this cell alone passes the
# already-encoded data back into fit_transform; re-run the split cell first.
X_train = transformer.fit_transform(X_train)



In [61]:
# Encode the test features with transform only (no fit call here), replacing X_test.
X_test = transformer.transform(X_test)