# ColumnTransformer
- A ColumnTransformer in machine learning (specifically in Python's scikit-learn) is a tool used to apply different preprocessing steps to different columns of your dataset.
### Why it's useful:
- In real-world datasets, different types of features require different preprocessing. For example:

- Numeric features might need scaling (e.g., StandardScaler).

- Categorical features might need encoding (e.g., OneHotEncoder).

Doing this manually for each column is inefficient. ColumnTransformer automates this process.

# Without Column transformer------

In [107]:
import pandas as pd

# Load data
df = pd.read_csv('people_data_with_target.csv')




In [108]:
df.head()

Unnamed: 0,Age,Gender,EducationLevel,City,Income,HighIncome
0,56,Male,High School,Los Angeles,102762,1
1,46,Male,Bachelors,Houston,100020,1
2,32,Male,Masters,New York,77310,0
3,60,Male,PhD,Los Angeles,38405,0
4,25,Male,Bachelors,Chicago,58522,0


In [109]:
df.describe()

Unnamed: 0,Age,Income,HighIncome
count,500.0,500.0,500.0
mean,41.278,71212.558,0.342
std,13.389072,20784.357982,0.474855
min,18.0,17803.0,0.0
25%,30.0,57747.25,0.0
50%,42.0,71026.0,0.0
75%,52.0,85387.0,1.0
max,64.0,132097.0,1.0


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             500 non-null    int64 
 1   Gender          500 non-null    object
 2   EducationLevel  470 non-null    object
 3   City            500 non-null    object
 4   Income          500 non-null    int64 
 5   HighIncome      500 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 23.6+ KB


In [110]:
df.shape

(500, 6)

In [111]:
df.isnull().sum()

Age                0
Gender             0
EducationLevel    30
City               0
Income             0
HighIncome         0
dtype: int64

In [50]:
# Drop 'Income' column to avoid leakage
df = df.drop(columns=['Income'])


In [112]:
df.sample(5)


Unnamed: 0,Age,Gender,EducationLevel,City,Income,HighIncome
52,24,Male,,Los Angeles,64033,0
80,38,Female,Bachelors,Houston,68342,0
364,40,Female,Masters,Phoenix,68789,0
259,59,Male,High School,Chicago,47449,0
227,23,Female,High School,New York,60411,0


In [113]:
df['City'].value_counts()

City
Phoenix        107
Chicago        106
Los Angeles    105
New York        96
Houston         86
Name: count, dtype: int64

In [114]:
df['EducationLevel'].value_counts()

EducationLevel
Masters        129
Bachelors      119
High School    118
PhD            104
Name: count, dtype: int64

In [115]:
# Missing value imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
df["EducationLevel"] = imputer.fit_transform(df[["EducationLevel"]]).ravel()



In [116]:
df.isnull().sum()


Age               0
Gender            0
EducationLevel    0
City              0
Income            0
HighIncome        0
dtype: int64

In [56]:
df["EducationLevel"].value_counts()

EducationLevel
Masters        159
Bachelors      119
High School    118
PhD            104
Name: count, dtype: int64

In [117]:
df["EducationLevel"].unique()


array(['High School', 'Bachelors', 'Masters', 'PhD'], dtype=object)

In [104]:
from sklearn.preprocessing import OrdinalEncoder
oe=OrdinalEncoder(categories=[["High School","Bachelors","Masters","PhD"]])
Apply the transformation
df["EducationLevel"] = oe.fit_transform(df[["EducationLevel"]]).astype(int)

In [118]:
df.sample(10)

Unnamed: 0,Age,Gender,EducationLevel,City,Income,HighIncome
235,46,Male,Masters,Chicago,56757,0
170,61,Female,High School,Houston,37730,0
427,36,Male,Bachelors,Chicago,67666,0
0,56,Male,High School,Los Angeles,102762,1
121,54,Male,Masters,Chicago,78482,0
420,64,Female,High School,Chicago,62208,0
264,19,Female,High School,Phoenix,43763,0
249,56,Male,Bachelors,Houston,75308,0
457,46,Female,Bachelors,Los Angeles,70697,0
321,51,Male,PhD,New York,62072,0


In [119]:
# label encodimg
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
# Apply LabelEncoder to the 'HighIncome' column
df['HighIncome'] = le.fit_transform(df['HighIncome'])

In [106]:
df.sample(5)

Unnamed: 0,Age,Gender,EducationLevel,City,Income,HighIncome
458,31,Female,0,Houston,56328,0
192,61,Female,0,Los Angeles,26975,0
488,33,Female,0,New York,112578,1
21,55,Female,3,Chicago,51764,0
114,26,Female,1,Phoenix,54660,0


In [120]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['HighIncome']),df['HighIncome'],test_size=0.2)




In [121]:
# adding simple imputer 
si = SimpleImputer(strategy="most_frequent")
X_train_edu_lvl = si.fit_transform(X_train[['EducationLevel']])

# also the test data
X_test_edu_lvl = si.fit_transform(X_test[['EducationLevel']])
                                 
X_train_edu_lvl.shape

(400, 1)

In [124]:
# OneHotEncoding -> gender,city
ohe = OneHotEncoder(drop='first',sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['Gender','City']])

# also the test data
X_test_gender_city = ohe.fit_transform(X_test[['Gender','City']])

X_train_gender_city.shape


(400, 5)

In [126]:
from sklearn.preprocessing import OrdinalEncoder
# Apply ordinal encoding
oe = OrdinalEncoder(categories=[["High School", "Bachelors", "Masters", "PhD"]])
X_train_trs_edu_lvl = oe.fit_transform(X_train[["EducationLevel"]])
X_test_trs_edu_lvl = oe.transform(X_test[["EducationLevel"]])
X_train_trs_edu_lvl.shape

(400, 1)

In [130]:
# Extracting Age
X_train_age = X_train.drop(columns=['Gender','EducationLevel','City']).values

# also the test data
X_test_age = X_test.drop(columns=['Gender','EducationLevel','City']).values

X_train_age.shape

(400, 2)

In [136]:
# also the test data
X_test_transformed = np.concatenate((X_train_age,X_train_gender_city,X_train_trs_edu_lvl,),axis=1)

In [137]:
X_test_transformed.shape

(400, 8)

In [139]:
X_test_transformed

array([[1.80000e+01, 1.00426e+05, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       [1.90000e+01, 9.65410e+04, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       [1.80000e+01, 8.15070e+04, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 3.00000e+00],
       ...,
       [5.30000e+01, 6.50690e+04, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.80000e+01, 3.06170e+04, 1.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 3.00000e+00],
       [5.00000e+01, 6.18430e+04, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]], shape=(400, 8))

# Using Column tranformation

In [173]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['HighIncome']),df['HighIncome'],test_size=0.2)


In [175]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import pandas as pd

In [176]:

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

transformer = ColumnTransformer(transformers=[
    ('impute_education', SimpleImputer(strategy='most_frequent'), ['EducationLevel']),
    ('onehot_gender', OneHotEncoder(sparse_output=False, drop='first'), ['Gender']),
    ('onehot_city', OneHotEncoder(sparse_output=False, drop='first'), ['City'])
], remainder='passthrough')



In [1]:
transformer.fit_transform(X_train).shape


NameError: name 'transformer' is not defined

In [185]:

X_train_transformed=transformer.transform(X_test).shape


In [190]:
from scipy.sparse import csr_matrix

# Step 1: Apply the transformation
X_transformed = transformer.fit_transform(X_train)

# Step 2: Convert to dense array if sparse (to avoid errors)
X_transformed = X_transformed.toarray() if isinstance(X_transformed, csr_matrix) else X_transformed

# Step 3: Get the feature names after transformation
feature_names = transformer.get_feature_names_out()

# Step 4: Ensure the transformed data shape matches the number of feature names
print(X_transformed.shape)
print(len(feature_names))

# Step 5: Convert the transformed data to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

# Step 6: Display the DataFrame
print(X_transformed_df.head())



(400, 8)
8
  impute_education__EducationLevel onehot_gender__Gender_Male  \
0                          Masters                        1.0   
1                          Masters                        1.0   
2                              PhD                        0.0   
3                      High School                        0.0   
4                          Masters                        1.0   

  onehot_city__City_Houston onehot_city__City_Los Angeles  \
0                       0.0                           1.0   
1                       1.0                           0.0   
2                       0.0                           0.0   
3                       0.0                           0.0   
4                       0.0                           0.0   

  onehot_city__City_New York onehot_city__City_Phoenix remainder__Age  \
0                        0.0                       0.0             46   
1                        0.0                       0.0             41   
2           