In [2]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seed for the train/test shuffle so Restart & Run All reproduces the same split
RANDOM_STATE = 42

# Sample dataset
df = pd.DataFrame({
    'age': [25, 32, 47, 51, 62],
    'income': [50000, 64000, 120000, 110000, 150000],
    'fuel': ['Petrol', 'Diesel', 'CNG', 'Petrol', 'Diesel'],
    'owner': ['First', 'Second', 'First', 'Third', 'Second'],
    'buy': [0, 1, 1, 0, 1]  # Target variable
})

X = df.drop('buy', axis=1)
y = df['buy']

# Define columns
numeric_features = ['age', 'income']
categorical_features = ['fuel', 'owner']

# Define transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='first',
    sparse_output=False,  # or sparse=False for older versions
    # 'CNG' and 'Third' each occur only once, so the held-out row can contain a
    # category the training rows never showed. The default ('error') would make
    # predict() raise in that case; 'ignore' encodes it as all zeros instead,
    # i.e. the same encoding as the dropped baseline category.
    handle_unknown='ignore',
)

# ColumnTransformer: scale the numeric columns, one-hot encode the categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Build full pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train-test split (seeded, otherwise every run holds out a different row)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the pipeline: the preprocessor is fitted on the training rows only
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)


In [5]:
# Manually transform X to see what's happening behind the scenes.
# `preprocessor` is the very object stored inside `clf` (Pipeline does not copy
# its steps), so calling fit_transform on it directly would refit the trained
# pipeline's preprocessing on every row, test row included. Fit an unfitted
# clone instead and leave `clf` untouched.
preprocessor_demo = clone(preprocessor)
X_transformed = preprocessor_demo.fit_transform(X)
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5
0,-1.382872,-1.324367,0.0,1.0,0.0,0.0
1,-0.85678,-0.944426,1.0,0.0,1.0,0.0
2,0.270562,0.57534,0.0,0.0,0.0,0.0
3,0.571186,0.303953,0.0,1.0,0.0,1.0
4,1.397904,1.3895,1.0,0.0,1.0,0.0
