# Preprocessing & Feature Engineering

## Phase 2 — Building the ML Input Pipeline

**Objective:**
Transform raw data into a clean, consistent and leakage-free feature set
ready for machine learning models.

All transformations are designed to:
- Be learned only from the training data
- Be reproducible
- Preserve semantic meaning of features


In [6]:
import sys
import os

# Resolve the parent of the current working directory and register it on
# the import search path, skipping the append when it is already listed.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [7]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [8]:
# Locations of the raw CSV splits, relative to the working directory.
train_path, test_path = "../data/raw/train.csv", "../data/raw/test.csv"

# Read both splits into DataFrames: train first, then test.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [9]:
# Column names assigned to the numeric feature group.
numeric_features = [
    "LotArea", "GrLivArea",
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "GarageArea",
    "WoodDeckSF", "OpenPorchSF",
]

# Column names assigned to the ordinal feature group.
ordinal_features = [
    "OverallQual", "OverallCond",
    "ExterQual", "ExterCond",
    "KitchenQual", "HeatingQC",
    "GarageQual", "GarageCond",
]

# Column names assigned to the nominal (unordered categorical) feature group.
nominal_features = [
    "Neighborhood", "MSZoning",
    "BldgType", "HouseStyle",
    "RoofStyle", "Exterior1st",
]

# Column names assigned to the temporal feature group.
temporal_features = ["YearBuilt", "YearRemodAdd", "YrSold"]


In [10]:
from src.features import (
    NUMERIC_FEATURES,
    ORDINAL_FEATURES,
    NOMINAL_FEATURES,
    TEMPORAL_FEATURES
)
