
# Unified Preprocessing Pipeline

This notebook integrates the preprocessing steps implemented by individual team members into a single, reusable pipeline. Each transformation is encapsulated in a custom transformer class or scikit‑learn component so that it can be applied consistently during training and inference. The pipeline performs the following operations:

1. **Initial cleaning** – drop the `Unnamed: 0` column.
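2. **Missing value imputation** – fill missing numeric values with the median and missing categorical values with the most frequent category. In the code this runs inside the final `ColumnTransformer`, i.e. after numeric parsing and name grouping.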
3. **Numeric parsing** – extract numeric values from `Mileage`, `Engine`, `Power` and `New_Price` and convert them to `Mileage_Num`, `Engine_CC`, `Power_BHP` and `New_Price_Num`.
4. **Name grouping** – keep the 50 most frequent car names and collapse all other names into a single `Other` category (stored in `Name_Grouped`).
5. **Encoding** – one‑hot encode the grouped `Name` and other categorical features: `Location`, `Fuel_Type`, `Transmission`, and `Owner_Type`.
6. **Scaling** – standardize numeric features so they have zero mean and unit variance.

> **Note:** Outlier removal should be applied manually to the training set before fitting the pipeline.  Removing observations inside a transformer is not advised because it may misalign feature matrices with target labels.


In [None]:

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import re

# Custom transformers
class InitialCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips the ``'Unnamed: 0'`` column.

    Nothing is learned during ``fit``; ``transform`` always works on a copy
    so the caller's DataFrame is never modified.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for pipeline compatibility)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without ``'Unnamed: 0'``, if that column exists."""
        cleaned = X.copy()
        # errors='ignore' turns the drop into a no-op when the column is absent.
        return cleaned.drop(columns=['Unnamed: 0'], errors='ignore')

class NumericParser(BaseEstimator, TransformerMixin):
    """Derive numeric columns from text columns that embed a number.

    ``transform`` adds ``Mileage_Num``, ``Engine_CC``, ``Power_BHP`` and
    ``New_Price_Num`` next to the source columns ``Mileage``, ``Engine``,
    ``Power`` and ``New_Price``; the source columns are left in place.
    """

    # First unsigned integer/decimal literal found in a string.
    _NUMBER_PATTERN = re.compile(r"([0-9]*\.?[0-9]+)")

    @classmethod
    def _first_number(cls, raw):
        """Return the first number found in ``raw`` as a float, else NaN."""
        if pd.isnull(raw):
            return np.nan
        found = cls._NUMBER_PATTERN.search(str(raw))
        if found is None:
            return np.nan
        return float(found.group(1))

    @classmethod
    def _price_in_lakhs(cls, raw):
        """Like ``_first_number``, but multiply by 100 when the text mentions 'cr'."""
        amount = cls._first_number(raw)
        if np.isnan(amount):
            return np.nan
        # A case-insensitive 'cr' marks a value quoted in crores; convert to lakhs.
        in_crores = 'cr' in str(raw).lower()
        return amount * 100 if in_crores else amount

    def fit(self, X, y=None):
        """Do nothing and return ``self``; parsing needs no fitted state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four parsed numeric columns appended."""
        parsed = X.copy()
        plain_sources = {
            'Mileage_Num': 'Mileage',
            'Engine_CC': 'Engine',
            'Power_BHP': 'Power',
        }
        for target, source in plain_sources.items():
            parsed[target] = parsed[source].apply(self._first_number)
        parsed['New_Price_Num'] = parsed['New_Price'].apply(self._price_in_lakhs)
        return parsed

class NameGrouper(BaseEstimator, TransformerMixin):
    """Keep the ``top_n`` most frequent ``Name`` values; map the rest to 'Other'.

    Parameters
    ----------
    top_n : int, default=50
        Number of most frequent names (as counted during ``fit``) to keep.

    Attributes
    ----------
    top_names_ : list
        Names retained by ``fit``. Only exists after ``fit`` has been called.
    """

    def __init__(self, top_n=50):
        # Learned state is deliberately NOT initialised here: scikit-learn
        # treats attributes with a trailing underscore as evidence that the
        # estimator has been fitted, so they must only be set in ``fit``.
        self.top_n = top_n

    def fit(self, X, y=None):
        """Record the ``top_n`` most frequent values of ``X['Name']``."""
        name_counts = X['Name'].value_counts()
        self.top_names_ = name_counts.nlargest(self.top_n).index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``Name_Grouped`` column added.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Local import keeps this class self-contained within the file.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'top_names_')
        X = X.copy()
        # Cast to object so 'Other' can always be written, whatever the dtype.
        names = X['Name'].astype(object)
        # Names not seen among the top names (including missing ones) become 'Other'.
        X['Name_Grouped'] = names.where(names.isin(self.top_names_), 'Other')
        return X

# Function to remove outliers (to be applied on training data only)
def remove_outliers(df, columns, factor=1.5):
    """Return a copy of ``df`` restricted to rows inside the IQR fences.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns. A row is
    kept when its value lies in ``[Q1 - factor*IQR, Q3 + factor*IQR]``
    (inclusive); rows whose value is missing fail the comparison and are
    dropped as well. The input frame is not modified.
    """
    kept = df.copy()
    for name in columns:
        q_low, q_high = (kept[name].quantile(q) for q in (0.25, 0.75))
        spread = q_high - q_low
        within_fences = kept[name].between(q_low - factor * spread,
                                           q_high + factor * spread)
        kept = kept[within_fences]
    return kept

# Load raw data
train_df = pd.read_csv('../data/train-data.csv', index_col=False)
test_df = pd.read_csv('../data/test-data.csv', index_col=False)

# Apply outlier removal on training data before pipeline fitting.
# Rows are dropped here, outside the pipeline, so features and target stay aligned.
outlier_columns = ['Kilometers_Driven', 'Year', 'Price']
train_df_no_outliers = remove_outliers(train_df, outlier_columns)
print(f"Training size before outlier removal: {train_df.shape[0]}, after: {train_df_no_outliers.shape[0]}")

# Define categorical and numeric feature lists (after parsing).
# These are the columns produced by NumericParser / NameGrouper plus the
# raw columns that are used as-is.
categorical_features = ['Name_Grouped', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']
numeric_features = ['Year', 'Kilometers_Driven', 'Mileage_Num', 'Engine_CC', 'Power_BHP', 'Seats', 'New_Price_Num']

# Numeric features: median imputation followed by standardisation
numeric_preprocess = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: most-frequent imputation followed by one-hot encoding.
# handle_unknown='ignore' lets transform() accept categories not seen in fit().
categorical_preprocess = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Combine into a ColumnTransformer; columns not listed above are dropped
preprocessor = ColumnTransformer([
    ('num', numeric_preprocess, numeric_features),
    ('cat', categorical_preprocess, categorical_features),
])

# Create full pipeline: cleaning -> parsing -> name grouping -> impute/encode/scale
full_pipeline = Pipeline([
    ('initial_cleaner', InitialCleaner()),
    ('numeric_parser', NumericParser()),
    ('name_grouper', NameGrouper(top_n=50)),
    ('preprocessor', preprocessor),
])

# Separate features and target on the cleaned training data.
# y_train is not consumed by the preprocessing pipeline in this cell.
X_train = train_df_no_outliers.drop(columns=['Price'])
y_train = train_df_no_outliers['Price']

# Fit and transform the training set in a single pass (avoids running every
# step over the training data twice), then apply the fitted pipeline to the
# test set.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(test_df)

print(f"Prepared training feature matrix shape: {X_train_prepared.shape}")
print(f"Prepared test feature matrix shape: {X_test_prepared.shape}")
