## Imports

In [1]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

## Load and segregate data 

In [2]:
# Read train data and set index
train = pd.read_csv("../data/raw/train.csv")
train.set_index("listing_id", inplace=True)
train.drop_duplicates(inplace=True)

# Read test data and set index
test = pd.read_csv("../data/raw/test.csv")
test.set_index("listing_id", inplace=True)
test.drop_duplicates(inplace=True)

# Set default logging level
# Change to logging.INFO to see related output
logging.basicConfig(level=logging.INFO, force=True)

## Columns Assigned:

- [x] coe
- [x] dereg_value
- [x] omv
- [x] arf
- [x] opc_scheme
- [x] indicative_price
- [x] price
- [x] depreciation
- [x] eco_category
- [x] mileage
- [x] road_tax
- [x] lifespan
- [x] features
- [x] accessories

## Pre-defined Transformers

In [3]:
def get_make_from_title(make_list, title):
    title = title.split(" ")
    for i in range(len(title)):
        if " ".join(title[0 : i + 1]) in make_list:
            return " ".join(title[0 : i + 1])
    return "unknwon"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.make_list = []
        pass

    def fit(self, df):
        self.make_list = df.make.unique()
        return self

    def transform(self, df):
        df.loc[:, "reg_date"] = np.where(
            df["reg_date"].isnull(), df["original_reg_date"], df["reg_date"]
        )
        df.loc[:, "reg_date"] = pd.to_datetime(df.reg_date)
        df.loc[:, "reg_date_year"] = df.loc[:, "reg_date"].dt.year
        df.loc[:, "reg_date_month"] = (
            datetime.now() - df.loc[:, "reg_date"]
        ) / np.timedelta64(1, "M")
        df.loc[:, "no_of_owners"] = df["no_of_owners"].fillna(1)
        df.loc[:, "title"] = df["title"].str.lower()
        df.loc[:, "make"] = df.apply(
            lambda row: get_make_from_title(self.make_list, row["title"])
            if pd.isnull(row["make"])
            else row["make"],
            axis=1,
        )
        df.loc[:, "make_model"] = df.make + "-" + "df.model"
        return df

## Custom Transformer Definitions

In [4]:
# By observation of "coe_rebate", "dereg_value", "dereg_value_computed" for a few samples
DATASET_GENERATION_DATE = datetime(2021, 9, 14)
FIELDS_TO_DROP = ["indicative_price", "eco_category"]


class CoeTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing coe values as well as incorrect ones
    """

    def fit(self, X):
        years_with_missing_coe = X[X["coe"].isnull()].coe_start_year.unique()
        res_of_interest = X[X["coe_start_year"].isin(years_with_missing_coe)]
        self.mean_coe_per_year = res_of_interest.groupby("coe_start_year").agg(
            {"coe": np.mean}
        )
        return self

    def transform(self, X):
        # Fill missing coe values with mean coe for that registration year
        combined_x = X.join(
            self.mean_coe_per_year, on="coe_start_year", rsuffix="_mean"
        )
        combined_x["coe"].fillna(combined_x["coe_mean"], inplace=True)
        combined_x.drop("coe_mean", axis=1, inplace=True)

        # Replace incorrect coe values with mean coe for 2021
        # Example: https://www.sgcarmart.com/used_cars/info.php?ID=1017335
        combined_x.coe.replace(10.0, self.mean_coe_per_year.loc[2021].coe, inplace=True)

        return combined_x


class ArfTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing arf values based on its corresponding omv
    """

    DEDUCTION_AMOUNT_TO_RATE_TUPLE = [(20000, 1), (30000, 1.4), (0, 1.8)]

    @classmethod
    def compute_arf(cls, omv):
        """
        Given an omv, compute its corresponding ARF per https://www.sgcarmart.com/news/writeup.php?AID=13
        """
        arf = 0
        for amount_to_deduct, rate in cls.DEDUCTION_AMOUNT_TO_RATE_TUPLE:
            if omv >= amount_to_deduct and amount_to_deduct != 0:
                arf += rate * amount_to_deduct
            else:
                arf += rate * omv

            omv -= amount_to_deduct

            if omv <= 0:
                break

        return arf

    def fit(self, X):
        # Compute arf from omv for those records that have null arf
        rows_without_arf = X[X["arf"].isnull()]
        self.computed_arf = rows_without_arf["omv"].apply(self.compute_arf)
        self.computed_arf.rename("arf_computed", inplace=True)
        return self

    def transform(self, X):
        modified_x = X.join(self.computed_arf)
        modified_x["arf"].fillna(modified_x["arf_computed"], inplace=True)
        modified_x.drop("arf_computed", axis=1, inplace=True)

        if len(modified_x[modified_x.arf.isnull()]):
            logging.info(
                f"ArfTransformer - found {len(modified_x[modified_x.arf.isnull()])} rows with null arf"
            )
            modified_x = modified_x[~modified_x.arf.isnull()]

        return modified_x


class AgeFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "vehicle_age" as min("manufactured", "reg_date_year")
    """

    def fit(self, X):

        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["vehicle_age"] = datetime.now().year - np.min(
            X[["manufactured", "reg_date_year"]], axis=1
        )
        return modified_x


class ParfFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "parf" based on "vehicle_age" and "arf"
    """

    @classmethod
    def compute_parf(cls, row):
        """
        Compute parf from vehicle age and arf per https://www.sgcarmart.com/news/writeup.php?AID=13
        """
        parf = 0
        if not row["is_parf_car"]:
            return parf

        if row["vehicle_age"] <= 4:
            parf = row["arf"] * 0.75
        elif row["vehicle_age"] >= 5 and row["vehicle_age"] < 6:
            parf = row["arf"] * 0.70
        elif row["vehicle_age"] >= 6 and row["vehicle_age"] < 7:
            parf = row["arf"] * 0.65
        elif row["vehicle_age"] >= 7 and row["vehicle_age"] < 8:
            parf = row["arf"] * 0.60
        elif row["vehicle_age"] >= 8 and row["vehicle_age"] < 9:
            parf = row["arf"] * 0.55
        elif row["vehicle_age"] >= 9 and row["vehicle_age"] <= 10:
            parf = row["arf"] * 0.50

        return parf

    def fit(self, X):

        return self

    def transform(self, X):
        modified_x = X.copy()
        # If the car category does not contain "parf car" then it does not get any part rebate
        modified_x["is_parf_car"] = modified_x.category.apply(
            lambda value: 1 if "parf car" in value else 0
        )
        modified_x["parf"] = modified_x.apply(self.compute_parf, axis=1)
        return modified_x


class CoeStartDateFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column coe_start_date based on reg_date and coe
    For a few
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()

        coe_df = X[["reg_date", "original_reg_date", "coe", "dereg_value"]].copy()

        # Consider original_reg_date/reg_date as coe_start_date in general
        coe_df["coe_start_date"] = np.where(
            X["reg_date"].isnull(), X["original_reg_date"], X["reg_date"]
        )
        coe_df["coe_start_date"] = pd.to_datetime(coe_df["coe_start_date"])
        # Some rows have coe values as 10 - https://www.sgcarmart.com/used_cars/info.php?ID=1027957 (scraping error)
        # In such cases, consider DATASET_GENERATION_DATE as coe_start_date
        coe_df.loc[coe_df["coe"] == 10, "coe_start_date"] = DATASET_GENERATION_DATE

        # Compute coe_expiry date and months left
        coe_df["coe_expiry"] = coe_df.coe_start_date + np.timedelta64(10, "Y")
        coe_df["coe_expiry_months"] = (
            coe_df.coe_expiry - DATASET_GENERATION_DATE
        ) / np.timedelta64(1, "M")

        # If the coe expiry is in the past (incorrect), set it as 0
        coe_df.coe_expiry_months.clip(lower=0, inplace=True)

        # For rows with incorrect coe_start_date, compute it from dereg_value
        # These rows are not eligible for parf so it can be assumed that dereg_value == coe_rebate for such rows
        # cleaned_df[cleaned_df.coe_expiry_months == 0][cleaned_df.dereg_value == cleaned_df.coe_rebate]
        filter_mask = (coe_df.coe_expiry_months == 0) & (~coe_df.dereg_value.isnull())
        filtered_df = coe_df[filter_mask].copy()

        filtered_df["coe_expiry_months_computed"] = (
            filtered_df.dereg_value * 120
        ) / filtered_df.coe
        filtered_df[
            "coe_expiry_standardized"
        ] = filtered_df.coe_expiry_months_computed.apply(
            lambda value: np.timedelta64(int(value), "M")
        )
        filtered_df["coe_start_date_computed"] = (
            filtered_df["coe_expiry_standardized"] + DATASET_GENERATION_DATE
        ) - np.timedelta64(10, "Y")

        coe_df.loc[filter_mask, "coe_start_date"] = filtered_df[
            "coe_start_date_computed"
        ]
        coe_df.loc[filter_mask, "coe_expiry_months"] = filtered_df[
            "coe_expiry_months_computed"
        ]

        modified_x["coe_start_date"] = coe_df["coe_start_date"]
        modified_x["coe_start_year"] = coe_df["coe_start_date"].dt.year
        modified_x["coe_expiry_months"] = coe_df["coe_expiry_months"]
        return modified_x


class CoeRebateFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "coe_rebate" based on "parf", "coe", and "reg_date"
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["coe_rebate"] = (modified_x.coe * modified_x.coe_expiry_months) / 120

        # If the computed coe_rebate is 0 (those records for which the coe_start_date is incorrect),
        # use dereg_value as coe_rebate.
        # r.loc[(r.coe_rebate == 0) & (r.dereg_value != 0)]
        # All of these records are for cars that are older than 10 years (no ARF)
        # and so this should be completely safe
        #
        # NOTE: One exception is https://www.sgcarmart.com/used_cars/info.php?ID=1029135 where the
        # coe > coe_rebate - this needs further investigation
        modified_x["coe_rebate"] = np.where(
            (modified_x["coe_rebate"] == 0) & (modified_x["dereg_value"] != 0),
            modified_x["dereg_value"],
            modified_x["coe_rebate"],
        )

        return modified_x


class DeregValueComputedFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a column dereg_value_computed based on coe_rebate and parf
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["dereg_value_computed"] = X["coe_rebate"] + X["parf"]
        return modified_x


class DeregValueTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing dereg_value values based on its corresponding dereg_value_computed
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["dereg_value"] = np.where(
            X["dereg_value"].isnull(), X["dereg_value_computed"], X["dereg_value"],
        )

        # Drop rows for which it is not possible to compute dereg_value
        # This amounts to 228 rows. Example: https://www.sgcarmart.com/used_cars/info.php?ID=1031249
        # The above is due to error in scraping script which outside the scope of our problem
        #         logging.info(f"DeregValueTransformer - dropping {len(modified_x[modified_x["dereg_value"].isnull()])} for which dereg_value cannot be computed")
        modified_x = modified_x.loc[~modified_x["dereg_value"].isnull()]
        return modified_x


class DepreciationTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing depreciation values based on its corresponding price and parf
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        depreciation_mask = X.depreciation.isnull()
        # Ideally, this should be (price - parf) / no_of_coe_years_left but this formula gives
        # depreciation which are vastly different to the ones in the given dataset - because of incorrect coe_start_date
        # which in turn is due to a scraping error in dataset generation
        #         modified_x.loc[depreciation_mask, "depreciation"] = (
        #             X.loc[depreciation_mask, "price"] - X.loc[depreciation_mask, "parf"]
        #         ) / 10
        if len(modified_x[depreciation_mask]):
            logging.info(
                f"DepreciationTransformer - found {len(modified_x[depreciation_mask])} rows with null depreciation"
            )
            modified_x = modified_x[~depreciation_mask]

        return modified_x


class OpcSchemeTransformer(BaseEstimator, TransformerMixin):
    """
    Standardizes and imputes opc_scheme values
    """

    REVISED_OPC = "revised_opc"
    NORMAL_OPC = "normal_opc"
    OLD_OPC = "old_opc"

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        opc_scheme = modified_x["opc_scheme"]

        opc_scheme.replace(
            "revised opc scheme . learn more about opc schemes.",
            self.REVISED_OPC,
            inplace=True,
        )

        # Only record with this value - https://www.sgcarmart.com/used_cars/info.php?ID=989043
        opc_scheme.replace("1100", self.REVISED_OPC, inplace=True)

        opc_scheme.fillna(self.NORMAL_OPC, inplace=True)

        opc_scheme.replace(
            "old opc scheme . learn more about opc schemes.", self.OLD_OPC, inplace=True
        )
        modified_x["opc_scheme"] = opc_scheme
        return modified_x


class LifespanRestrictionFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new feature lifespan_restriction based on lifespan value
    1  - no restruction
    -1 - restriction applies
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["lifespan_restriction"] = -1
        modified_x.loc[modified_x["lifespan"].isnull(), "lifespan_restriction"] = 1

        return modified_x


class CountUniqueItemsFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Creates a new feature column that reflects the number of unique items in a
    string column that is separated by given separator
    """

    def __init__(self, feature, new_feature_name, separator=","):
        super(CountUniqueItemsFeatureCreator, self).__init__()
        self.feature = feature
        self.new_feature_name = new_feature_name
        self.separator = separator

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        new_feature = pd.Series(np.zeros(len(X)), index=X.index, dtype=np.int16)
        new_feature.loc[~X[self.feature].isnull()] = X[~X[self.feature].isnull()][
            self.feature
        ].apply(lambda value: len(value.split(self.separator)))
        modified_x[self.new_feature_name] = new_feature
        return modified_x


class HierarchicalGroupImputer(BaseEstimator, TransformerMixin):
    """
    For missing values in the given feature, this imputer tries filling such values with the agg value
    derived from each group and as fallback uses the entire feature columns' agg value
    
    If fallback is True, for records that cannot be filled with agg value of any of the groups provided,
    it will be filled with the agg value of the feature column
    """

    RSUFFIX = "_computed"

    def __init__(self, feature, groups, agg_type, fallback=True):
        super(HierarchicalGroupImputer, self).__init__()
        self.feature = feature
        self.groups = groups
        # Order groups from most-specific to least-specific
        self.groups.sort(key=len, reverse=True)
        self.agg_type = agg_type
        self.fallback = fallback

    def fit(self, X):
        self.agg_results = {}
        for group in self.groups:
            self.agg_results[tuple(group)] = X.groupby(group).agg(
                {self.feature: self.agg_type}
            )

        # If there are still empty values after the above, fill those values with the
        # column-level agg value
        if self.fallback:
            self.feature_agg = X[self.feature].agg(self.agg_type)

        return self

    def transform(self, X):

        # If there are no empty records, return
        if not len(X[X[self.feature].isnull()]):
            logging.info(
                f"HierarchicalGroupImputer - found null values to impute for {self.feature}"
            )
            return X

        modified_x = X.copy()
        logging.info(
            f"HierarchicalGroupImputer - found {(len(modified_x[modified_x[self.feature].isnull()]))} null values to impute for {self.feature}"
        )
        for group in self.groups:
            feature_computed = modified_x.join(
                self.agg_results[tuple(group)], on=group, rsuffix=self.RSUFFIX
            )[self.feature + self.RSUFFIX]
            modified_x[self.feature] = np.where(
                modified_x[self.feature].isnull(),
                feature_computed,
                modified_x[self.feature],
            )

            logging.info(
                f"HierarchicalGroupImputer - {(len(modified_x[modified_x[self.feature].isnull()]))} null values left to impute for {self.feature}"
            )
            if not len(modified_x[modified_x[self.feature].isnull()]):
                break

        # If there are still empty values after the above, fill those values with the
        # column-level agg value
        if len(modified_x[modified_x[self.feature].isnull()][self.feature]):

            if self.fallback:
                modified_x.loc[
                    modified_x[self.feature].isnull(), self.feature
                ] = self.feature_agg
            else:
                null_records = modified_x[self.feature].isnull()

                logging.info(
                    f"HierarchicalGroupImputer - Dropping {len(modified_x[null_records])} rows with null {self.feature} values"
                )

                modified_x = modified_x[~null_records]

        return modified_x


pipeline_for_columns_15_and_above = Pipeline(
    steps=[
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
        ("vehicle_age", AgeFeatureCreator()),
        (
            "omv",
            HierarchicalGroupImputer(
                "omv",
                [["make", "model", "vehicle_age"], ["make", "model"]],
                "mean",
                True,
            ),
        ),
        ("coe", CoeTransformer()),
        ("arf", ArfTransformer()),
        ("parf", ParfFeatureCreator()),
        ("coe_rebate", CoeRebateFeatureCreator()),
        ("dereg_value_computed", DeregValueComputedFeatureCreator()),
        ("dereg_value", DeregValueTransformer()),
        ("depreciation", DepreciationTransformer()),
        (
            "mileage",
            HierarchicalGroupImputer(
                "mileage",
                [["make", "model", "vehicle_age"], ["vehicle_age"]],
                "mean",
                True,
            ),
        ),
        ("opc_scheme", OpcSchemeTransformer()),
        # TODO: transform engine_cap as ranges as defined in https://www.sgcarmart.com/services/roadtax_calculator.php
        # This will further improve the accuracy of imputed road_tax values
        (
            "road_tax",
            HierarchicalGroupImputer(
                "road_tax",
                [
                    ["engine_cap", "opc_scheme", "vehicle_age", "fuel_type"],
                    ["engine_cap", "opc_scheme"],
                    ["opc_scheme"],
                ],
                "mean",
                True,
            ),
        ),
        ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
        (
            "features_count",
            CountUniqueItemsFeatureCreator("features", "features_count"),
        ),
        (
            "accessories_count",
            CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
        ),
    ]
)

In [7]:
cleaned_train = pipeline_for_columns_15_and_above.fit_transform(train)
cleaned_train.head()

INFO:root:HierarchicalGroupImputer - found 41 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 34 null values left to impute for omv
INFO:root:HierarchicalGroupImputer - 12 null values left to impute for omv
INFO:root:DepreciationTransformer - found 411 rows with null depreciation
INFO:root:HierarchicalGroupImputer - found 3508 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 539 null values left to impute for mileage
INFO:root:HierarchicalGroupImputer - 15 null values left to impute for mileage
INFO:root:HierarchicalGroupImputer - found 2047 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 1748 null values left to impute for road_tax
INFO:root:HierarchicalGroupImputer - 754 null values left to impute for road_tax
INFO:root:HierarchicalGroupImputer - 0 null values left to impute for road_tax


Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_year,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365241,rover 216 cabriolet (coe till 04/2029),rover,216,new 10 years coe! affordable rare beautiful ca...,1993.0,,1993-06-22,sports car,coe car,auto,...,2019,92.217765,28.0,0,0.0,20115.0,20115.0,1,4,0
397461,audi tt roadster 1.8a tfsi s-tronic (new 10-yr...,audi,tt,138000,,,2013-01-09,sports car,"coe car, low mileage car",auto,...,2013,15.850017,8.0,0,0.0,8005.859045,8005.859045,1,2,6
540570,aston martin db9 cabriolet 6.0a (coe till 04/2...,aston martin,db9,new number plate! low mileage! very well maint...,2009.0,01-feb-2009,2011-01-27,sports car,"imported used vehicle, coe car, low mileage car",auto,...,2019,92.218802,12.0,0,0.0,27213.0,27213.0,1,4,5
592916,isuzu cyz52r (coe till 07/2027),isuzu,cyz52r,28 tons lorry with palfinger pk18500 crane.,2007.0,,2007-11-12,truck,coe car,manual,...,2017,71.211036,14.0,0,0.0,20218.0,20218.0,-1,1,0
642522,mitsubishi fuso fighter fk61 (coe till 12/2029),mitsubishi,fuso,"very low mileage of 93,175 km done only. 6 ton...",2009.0,,2009-12-04,truck,"coe car, low mileage car",manual,...,2019,99.351761,12.0,0,0.0,21176.0,21176.0,-1,0,0


In [8]:
cleaned_test = pipeline_for_columns_15_and_above.transform(test)
cleaned_test.head()

INFO:root:HierarchicalGroupImputer - found 12 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 10 null values left to impute for omv
INFO:root:HierarchicalGroupImputer - 2 null values left to impute for omv
INFO:root:ArfTransformer - found 87 rows with null arf
INFO:root:DepreciationTransformer - found 63 rows with null depreciation
INFO:root:HierarchicalGroupImputer - found 1059 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 132 null values left to impute for mileage
INFO:root:HierarchicalGroupImputer - 0 null values left to impute for mileage
INFO:root:HierarchicalGroupImputer - found 576 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 478 null values left to impute for road_tax
INFO:root:HierarchicalGroupImputer - 196 null values left to impute for road_tax
INFO:root:HierarchicalGroupImputer - 0 null values left to impute for road_tax


Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_year,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
767227,bmw 5 series 528i se,bmw,528i,extended warranty by mbm wheelpower pte ltd un...,2013.0,,2014-06-19,luxury sedan,"parf car, consignment car",auto,...,2014,33.131686,8.0,1,37576.55,18582.458558,56159.008558,1,3,6
792071,mercedes-benz 200e (coe till 03/2029),mercedes-benz,200,fully restored. all done. excellence condition...,1991.0,,1991-04-30,luxury sedan,coe car,auto,...,2019,91.233872,30.0,0,0.0,25103.0,25103.0,1,0,0
806600,bentley flying spur 6.0a,bentley,flying,,2013.0,,2014-03-07,luxury sedan,"parf car, rare & exotic, premium ad car",auto,...,2014,29.714779,8.0,1,209016.5,18646.023532,227662.523532,1,6,6
811107,honda civic crx (coe till 04/2029),honda,civic,super best deal. choose your favourite. new pa...,1992.0,,1993-01-04,sports car,coe car,manual,...,2019,92.217765,29.0,0,0.0,20115.0,20115.0,1,1,1
827494,lexus rx turbo rx200t luxury,lexus,rx,buy now! established since 1981. good tyre tre...,2016.0,,2016-12-28,suv,parf car,auto,...,2016,63.456744,5.0,1,51240.0,29031.989411,80271.989411,1,7,7


## Summary
The above pipeline removes 269 records due to 2 issues
1. Missing omv
2. Missing dereg_value

Detailed description for each of the above is available in their corresponding transformers