## Imports

In [1]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

## Load and segregate data 

In [2]:
# Read train data and set index
train = pd.read_csv("../data/raw/train.csv")
train.set_index("listing_id", inplace=True)
train.drop_duplicates(inplace=True)

# Separate x and y columns
x_cols = list(train.columns)
x_cols.remove("price")
train_features = train[x_cols]
train_label = pd.DataFrame(train.price)

# Set default logging level
# Change to logging.INFO to see related output
logging.basicConfig(level=logging.WARN, force=True)

## Columns Assigned:

- [x] coe
- [x] dereg_value
- [x] omv
- [x] arf
- [x] opc_scheme
- [x] indicative_price
- [x] price
- [ ] depreciation
- [ ] road_tax
- [ ] mileage
- [ ] lifespan
- [ ] eco_category
- [ ] features
- [ ] accessories

## Pre-defined Transformers

In [3]:
def get_make_from_title(make_list, title):
    title = title.split(" ")
    for i in range(len(title)):
        if " ".join(title[0 : i + 1]) in make_list:
            return " ".join(title[0 : i + 1])
    return "unknwon"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.make_list = []
        pass

    def fit(self, df):
        self.make_list = df.make.unique()
        return self

    def transform(self, df):
        df.loc[:, "reg_date"] = np.where(
            df["reg_date"].isnull(), df["original_reg_date"], df["reg_date"]
        )
        df.loc[:, "reg_date"] = pd.to_datetime(df.reg_date)
        df.loc[:, "reg_date_year"] = df.loc[:, "reg_date"].dt.year
        df.loc[:, "reg_date_month"] = (
            datetime.now() - df.loc[:, "reg_date"]
        ) / np.timedelta64(1, "M")
        df.loc[:, "no_of_owners"] = df["no_of_owners"].fillna(1)
        df.loc[:, "title"] = df["title"].str.lower()
        df.loc[:, "make"] = df.apply(
            lambda row: get_make_from_title(self.make_list, row["title"])
            if pd.isnull(row["make"])
            else row["make"],
            axis=1,
        )
        df.loc[:, "make_model"] = df.make + "-" + "df.model"
        return df

## Custom Transformer Definitions

In [4]:
# By observation of "coe_rebate", "dereg_value", "dereg_value_computed" for a few samples
DATASET_GENERATION_DATE = datetime(2021, 9, 14)


class OmvTransformer(BaseEstimator, TransformerMixin):
    """
    Remove records that do not have OMV - other features such as arf, parf, depreciation, etc,
    depend on OMV and so those records with no OMV add no value to our training data
    
    We drop the records as there is only 41 such rows. Alternatively, we can try to see if there
    are other similar vehicles (with make, model, manufacturing year) and compute the OMV from
    those records
    """

    def fit(self, X):
        return self

    def transform(self, X):
        transformed_x = X.copy()
        null_omvs = transformed_x["omv"].isnull()

        logging.info(f"Found {len(null_omvs)} rows with null OMV values")

        transformed_x = transformed_x[~null_omvs]
        return transformed_x


class CoeTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing coe values as well as incorrect ones
    """

    def fit(self, X):
        years_with_missing_coe = X[X["coe"].isnull()].coe_start_year.unique()
        res_of_interest = X[X["coe_start_year"].isin(years_with_missing_coe)]
        self.mean_coe_per_year = res_of_interest.groupby("coe_start_year").agg(
            {"coe": np.mean}
        )
        return self

    def transform(self, X):
        # Fill missing coe values with mean coe for that registration year
        combined_x = X.join(
            self.mean_coe_per_year, on="coe_start_year", rsuffix="_mean"
        )
        combined_x["coe"].fillna(combined_x["coe_mean"], inplace=True)
        combined_x.drop("coe_mean", axis=1, inplace=True)

        # Replace incorrect coe values with mean coe for 2021
        # Example: https://www.sgcarmart.com/used_cars/info.php?ID=1017335
        combined_x.coe.replace(10.0, self.mean_coe_per_year.loc[2021].coe, inplace=True)

        return combined_x


class ArfTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing arf values based on its corresponding omv
    """

    DEDUCTION_AMOUNT_TO_RATE_TUPLE = [(20000, 1), (30000, 1.4), (0, 1.8)]

    @classmethod
    def compute_arf(cls, omv):
        """
        Given an omv, compute its corresponding ARF per https://www.sgcarmart.com/news/writeup.php?AID=13
        """
        arf = 0
        for amount_to_deduct, rate in cls.DEDUCTION_AMOUNT_TO_RATE_TUPLE:
            if omv >= amount_to_deduct and amount_to_deduct != 0:
                arf += rate * amount_to_deduct
            else:
                arf += rate * omv

            omv -= amount_to_deduct

            if omv <= 0:
                break

        return arf

    def fit(self, X):
        # Compute arf from omv for those records that have null arf
        rows_without_arf = X[X["arf"].isnull()]
        self.computed_arf = rows_without_arf["omv"].apply(self.compute_arf)
        self.computed_arf.rename("arf_computed", inplace=True)
        return self

    def transform(self, X):
        modified_x = X.join(self.computed_arf)
        modified_x["arf"].fillna(modified_x["arf_computed"], inplace=True)
        modified_x.drop("arf_computed", axis=1, inplace=True)

        return modified_x


class AgeFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "vehicle_age" as min("manufactured", "reg_date_year")
    """

    def fit(self, X):
        self.vehicle_age = datetime.now().year - np.min(
            X[["manufactured", "reg_date_year"]], axis=1
        )
        self.vehicle_age = self.vehicle_age
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["vehicle_age"] = self.vehicle_age
        return modified_x


class ParfFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "parf" based on "vehicle_age" and "arf"
    """

    @classmethod
    def compute_parf(cls, row):
        """
        Compute parf from vehicle age and arf per https://www.sgcarmart.com/news/writeup.php?AID=13
        """
        parf = 0
        if not row["is_parf_car"]:
            return parf

        if row["vehicle_age"] <= 4:
            parf = row["arf"] * 0.75
        elif row["vehicle_age"] >= 5 and row["vehicle_age"] < 6:
            parf = row["arf"] * 0.70
        elif row["vehicle_age"] >= 6 and row["vehicle_age"] < 7:
            parf = row["arf"] * 0.65
        elif row["vehicle_age"] >= 7 and row["vehicle_age"] < 8:
            parf = row["arf"] * 0.60
        elif row["vehicle_age"] >= 8 and row["vehicle_age"] < 9:
            parf = row["arf"] * 0.55
        elif row["vehicle_age"] >= 9 and row["vehicle_age"] <= 10:
            parf = row["arf"] * 0.50

        return parf

    def fit(self, X):
        self.parf = X.copy()
        # If the car category does not contain "parf car" then it does not get any part rebate
        self.parf["is_parf_car"] = self.parf.category.apply(
            lambda value: 1 if "parf car" in value else 0
        )
        self.parf = self.parf.apply(self.compute_parf, axis=1)
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["parf"] = self.parf
        return modified_x


class CoeRebateFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column "coe_rebate" based on "parf", "coe", and "reg_date"
    """

    def fit(self, X):
        self.x = X.copy()
        self.x["coe_expiry"] = self.x.coe_start_date + np.timedelta64(10, "Y")

        # datetime(2021, 9, 14) is used because the dataset seems to have been generated then
        self.x["coe_expiry_months"] = (
            self.x.coe_expiry - datetime(2021, 9, 14)
        ) / np.timedelta64(1, "M")

        self.x.coe_expiry_months.clip(lower=0, inplace=True)
        self.x["coe_rebate"] = (self.x.coe * self.x.coe_expiry_months) / 120

        # If the computed coe_rebate is 0 (those records for which the coe_start_date is incorrect),
        # use dereg_value as coe_rebate.
        # r.loc[(r.coe_rebate == 0) & (r.dereg_value != 0)]
        # All of these records are for cars that are older than 10 years (no ARF)
        # and so this should be completely safe
        #
        # NOTE: One exception is https://www.sgcarmart.com/used_cars/info.php?ID=1029135 where the
        # coe > coe_rebate - this needs further investigation
        self.x["coe_rebate"] = np.where(
            (self.x["coe_rebate"] == 0) & (self.x["dereg_value"] != 0),
            self.x["dereg_value"],
            self.x["coe_rebate"],
        )

        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["coe_rebate"] = self.x["coe_rebate"]
        modified_x["coe_expiry_months"] = self.x["coe_expiry_months"]

        return modified_x


class CoeStartDateFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a new column coe_start_date based on reg_date and coe
    For a few
    """

    def fit(self, X):
        self.coe_start_date_series = X[["reg_date", "original_reg_date", "coe"]].copy()
        self.coe_start_date_series["reg_date"] = np.where(
            X["reg_date"].isnull(), X["original_reg_date"], X["reg_date"]
        )
        self.coe_start_date_series["reg_date"] = pd.to_datetime(
            self.coe_start_date_series["reg_date"]
        )
        self.coe_start_date_series.loc[
            self.coe_start_date_series["coe"] == 10, "reg_date"
        ] = DATASET_GENERATION_DATE
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["coe_start_date"] = self.coe_start_date_series["reg_date"]
        modified_x["coe_start_year"] = self.coe_start_date_series["reg_date"].dt.year
        return modified_x


class DeregValueComputedFeatureCreator(BaseEstimator, TransformerMixin):
    """
    Adds a column dereg_value_computed based on coe_rebate and parf
    """

    def fit(self, X):
        self.dereg_value = X["coe_rebate"] + X["parf"]
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["dereg_value_computed"] = self.dereg_value
        return modified_x


class DeregValueTransformer(BaseEstimator, TransformerMixin):
    """
    Imputes missing dereg_value values based on its corresponding dereg_value_computed
    """

    def fit(self, X):
        return self

    def transform(self, X):
        modified_x = X.copy()
        modified_x["dereg_value"] = np.where(
            X["dereg_value"].isnull(), X["dereg_value_computed"], X["dereg_value"],
        )
        return modified_x


class Dummy(BaseEstimator, TransformerMixin):
    """
    Imputes missing arf values based on its corresponding omv
    """

    @classmethod
    def compute_arf(cls, omv):
        """
        Given an omv, compute its corresponding ARF per https://www.sgcarmart.com/news/writeup.php?AID=13
        """
        pass

    def fit(self, X):

        return self

    def transform(self, X):
        modified_x = X.copy()

        return modified_x


pipeline_for_columns_15_and_above = Pipeline(
    steps=[
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
        ("omv", OmvTransformer()),
        ("coe", CoeTransformer()),
        ("arf", ArfTransformer()),
        ("age", AgeFeatureCreator()),
        ("parf", ParfFeatureCreator()),
        ("coe_rebate", CoeRebateFeatureCreator()),
        ("dereg_value_computed", DeregValueComputedFeatureCreator()),
        ("dereg_value", DeregValueTransformer()),
    ]
)

In [5]:
cleaned_df = pipeline_for_columns_15_and_above.fit_transform(train)
cleaned_df.head()

Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,reg_date_year,reg_date_month,make_model,coe_start_date,coe_start_year,vehicle_age,parf,coe_rebate,coe_expiry_months,dereg_value_computed
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365241,rover 216 cabriolet (coe till 04/2029),rover,216,new 10 years coe! affordable rare beautiful ca...,1993.0,,1993-06-22,sports car,coe car,auto,...,1993,339.587567,rover-df.model,1993-06-22,1993,28.0,0.0,20115.0,0.0,20115.0
397461,audi tt roadster 1.8a tfsi s-tronic (new 10-yr...,audi,tt,138000,,,2013-01-09,sports car,"coe car, low mileage car",auto,...,2013,104.97084,audi-df.model,2013-01-09,2013,8.0,0.0,10033.935576,15.850017,10033.935576
540570,aston martin db9 cabriolet 6.0a (coe till 04/2...,aston martin,db9,new number plate! low mileage! very well maint...,2009.0,01-feb-2009,2011-01-27,sports car,"imported used vehicle, coe car, low mileage car",auto,...,2011,128.396373,aston martin-df.model,2011-01-27,2011,12.0,0.0,27213.0,0.0,27213.0
592916,isuzu cyz52r (coe till 07/2027),isuzu,cyz52r,28 tons lorry with palfinger pk18500 crane.,2007.0,,2007-11-12,truck,coe car,manual,...,2007,166.902297,isuzu-df.model,2007-11-12,2007,14.0,0.0,20218.0,0.0,20218.0
642522,mitsubishi fuso fighter fk61 (coe till 12/2029),mitsubishi,fuso,"very low mileage of 93,175 km done only. 6 ton...",2009.0,,2009-12-04,truck,"coe car, low mileage car",manual,...,2009,142.162569,mitsubishi-df.model,2009-12-04,2009,12.0,0.0,21176.0,0.0,21176.0
