## Imports

In [None]:
import sys

sys.path.append("..")

In [None]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from src.transformers import *

## Load and segregate data 

In [None]:
# Training data: keep only the first row seen for each listing_id, then turn
# listing_id back into a regular column with a fresh positional index.
train = pd.read_csv("../data/raw/train.csv").set_index("listing_id")
train = train.loc[~train.index.duplicated(keep="first")].reset_index()

# Test data is loaded untouched.
# NOTE: duplicates are deliberately NOT removed from test, because Kaggle
# expects 5000 entries in the submission file.
test = pd.read_csv("../data/raw/test.csv")

# Configure the root logger at INFO level; force=True replaces any handlers
# that were already installed (e.g. by an earlier run of this cell).
logging.basicConfig(level=logging.INFO, force=True)

In [None]:
# Show the training rows that have no depreciation value.
train.loc[train["depreciation"].isna()]

## Columns Assigned

- [x] coe
- [x] dereg_value
- [x] omv
- [x] arf
- [x] opc_scheme
- [x] indicative_price
- [x] price
- [x] depreciation
- [x] eco_category
- [x] mileage
- [x] road_tax
- [x] lifespan
- [x] features
- [x] accessories

## Pre-defined Transformers

In [None]:
def get_make_from_title(make_list, title):
    """Return the shortest leading word sequence of ``title`` found in ``make_list``.

    Prefixes are tried from one word upward ("land", then "land rover", ...),
    so the first (shortest) prefix contained in ``make_list`` wins.

    Args:
        make_list: Collection of known make names; it only needs to support
            the ``in`` operator.
        title: Listing title whose words are separated by single spaces.

    Returns:
        The matching make, or ``"unknown"`` when no prefix matches or when
        ``title`` is not a string (e.g. NaN for a missing title).
    """
    # A missing title arrives as a float NaN, which has no .split().
    if not isinstance(title, str):
        return "unknown"

    words = title.split(" ")
    for end in range(1, len(words) + 1):
        candidate = " ".join(words[:end])
        if candidate in make_list:
            return candidate
    return "unknown"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    """Shared clean-up step that runs before the column-specific transformers.

    ``fit`` records the makes present in the fitting data. ``transform`` then
    fills ``reg_date``, ``no_of_owners`` and ``make``, lower-cases ``title``
    and derives ``reg_date_year``, ``reg_date_month`` and ``make_model``.
    """

    # Average Gregorian month (365.2425 days / 12 = 2,629,746 s). This is the
    # length older pandas used for np.timedelta64(1, "M"); pandas 2.x rejects
    # that unit as ambiguous, so the duration is spelled out explicitly.
    _AVERAGE_MONTH = pd.Timedelta(seconds=2629746)

    def __init__(self):
        # Known makes; populated by fit().
        self.make_list = []

    def fit(self, df, y=None):
        """Remember the distinct non-null values of the ``make`` column.

        Args:
            df: DataFrame containing a ``make`` column.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.make_list = df["make"].dropna().unique()
        return self

    def transform(self, df):
        """Return a cleaned copy of ``df``; the input frame is left unmodified.

        Args:
            df: DataFrame with ``reg_date``, ``original_reg_date``,
                ``no_of_owners``, ``title``, ``make`` and ``model`` columns.

        Returns:
            A new DataFrame with the filled / derived columns described in the
            class docstring.
        """
        # Work on a copy so the caller's frame is not silently changed.
        df = df.copy()

        # Fall back to original_reg_date where reg_date is missing.
        reg_date = np.where(
            df["reg_date"].isnull(), df["original_reg_date"], df["reg_date"]
        )
        # Plain column assignment (rather than df.loc[:, col] = ...) so the
        # column really becomes datetime64 and the .dt accessor below works.
        df["reg_date"] = pd.to_datetime(pd.Series(reg_date, index=df.index))
        df["reg_date_year"] = df["reg_date"].dt.year
        # Elapsed time since registration, in average months. NOTE: measured
        # against the wall clock at call time, so the value varies between runs.
        df["reg_date_month"] = (datetime.now() - df["reg_date"]) / self._AVERAGE_MONTH

        df["no_of_owners"] = df["no_of_owners"].fillna(1)
        df["title"] = df["title"].str.lower()

        # Infer a missing make from the title; rows that already have a make
        # are left alone. Assigning a list keeps the fill positional.
        missing_make = df["make"].isnull()
        if missing_make.any():
            df.loc[missing_make, "make"] = [
                get_make_from_title(self.make_list, title)
                for title in df.loc[missing_make, "title"]
            ]

        # Combine the make and model *columns* (the model column itself, not
        # the literal text "df.model").
        df["make_model"] = df["make"] + "-" + df["model"]
        return df

## Custom Transformer Definitions

In [None]:
# Minimal pipeline: CommonPreProcessing followed by CoeStartDateFeatureCreator.
# NOTE: the same name is rebound to the full pipeline in the following cell.
pipeline_for_columns_15_and_above = Pipeline(
    [
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
    ]
)


In [None]:
def _mean_imputer(column, hierarchies):
    """Build a HierarchicalGroupImputer for ``column`` over ``hierarchies``.

    Every imputer in this pipeline passes "mean" and True as its last two
    positional arguments, so they are fixed here.
    """
    return HierarchicalGroupImputer(column, hierarchies, "mean", True)


# Full cleaning pipeline. Step order matters: later steps read columns that
# earlier steps create (e.g. vehicle_age, vehicle_age_bins, engine_cap_range).
pipeline_for_columns_15_and_above = Pipeline(
    [
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
        ("vehicle_age", AgeFeatureCreator()),
        (
            "omv",
            _mean_imputer(
                "omv",
                [["make", "model", "vehicle_age"], ["make", "model"]],
            ),
        ),
        ("coe", CoeTransformer()),
        ("arf", ArfTransformer()),
        ("parf", ParfFeatureCreator()),
        ("coe_rebate", CoeRebateFeatureCreator()),
        ("dereg_value_computed", DeregValueComputedFeatureCreator()),
        ("dereg_value", DeregValueTransformer()),
        (
            "mileage",
            _mean_imputer(
                "mileage",
                [["make", "model", "vehicle_age"], ["vehicle_age"]],
            ),
        ),
        (
            "vehicle_age_bins",
            ColumnValuesToCategory(
                "vehicle_age",
                "vehicle_age_bins",
                [0, 10, 20, 35, 50, np.inf],
                ["0-10", "10-20", "20-35", "35-50", ">50"],
            ),
        ),
        # Ideally depreciation would be (price - parf) / no_of_coe_years_left,
        # but price is unavailable for the test dataset, so missing values are
        # filled using the hierarchies below instead.
        (
            "depreciation",
            _mean_imputer(
                "depreciation",
                [
                    ["make", "model", "vehicle_age_bins"],
                    ["make", "vehicle_age_bins"],
                    ["vehicle_age_bins"],
                    ["make"],
                ],
            ),
        ),
        ("opc_scheme", OpcSchemeTransformer()),
        (
            "convert_value_to_category",
            ColumnValuesToCategory(
                "engine_cap",
                "engine_cap_range",
                [0, 600, 1000, 1600, 3000, np.inf],
                [
                    "EC<=600 cc",
                    # Label kept verbatim, including its trailing space.
                    "600 cc < EC <= 1000 cc ",
                    "1000 cc < EC <= 1600 cc",
                    "1600 cc < EC <= 3000 cc",
                    "EC > 3000 cc",
                ],
            ),
        ),
        (
            "road_tax",
            _mean_imputer(
                "road_tax",
                [
                    ["engine_cap_range", "opc_scheme", "vehicle_age", "fuel_type"],
                    ["engine_cap_range", "opc_scheme"],
                    ["opc_scheme"],
                ],
            ),
        ),
        ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
        (
            "features_count",
            CountUniqueItemsFeatureCreator("features", "features_count"),
        ),
        (
            "accessories_count",
            CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
        ),
    ]
)

In [None]:
# Fit every pipeline step on the training data and keep the transformed result.
cleaned_train = pipeline_for_columns_15_and_above.fit_transform(train)
# Print the shape of the input next to the shape of the pipeline output.
print(train.shape, cleaned_train.shape)
# Preview the first rows of the cleaned training data.
cleaned_train.head()

In [None]:
# Apply the pipeline to the test data with transform() only, so nothing is
# re-fitted on the test set.
cleaned_test = pipeline_for_columns_15_and_above.transform(test)
# Print the shape of the input next to the shape of the pipeline output.
print(test.shape, cleaned_test.shape)
# Preview the first rows of the cleaned test data.
cleaned_test.head()