## Imports

In [1]:
import sys

sys.path.append("..")

In [2]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from src.transformers import *

## Load and segregate data 

In [3]:
# Read train data and set index
train = pd.read_csv("../data/raw/train.csv")
train.set_index("listing_id", inplace=True)
train = train[~train.index.duplicated(keep="first")]

# Read test data and set index
test = pd.read_csv("../data/raw/test.csv")
test.set_index("listing_id", inplace=True)
# NOTE: Do not remove duplicates in test as kaggle expects 5000 entries in the submission file
# test = test[~test.index.duplicated(keep="first")]

# Set default logging level
# Change to logging.INFO to see related output
logging.basicConfig(level=logging.INFO, force=True)

## Columns Assigned

- [x] coe
- [x] dereg_value
- [x] omv
- [x] arf
- [x] opc_scheme
- [x] indicative_price
- [x] price
- [x] depreciation
- [x] eco_category
- [x] mileage
- [x] road_tax
- [x] lifespan
- [x] features
- [x] accessories

## Pre-defined Transformers

In [4]:
def get_make_from_title(make_list, title):
    title = title.split(" ")
    for i in range(len(title)):
        if " ".join(title[0 : i + 1]) in make_list:
            return " ".join(title[0 : i + 1])
    return "unknwon"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.make_list = []
        pass

    def fit(self, df):
        self.make_list = df.make.unique()
        return self

    def transform(self, df):
        df.loc[:, "reg_date"] = np.where(
            df["reg_date"].isnull(), df["original_reg_date"], df["reg_date"]
        )
        df.loc[:, "reg_date"] = pd.to_datetime(df.reg_date)
        df.loc[:, "reg_date_year"] = df.loc[:, "reg_date"].dt.year
        df.loc[:, "reg_date_month"] = (
            datetime.now() - df.loc[:, "reg_date"]
        ) / np.timedelta64(1, "M")
        df.loc[:, "no_of_owners"] = df["no_of_owners"].fillna(1)
        df.loc[:, "title"] = df["title"].str.lower()
        df.loc[:, "make"] = df.apply(
            lambda row: get_make_from_title(self.make_list, row["title"])
            if pd.isnull(row["make"])
            else row["make"],
            axis=1,
        )
        df.loc[:, "make_model"] = df.make + "-" + "df.model"
        return df

## Custom Transformer Definitions

In [5]:
pipeline_for_columns_15_and_above = Pipeline(
    steps=[
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
    ])


In [6]:
pipeline_for_columns_15_and_above = Pipeline(
    steps=[
        ("common_ops", CommonPreProcessing()),
        ("coe_start_date", CoeStartDateFeatureCreator()),
        ("vehicle_age", AgeFeatureCreator()),
        (
            "omv",
            HierarchicalGroupImputer(
                "omv",
                [["make", "model", "vehicle_age"], ["make", "model"]],
                "mean",
                True,
            ),
        ),
        ("coe", CoeTransformer()),
        ("arf", ArfTransformer()),
        ("parf", ParfFeatureCreator()),
        ("coe_rebate", CoeRebateFeatureCreator()),
        ("dereg_value_computed", DeregValueComputedFeatureCreator()),
        ("dereg_value", DeregValueTransformer()),
        ("depreciation", DepreciationTransformer()),
        (
            "mileage",
            HierarchicalGroupImputer(
                "mileage",
                [["make", "model", "vehicle_age"], ["vehicle_age"]],
                "mean",
                True,
            ),
        ),
        ("opc_scheme", OpcSchemeTransformer()),
        # TODO: transform engine_cap as ranges as defined in https://www.sgcarmart.com/services/roadtax_calculator.php
        # This will further improve the accuracy of imputed road_tax values
        (
            "road_tax",
            HierarchicalGroupImputer(
                "road_tax",
                [
                    ["engine_cap", "opc_scheme", "vehicle_age", "fuel_type"],
                    ["engine_cap", "opc_scheme"],
                    ["opc_scheme"],
                ],
                "mean",
                True,
            ),
        ),
        ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
        (
            "features_count",
            CountUniqueItemsFeatureCreator("features", "features_count"),
        ),
        (
            "accessories_count",
            CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
        ),
    ]
)

In [7]:
cleaned_train = pipeline_for_columns_15_and_above.fit_transform(train)
print(train.shape, cleaned_train.shape)
cleaned_train.head()

INFO:root:CoeStartDateFeatureCreator - Found 653 entries without COE Text
INFO:root:CoeStartDateFeatureCreator - 454 null entries left after using COE 10 
INFO:root:CoeStartDateFeatureCreator - 131 null entries left after using reg_date 
INFO:root:HierarchicalGroupImputer - total 41 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 34 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 12 null values left for omv after imputing with group ['make', 'model']
INFO:root:DepreciationTransformer - replacing 433 null values with 0
INFO:root:HierarchicalGroupImputer - total 3716 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 575 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 17 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 2148 null values to

(16728, 32) (16728, 46)


Unnamed: 0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_date,coe_start_year,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
0,bmw 3 series 320i gran turismo m-sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,2013-12-09,luxury sedan,"parf car, premium ad car, low mileage car",auto,...,2013-12-25 22:24:36,2013,8.0,1,27754.1,16705.0,44459.1,1,6,7
1,toyota hiace 3.0m,toyota,hiace,high loan available! low mileage unit. wear an...,2014.0,,2015-01-26,van,premium ad car,manual,...,2015-01-25 14:42:54,2015,7.0,0,0.0,3464.5,3464.5,-1,1,1
2,mercedes-benz cla-class cla180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,2016-07-25,luxury sedan,"parf car, premium ad car",auto,...,2016-07-26 11:26:42,2016,5.0,1,18228.7,25504.65,43733.35,1,1,4
3,mercedes-benz e-class e180 avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,2020-11-17,luxury sedan,"parf car, almost new car, consignment car",auto,...,2020-11-25 04:39:54,2020,2.0,1,42732.75,36960.083333,79692.833333,1,5,4
4,honda civic 1.6a vti,honda,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,2019-09-20,mid-sized sedan,parf car,auto,...,2019-09-26 01:52:30,2019,2.0,1,15075.75,21111.375,36187.125,1,7,6


In [8]:
cleaned_test = pipeline_for_columns_15_and_above.transform(test)
print(test.shape, cleaned_test.shape)
cleaned_test.head()

INFO:root:CoeStartDateFeatureCreator - Found 181 entries without COE Text
INFO:root:CoeStartDateFeatureCreator - 120 null entries left after using COE 10 
INFO:root:CoeStartDateFeatureCreator - 41 null entries left after using reg_date 
INFO:root:HierarchicalGroupImputer - total 12 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 10 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 2 null values left for omv after imputing with group ['make', 'model']
INFO:root:DepreciationTransformer - replacing 132 null values with 0
INFO:root:HierarchicalGroupImputer - total 1144 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 153 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 9 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 634 null values to imp

(5000, 31) (5000, 45)


Unnamed: 0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_date,coe_start_year,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
0,bmw x6 xdrive35i sunroof (new 10-yr coe),bmw,x6,"owner consignment unit, viewing strictly by ap...",2012.0,,2012-06-27,suv,coe car,auto,...,2021-10-26 00:00:00,2021,9.0,0,0.0,39301.357272,39301.357272,1,2,6
1,porsche 911 carrera s coupe 3.8a pdk (coe till...,porsche,911,the 911 carrera s displacing 3.8 litres with m...,2010.0,,2010-05-11,sports car,"coe car, direct owner sale",auto,...,2019-01-25 13:59:42,2019,11.0,0,0.0,22875.925,22875.925,1,2,7
2,porsche macan diesel s 3.0a pdk,porsche,macan,comes with agent warranty till january 2022. a...,2016.0,,2017-01-18,suv,"parf car, premium ad car",auto,...,2017-01-18 00:00:00,2017,5.0,1,68502.0,28388.120783,96890.120783,1,1,9
3,bmw 5 series 530i luxury,bmw,530i,a careful owner upgraded to a porsche macan. t...,2017.0,,2017-06-28,luxury sedan,"parf car, sgcarmart warranty cars",auto,...,2017-07-26 17:15:54,2017,4.0,1,45831.75,27313.075,73144.825,1,5,8
4,honda vezel 1.5a x,honda,vezel,100% non phv-unit! excellent condition and wel...,2016.0,,2017-06-05,suv,parf car,auto,...,2017-06-26 06:46:48,2017,5.0,1,7180.6,26343.766667,33524.366667,1,7,6


## TODO

Find better alternatives to fill null values for the below features
1. depreciation

As of now, some rows are filled with 0 for missing entries. Detailed description for each of the above is available in their corresponding transformers

## The below scrapes the coe expiry information and persists the same in `../data/raw/` folder

In [9]:
# %load_ext jupyterlab_notify

In [10]:
# %%notify

# import requests
# import dask

# from tqdm import tqdm
# from bs4 import BeautifulSoup
# from dask.distributed import Client


# client = Client(serializers=['dask', 'pickle'],
#                 deserializers=['dask', 'msgpack'])
# client.cluster.scale(10)

# def scrape_coe_left(listing_id):

#         logging.info(
#             f"CoeStartDateFeatureCreator - scraping sgcarmart for listing_id - {listing_id}"
#         )

#         response = requests.get(
#             f"https://www.sgcarmart.com/used_cars/info.php?ID={listing_id}"
#         )
#         try:
#             soup = BeautifulSoup(response.text, features="html.parser")
#             result = soup.find(
#                 id="carInfo").contents[3].contents[-2].contents[-1]
#         except AttributeError as e:
#             result = ""

#         return (listing_id, result)
    
# futures = []
# for listing_id in tqdm(test.index):
#     future = client.submit(scrape_coe_left, listing_id)
#     futures.append(future)

# test_results = client.gather(futures, errors="skip")
# coe_text_info = pd.DataFrame(test_results, columns=["listing_id", "coe_text"])
# coe_text_info.set_index("listing_id", inplace=True)
# coe_text_info.to_csv("../data/raw/test_coe_text.csv")

# futures = []
# for listing_id in tqdm(train.index):
#     future = client.submit(scrape_coe_left, listing_id)
#     futures.append(future)

# train_results = client.gather(futures, errors="skip")
# coe_text_info = pd.DataFrame(train_results, columns=["listing_id", "coe_text"])
# coe_text_info.set_index("listing_id", inplace=True)
# coe_text_info.to_csv("../data/raw/train_coe_text.csv")

# client.shutdown()