## Imports

In [1]:
import sys

# Add the parent directory to the module search path so that the `src` package
# imported by the following cell can be resolved.
# NOTE(review): assumes the notebook is executed from a directory directly below
# the project root — confirm.
sys.path.append("..")

In [2]:
# Project-local wildcard import is kept first so that the explicit imports below
# take precedence over any same-named objects it re-exports.
from src.transformers import *

# Standard library
import logging
from pathlib import Path

# Third-party (logging, np and pd were previously only reachable through the
# wildcard import above; import them explicitly so the notebook does not depend on it)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

## Load Data

In [3]:
# Configure the root logger at INFO so the progress messages logged by the
# pipeline steps are shown in the cell outputs below.
# Use a higher level (e.g. logging.WARNING) to hide them.
# force=True removes handlers already attached to the root logger, so
# re-running this cell actually applies the new level.
logging.basicConfig(force=True, level=logging.INFO)

In [4]:
# Load the raw splits; the training frame is temporarily keyed by listing_id
# so that repeated listings can be detected on the index.
train = pd.read_csv("../data/raw/train.csv").set_index("listing_id")
test = pd.read_csv("../data/raw/test.csv")

# Keep only the first row for every listing_id, then turn listing_id back into
# a regular column with a fresh positional index.
is_first_occurrence = ~train.index.duplicated(keep="first")
train = train[is_first_occurrence].reset_index()

# `test` is intentionally left exactly as loaded (default index, no de-duplication).

In [5]:
# Spot-check a single raw test row (position 3031) before cleaning; in the saved
# output several of its fields (manufactured, reg_date, curb_weight, power, fuel_type) are NaN.
test.iloc[3031]

listing_id                                                      992315
title                                   Rolls-Royce Wraith Black Badge
make                                                       rolls-royce
model                                                           wraith
description          brand new wraith black badge - view at our sho...
manufactured                                                       NaN
original_reg_date                                          20-mar-2021
reg_date                                                           NaN
type_of_vehicle                                             sports car
category                 imported used vehicle, coe car, rare & exotic
transmission                                                      auto
curb_weight                                                        NaN
power                                                              NaN
fuel_type                                                          NaN
engine

## Construct Pipeline

In [6]:
# Bin edges and labels used to bucket `engine_cap` into `engine_cap_range`.
# The label strings are runtime values and are kept byte-for-byte, including
# the trailing space on the second label.
# NOTE(review): that trailing space looks unintentional — confirm nothing
# downstream matches on it before normalising.
engine_cap_bin_edges = [0, 600, 1000, 1600, 3000, np.inf]
engine_cap_bin_labels = [
    "EC<=600 cc",
    "600 cc < EC <= 1000 cc ",
    "1000 cc < EC <= 1600 cc",
    "1600 cc < EC <= 3000 cc",
    "EC > 3000 cc",
]

# --- Stage 1: generic pre-processing and coarse group-level imputation -------
# NOTE(review): these coarse imputers run before the more specific
# CarSpecificationsTransformer steps for the same columns — confirm the order
# is intended.
# NOTE(review): "imp_curb_weight" is the only GroupMissingValueImputer that does
# not pass an explicit strategy; it relies on the class default — confirm it
# matches the "mean" used by its siblings.
coarse_imputation_steps = [
    ("pre_processing", PreProcessing()),
    ("imp_power", GroupMissingValueImputer("power", ["type_of_vehicle"], "mean")),
    (
        "imp_engine_cap",
        GroupMissingValueImputer("engine_cap", ["make", "type_of_vehicle"], "mean"),
    ),
    (
        "imp_curb_weight",
        GroupMissingValueImputer("curb_weight", ["make", "type_of_vehicle"]),
    ),
    # Disabled step, kept for reference:
    #     ("add_cat", SplitValuesToColumn("category")),
]

# --- Stage 2: car specifications (columns 9 to 15) ---------------------------
car_specification_steps = [
    (
        "curb_weight",
        CarSpecificationsTransformer(
            "curb_weight", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    (
        "power",
        CarSpecificationsTransformer(
            "power", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    (
        "fuel_type",
        CarSpecificationsTransformer("fuel_type", ["make", "model"], "mode"),
    ),
    (
        "engine_cap",
        CarSpecificationsTransformer(
            "engine_cap", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    ("fuel_type_missing", CarSpecsMissingWithTypeOfVehicle(["fuel_type"], "mode")),
    (
        "car_spec_missing",
        CarSpecsMissingWithTypeOfVehicle(
            ["curb_weight", "power", "engine_cap"], "mean"
        ),
    ),
]

# --- Stage 3: categorical encodings ------------------------------------------
# Adds `engine_cap_range` plus the `fuel_type_*` / `transmission_*` indicator
# columns that appear in the cleaned frames.
encoding_steps = [
    (
        "convert_value_to_category",
        ColumnValuesToCategory(
            "engine_cap",
            "engine_cap_range",
            engine_cap_bin_edges,
            engine_cap_bin_labels,
        ),
    ),
    (
        "fuel_type_one_hot",
        OneHotTransformer(
            "fuel_type", ["diesel", "petrol-electric", "petrol", "electric"]
        ),
    ),
    (
        "transmission_one_hot",
        OneHotTransformer("transmission", ["auto", "manual"]),
    ),
]

# --- Stage 4: registration / valuation features (columns 15 and above) -------
# HierarchicalGroupImputer receives a list of groupings; the log output shows
# them being applied one after another until no nulls remain (or groups run out).
# NOTE(review): the meaning of the trailing positional `True` is not visible
# here — check the class signature.
valuation_steps = [
    ("coe_start_date", CoeStartDateFeatureCreator()),
    ("vehicle_age", AgeFeatureCreator()),
    (
        "omv",
        HierarchicalGroupImputer(
            "omv",
            [["make", "model", "vehicle_age"], ["make", "model"]],
            "mean",
            True,
        ),
    ),
    ("coe", CoeTransformer()),
    ("arf", ArfTransformer()),
    ("parf", ParfFeatureCreator()),
    ("coe_rebate", CoeRebateFeatureCreator()),
    ("dereg_value_computed", DeregValueComputedFeatureCreator()),
    ("dereg_value", DeregValueTransformer()),
    ("depreciation", DepreciationTransformer()),
    (
        "mileage",
        HierarchicalGroupImputer(
            "mileage",
            [["make", "model", "vehicle_age"], ["vehicle_age"]],
            "mean",
            True,
        ),
    ),
    ("opc_scheme", OpcSchemeTransformer()),
    # NOTE(review): the original TODO here asked to transform engine_cap into the
    # ranges defined at https://www.sgcarmart.com/services/roadtax_calculator.php
    # to improve the imputed road_tax values. `engine_cap_range` (built in the
    # encoding stage) is already used as the grouping key below — confirm the bin
    # edges match that calculator and then retire the TODO.
    (
        "road_tax",
        HierarchicalGroupImputer(
            "road_tax",
            [
                ["engine_cap_range", "opc_scheme", "vehicle_age", "fuel_type"],
                ["engine_cap_range", "opc_scheme"],
                ["engine_cap_range"],
            ],
            "mean",
            True,
        ),
    ),
]

# --- Stage 5: listing-level derived features ---------------------------------
listing_feature_steps = [
    ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
    (
        "features_count",
        CountUniqueItemsFeatureCreator("features", "features_count"),
    ),
    (
        "accessories_count",
        CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
    ),
    ("brand_rank", BrandRankTransformer()),
]

# Assemble the full cleaning pipeline; step order is significant because later
# steps group on columns produced by earlier ones (e.g. vehicle_age, engine_cap_range).
data_pipeline = Pipeline(
    [
        *coarse_imputation_steps,
        *car_specification_steps,
        *encoding_steps,
        *valuation_steps,
        *listing_feature_steps,
    ]
)

In [7]:
# Fit every pipeline step on the training split and transform it in the same call.
cleaned_train = data_pipeline.fit_transform(train)

# Preview the first five cleaned rows, then report (raw shape, cleaned shape)
# as the cell's result.
display(cleaned_train.head(5))
(train.shape, cleaned_train.shape)

modified


INFO:root:CoeStartDateFeatureCreator - Found 653 entries without COE Text
INFO:root:CoeStartDateFeatureCreator - 454 null entries left after using COE 10 
INFO:root:CoeStartDateFeatureCreator - 131 null entries left after using reg_date 
INFO:root:HierarchicalGroupImputer - total 41 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 34 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 12 null values left for omv after imputing with group ['make', 'model']
INFO:root:DepreciationTransformer - replacing 433 null values with 0
INFO:root:HierarchicalGroupImputer - total 3716 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 575 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 17 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 2148 null values to

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,coe_start_year,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count,brand_rank
0,1030324,bmw 3 series 320i gran turismo m-sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,2013-12-09,luxury sedan,"parf car, premium ad car, low mileage car",...,2013,8.0,1,27754.1,16705.0,44459.1,1,6,7,3
1,1021510,toyota hiace 3.0m,toyota,hiace,high loan available! low mileage unit. wear an...,2014.0,,2015-01-26,van,premium ad car,...,2015,7.0,0,0.0,3464.5,3464.5,-1,1,1,2
2,1026909,mercedes-benz cla-class cla180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,2016-07-25,luxury sedan,"parf car, premium ad car",...,2016,5.0,1,18228.7,25504.65,43733.35,1,1,4,4
3,1019371,mercedes-benz e-class e180 avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,2020-11-17,luxury sedan,"parf car, almost new car, consignment car",...,2020,2.0,1,42732.75,36960.083333,79692.833333,1,5,4,4
4,1031014,honda civic 1.6a vti,honda,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,2019-09-20,mid-sized sedan,parf car,...,2019,2.0,1,15075.75,21111.375,36187.125,1,7,6,2


((16728, 32), (16728, 54))

In [8]:
# List every column of the cleaned training frame (raw columns plus the features added by the pipeline).
cleaned_train.columns

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price', 'reg_date_year',
       'make_model', 'engine_cap_range', 'fuel_type_diesel',
       'fuel_type_petrol-electric', 'fuel_type_petrol', 'fuel_type_electric',
       'transmission_auto', 'transmission_manual', 'coe_text',
       'coe_expiry_days', 'coe_expiry_months', 'coe_expiry_date',
       'coe_start_date', 'coe_start_year', 'vehicle_age', 'is_parf_car',
       'parf', 'coe_rebate', 'dereg_value_computed', 'lifespan_restriction',
       'features_count', 'accessories_count', 'brand_rank'],
      dtype='object')

In [9]:
# Apply the pipeline to the test split with `transform` only, i.e. without
# calling fit again on the test data.
cleaned_test = data_pipeline.transform(test)

# Preview the first five cleaned rows, then report (raw shape, cleaned shape)
# as the cell's result.
display(cleaned_test.head(5))
(test.shape, cleaned_test.shape)

INFO:root:CoeStartDateFeatureCreator - Found 181 entries without COE Text
INFO:root:CoeStartDateFeatureCreator - 120 null entries left after using COE 10 
INFO:root:CoeStartDateFeatureCreator - 41 null entries left after using reg_date 
INFO:root:HierarchicalGroupImputer - total 12 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 10 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 2 null values left for omv after imputing with group ['make', 'model']


modified


INFO:root:DepreciationTransformer - replacing 132 null values with 0
INFO:root:HierarchicalGroupImputer - total 1144 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 153 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 9 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 634 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 53 null values left for road_tax after imputing with group ['engine_cap_range', 'opc_scheme', 'vehicle_age', 'fuel_type']
INFO:root:HierarchicalGroupImputer - 1 null values left for road_tax after imputing with group ['engine_cap_range', 'opc_scheme']
INFO:root:HierarchicalGroupImputer - 1 null values left for road_tax after imputing with group ['engine_cap_range']


Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,coe_start_year,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count,brand_rank
0,1029166,bmw x6 xdrive35i sunroof (new 10-yr coe),bmw,x6,"owner consignment unit, viewing strictly by ap...",2012.0,,2012-06-27,suv,coe car,...,2021,9.0,0,0.0,39301.357272,39301.357272,1,2,6,3
1,1017714,porsche 911 carrera s coupe 3.8a pdk (coe till...,porsche,911,the 911 carrera s displacing 3.8 litres with m...,2010.0,,2010-05-11,sports car,"coe car, direct owner sale",...,2019,11.0,0,0.0,22875.925,22875.925,1,2,7,5
2,1005265,porsche macan diesel s 3.0a pdk,porsche,macan,comes with agent warranty till january 2022. a...,2016.0,,2017-01-18,suv,"parf car, premium ad car",...,2017,5.0,1,68502.0,28388.120783,96890.120783,1,1,9,5
3,1029464,bmw 5 series 530i luxury,bmw,530i,a careful owner upgraded to a porsche macan. t...,2017.0,,2017-06-28,luxury sedan,"parf car, sgcarmart warranty cars",...,2017,4.0,1,45831.75,27313.075,73144.825,1,5,8,3
4,1017727,honda vezel 1.5a x,honda,vezel,100% non phv-unit! excellent condition and wel...,2016.0,,2017-06-05,suv,parf car,...,2017,5.0,1,7180.6,26343.766667,33524.366667,1,7,6,2


((5000, 31), (5000, 53))

In [10]:
# List every column of the cleaned test frame; per the saved outputs it has one
# column fewer than the training frame (no `price`).
cleaned_test.columns

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'reg_date_year',
       'make_model', 'engine_cap_range', 'fuel_type_diesel',
       'fuel_type_petrol-electric', 'fuel_type_petrol', 'fuel_type_electric',
       'transmission_auto', 'transmission_manual', 'coe_text',
       'coe_expiry_days', 'coe_expiry_months', 'coe_expiry_date',
       'coe_start_date', 'coe_start_year', 'vehicle_age', 'is_parf_car',
       'parf', 'coe_rebate', 'dereg_value_computed', 'lifespan_restriction',
       'features_count', 'accessories_count', 'brand_rank'],
      dtype='object')

## Write to disk

In [11]:
# Persist the cleaned splits as CSV files under ../data/processed.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target directory exists first; otherwise a fresh checkout fails on this cell.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the files; listing_id is
# already present as a regular column.
cleaned_train.to_csv(processed_dir / "train.csv", index=False)
cleaned_test.to_csv(processed_dir / "test.csv", index=False)

In [12]:
# Re-inspect test row 3031 after the pipeline to see which previously missing
# fields (e.g. manufactured, reg_date, curb_weight) have been filled in.
cleaned_test.iloc[3031]

listing_id                                                              992315
title                                           rolls-royce wraith black badge
make                                                               rolls-royce
model                                                                   wraith
description                  brand new wraith black badge - view at our sho...
manufactured                                                            2021.0
original_reg_date                                                  20-mar-2021
reg_date                                                   2021-03-20 00:00:00
type_of_vehicle                                                     sports car
category                         imported used vehicle, coe car, rare & exotic
transmission                                                              auto
curb_weight                                                        1550.406895
power                                               

In [13]:
# Load ../data/raw/test_coe_text.csv keyed by listing_id, then keep only the
# first row for each listing_id that occurs more than once.
coe_text = pd.read_csv("../data/raw/test_coe_text.csv").set_index("listing_id")
coe_text = coe_text.loc[~coe_text.index.duplicated(keep="first")]
