## Imports

In [1]:
import sys

# Add the parent directory to the module search path; the next cell imports
# from the `src` package, which presumably lives there.
# NOTE(review): ".." is resolved against the kernel's working directory —
# presumably the notebook's own folder; confirm before running from elsewhere.
sys.path.append("..")

In [2]:
# The wildcard import stays first so that the explicit imports below take
# precedence over any same-named objects it brings into the namespace.
from src.transformers import *

import logging
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

## Load Data

In [3]:
# Configure the root logger at INFO so the pipeline's progress messages
# (e.g. the imputers' null-count reports) appear in the cell outputs.
# Raise the level to logging.WARNING to hide them.
# force=True replaces any handlers already attached to the root logger, so the
# setting also takes effect when this cell is re-run.
logging.basicConfig(level=logging.INFO, force=True)

In [4]:
# Load both raw splits, key each frame by its listing_id and discard repeated
# rows. drop_duplicates compares column values only; the index is ignored.
train = (
    pd.read_csv("../data/raw/train.csv")
    .set_index("listing_id")
    .drop_duplicates()
)

test = (
    pd.read_csv("../data/raw/test.csv")
    .set_index("listing_id")
    .drop_duplicates()
)

## Construct Pipeline

In [5]:
# Bin edges and labels handed to ColumnValuesToCategory to derive
# `engine_cap_range` from `engine_cap`.
# NOTE(review): the second label ends with a trailing space. It is reproduced
# verbatim so the generated category values stay unchanged — confirm whether
# the space is intentional.
ENGINE_CAP_BINS = [0, 600, 1000, 1600, 3000, np.inf]
ENGINE_CAP_LABELS = [
    "EC<=600 cc",
    "600 cc < EC <= 1000 cc ",
    "1000 cc < EC <= 1600 cc",
    "1600 cc < EC <= 3000 cc",
    "EC > 3000 cc",
]

# Pre-processing followed by the GroupMissingValueImputer steps.
spec_imputation_steps = [
    ("pre_processing", PreProcessing()),
    ("imp_power", GroupMissingValueImputer("power", ["type_of_vehicle"], "mean")),
    (
        "imp_engine_cap",
        GroupMissingValueImputer("engine_cap", ["make", "type_of_vehicle"], "mean"),
    ),
    # No aggregation argument is passed here, so the imputer's default applies.
    (
        "imp_curb_weight",
        GroupMissingValueImputer("curb_weight", ["make", "type_of_vehicle"]),
    ),
    # Disabled step: ("add_cat", SplitValuesToColumn("category")),
]

# Columns 9 to 15
car_spec_steps = [
    (
        "curb_weight",
        CarSpecificationsTransformer(
            "curb_weight", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    (
        "power",
        CarSpecificationsTransformer(
            "power", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    ("fuel_type", CarSpecificationsTransformer("fuel_type", ["make", "model"], "mode")),
    (
        "engine_cap",
        CarSpecificationsTransformer(
            "engine_cap", ["make", "model", "type_of_vehicle", "manufactured"]
        ),
    ),
    ("fuel_type_missing", CarSpecsMissingWithTypeOfVehicle(["fuel_type"], "mode")),
    (
        "car_spec_missing",
        CarSpecsMissingWithTypeOfVehicle(["curb_weight", "power", "engine_cap"], "mean"),
    ),
]

# Derived categorical / indicator columns for engine_cap, fuel_type and transmission.
encoding_steps = [
    (
        "convert_value_to_category",
        ColumnValuesToCategory(
            "engine_cap", "engine_cap_range", ENGINE_CAP_BINS, ENGINE_CAP_LABELS
        ),
    ),
    (
        "fuel_type_one_hot",
        OneHotTransformer(
            "fuel_type", ["diesel", "petrol-electric", "petrol", "electric"]
        ),
    ),
    ("transmission_one_hot", OneHotTransformer("transmission", ["auto", "manual"])),
]

# Columns 15 and above
valuation_steps = [
    ("coe_start_date", CoeStartDateFeatureCreator()),
    ("vehicle_age", AgeFeatureCreator()),
    (
        "omv",
        HierarchicalGroupImputer(
            "omv",
            [["make", "model", "vehicle_age"], ["make", "model"]],
            "mean",
            True,
        ),
    ),
    ("coe", CoeTransformer()),
    ("arf", ArfTransformer()),
    ("parf", ParfFeatureCreator()),
    ("coe_rebate", CoeRebateFeatureCreator()),
    ("dereg_value_computed", DeregValueComputedFeatureCreator()),
    ("dereg_value", DeregValueTransformer()),
    ("depreciation", DepreciationTransformer()),
    (
        "mileage",
        HierarchicalGroupImputer(
            "mileage",
            [["make", "model", "vehicle_age"], ["vehicle_age"]],
            "mean",
            True,
        ),
    ),
    ("opc_scheme", OpcSchemeTransformer()),
    # TODO: transform engine_cap as ranges as defined in https://www.sgcarmart.com/services/roadtax_calculator.php
    # This will further improve the accuracy of imputed road_tax values
    (
        "road_tax",
        HierarchicalGroupImputer(
            "road_tax",
            [
                ["engine_cap", "opc_scheme", "vehicle_age", "fuel_type"],
                ["engine_cap", "opc_scheme"],
                ["opc_scheme"],
            ],
            "mean",
            True,
        ),
    ),
]

# Remaining feature creators: lifespan flag, item counts and brand rank.
count_and_rank_steps = [
    ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
    ("features_count", CountUniqueItemsFeatureCreator("features", "features_count")),
    (
        "accessories_count",
        CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
    ),
    ("brand_rank", BrandRankTransformer()),
]

# Assemble the stages in order; the step sequence is significant because later
# steps reference columns such as vehicle_age and opc_scheme that are named
# after earlier steps.
data_pipeline = Pipeline(
    [
        *spec_imputation_steps,
        *car_spec_steps,
        *encoding_steps,
        *valuation_steps,
        *count_and_rank_steps,
    ]
)

In [6]:
# Fit the pipeline on the training split and transform it in the same call.
cleaned_train = data_pipeline.fit_transform(train)

# The pipeline is expected to add or alter columns only; fail loudly if any
# rows were dropped or duplicated along the way.
assert len(cleaned_train) == len(train), "pipeline changed the number of training rows"

display(cleaned_train.head())
# Shapes before and after the pipeline, shown as the cell result.
train.shape, cleaned_train.shape

INFO:root:HierarchicalGroupImputer - total 41 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 34 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 12 null values left for omv after imputing with group ['make', 'model']
INFO:root:DeregValueTransformer - replacing 233 null values with 0
INFO:root:DepreciationTransformer - replacing 433 null values with 0
INFO:root:HierarchicalGroupImputer - total 3721 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 576 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 17 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 2152 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 1537 null values left for road_tax after imputing with group ['engine_cap', 'opc_scheme', 'vehicle_age', 'fuel_ty

Unnamed: 0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count,brand_rank
0,bmw 3 series 320i gran turismo m-sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,2013-12-09,luxury sedan,"parf car, premium ad car, low mileage car",auto,...,26.823549,8.0,1,27754.1,17234.130064,44988.230064,1,6,7,3
1,toyota hiace 3.0m,toyota,hiace,high loan available! low mileage unit. wear an...,2014.0,,2015-01-26,van,premium ad car,manual,...,40.392616,7.0,0,0.0,3588.210709,3588.210709,-1,1,1,2
2,mercedes-benz cla-class cla180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,2016-07-25,luxury sedan,"parf car, premium ad car",auto,...,58.331383,5.0,1,18228.7,26100.377133,44329.077133,1,1,4,4
3,mercedes-benz e-class e180 avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,2020-11-17,luxury sedan,"parf car, almost new car, consignment car",auto,...,110.11068,2.0,1,42732.75,37336.696373,80069.446373,1,5,4,4
4,honda civic 1.6a vti,honda,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,2019-09-20,mid-sized sedan,parf car,auto,...,96.180209,2.0,1,15075.75,21373.646954,36449.396954,1,7,6,2


((16780, 31), (16780, 50))

In [7]:
# Inspect the full column list of the transformed training frame.
cleaned_train.columns

Index(['title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price', 'reg_date_year',
       'make_model', 'engine_cap_range', 'fuel_type_diesel',
       'fuel_type_petrol-electric', 'fuel_type_petrol', 'fuel_type_electric',
       'transmission_auto', 'transmission_manual', 'coe_start_date',
       'coe_start_year', 'coe_expiry_months', 'vehicle_age', 'is_parf_car',
       'parf', 'coe_rebate', 'dereg_value_computed', 'lifespan_restriction',
       'features_count', 'accessories_count', 'brand_rank'],
      dtype='object')

In [8]:
# Apply the already-fitted pipeline to the test split (transform only, no refit).
cleaned_test = data_pipeline.transform(test)

# Every test row must survive the pipeline unchanged in count.
assert len(cleaned_test) == len(test), "pipeline changed the number of test rows"

# Train and test must expose the same columns; only `price`, which exists
# solely in the training data, is allowed to differ.
test_only_cols = set(cleaned_test.columns) - set(cleaned_train.columns)
train_only_cols = set(cleaned_train.columns) - set(cleaned_test.columns) - {"price"}
assert not test_only_cols, f"columns present only in test: {sorted(test_only_cols)}"
assert not train_only_cols, f"columns missing from test: {sorted(train_only_cols)}"

display(cleaned_test.head())
# Shapes before and after the pipeline, shown as the cell result.
test.shape, cleaned_test.shape

INFO:root:HierarchicalGroupImputer - total 12 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 10 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 2 null values left for omv after imputing with group ['make', 'model']
INFO:root:DeregValueTransformer - replacing 67 null values with 0
INFO:root:DepreciationTransformer - replacing 132 null values with 0
INFO:root:HierarchicalGroupImputer - total 1144 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 153 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 9 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 634 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 442 null values left for road_tax after imputing with group ['engine_cap', 'opc_scheme', 'vehicle_age', 'fuel_type']


Unnamed: 0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count,brand_rank
0,bmw x6 xdrive35i sunroof (new 10-yr coe),bmw,x6,"owner consignment unit, viewing strictly by ap...",2012.0,,2012-06-27,suv,coe car,auto,...,9.41046,9.0,0,0.0,4813.874077,4813.874077,1,2,6,3
1,porsche 911 carrera s coupe 3.8a pdk (coe till...,porsche,911,the 911 carrera s displacing 3.8 litres with m...,2010.0,,2010-05-11,sports car,"coe car, direct owner sale",auto,...,88.274332,11.0,0,0.0,23211.0,23211.0,1,2,7,5
2,porsche macan diesel s 3.0a pdk,porsche,macan,comes with agent warranty till january 2022. a...,2016.0,,2017-01-18,suv,"parf car, premium ad car",auto,...,64.146697,5.0,1,68502.0,28388.120783,96890.120783,1,1,9,5
3,bmw 5 series 530i luxury,bmw,530i,a careful owner upgraded to a porsche macan. t...,2017.0,,2017-06-28,luxury sedan,"parf car, sgcarmart warranty cars",auto,...,69.436333,4.0,1,45831.75,27485.793938,73317.543938,1,5,8,3
4,honda vezel 1.5a x,honda,vezel,100% non phv-unit! excellent condition and wel...,2016.0,,2017-06-05,suv,parf car,auto,...,68.680671,5.0,1,7180.6,26607.464308,33788.064308,1,7,6,2


((5000, 30), (5000, 49))

In [9]:
# Inspect the full column list of the transformed test frame.
cleaned_test.columns

Index(['title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'reg_date_year',
       'make_model', 'engine_cap_range', 'fuel_type_diesel',
       'fuel_type_petrol-electric', 'fuel_type_petrol', 'fuel_type_electric',
       'transmission_auto', 'transmission_manual', 'coe_start_date',
       'coe_start_year', 'coe_expiry_months', 'vehicle_age', 'is_parf_car',
       'parf', 'coe_rebate', 'dereg_value_computed', 'lifespan_restriction',
       'features_count', 'accessories_count', 'brand_rank'],
      dtype='object')

## Write to disk

In [10]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs("../data/processed", exist_ok=True)

# Persist both transformed splits; the frame index is written as the first column.
cleaned_train.to_csv("../data/processed/train.csv")
cleaned_test.to_csv("../data/processed/test.csv")