## Imports

In [1]:
import sys

sys.path.append("..")

In [2]:
from src.transformers import *
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Load Data

In [3]:
# Configure root logging for the notebook.
# INFO level surfaces the progress messages logged while the pipeline runs
# (e.g. the "HierarchicalGroupImputer - ... null values ..." lines shown below);
# switch to logging.WARNING to silence them.
# force=True removes any handlers already attached to the root logger, so this
# configuration takes effect even if logging was configured earlier.
# NOTE(review): `logging` is not imported explicitly in this notebook — presumably
# it arrives via `from src.transformers import *`; confirm or add `import logging`.
logging.basicConfig(level=logging.INFO, force=True)

In [4]:
# Load the raw train/test splits, index each frame by its listing id, then
# discard rows whose column values are exact duplicates of an earlier row
# (drop_duplicates compares column values only; the index is ignored).
train = (
    pd.read_csv("../data/raw/train.csv")
    .set_index("listing_id")
    .drop_duplicates()
)

test = (
    pd.read_csv("../data/raw/test.csv")
    .set_index("listing_id")
    .drop_duplicates()
)

## Construct Pipeline

In [5]:
# Bin edges and labels used to derive `engine_cap_range` from `engine_cap`.
# Six edges define five intervals, one per label.
# NOTE(review): the edges look like the road-tax engine-capacity bands referenced
# in the TODO next to the road_tax step — confirm against that calculator.
# NOTE(review): the second label carries a trailing space; it is kept verbatim
# because downstream code may match on the exact string — confirm and clean up.
ENGINE_CAP_BIN_EDGES = [0, 600, 1000, 1600, 3000, np.inf]
ENGINE_CAP_BIN_LABELS = [
    "EC<=600 cc",
    "600 cc < EC <= 1000 cc ",
    "1000 cc < EC <= 1600 cc",
    "1600 cc < EC <= 3000 cc",
    "EC > 3000 cc",
]

# Data-cleaning / feature-engineering pipeline.
# Steps run in list order and each step receives the previous step's output, so
# order matters: e.g. "vehicle_age" must run before the "omv" and "mileage"
# imputers, which group by the `vehicle_age` column.
# All transformer classes come from `src.transformers`; the per-step notes below
# describe only what the arguments show — see that module for exact semantics.
data_pipeline = Pipeline(
    [
        ("pre_processing", PreProcessing()),
        # Group-based imputers: (target column, grouping columns[, strategy]).
        (
            "imp_manufactured",
            GroupMissingValueImputer(
                "manufactured", ["make", "model", "type_of_vehicle"], "first"
            ),
        ),
        ("imp_power", GroupMissingValueImputer("power", ["type_of_vehicle"], "mean")),
        (
            "imp_engine_cap",
            GroupMissingValueImputer("engine_cap", ["make", "type_of_vehicle"], "mean"),
        ),
        (
            # No strategy passed here: the imputer's default strategy applies.
            "imp_curb_weight",
            GroupMissingValueImputer("curb_weight", ["make", "type_of_vehicle"]),
        ),
        #         ("add_cat", SplitValuesToColumn("category")),
        # Columns 9 to 15
        (
            "curb_weight",
            CarSpecificationsTransformer(
                "curb_weight", ["make", "model", "type_of_vehicle", "manufactured"]
            ),
        ),
        (
            "power",
            CarSpecificationsTransformer(
                "power", ["make", "model", "type_of_vehicle", "manufactured"]
            ),
        ),
        (
            "fuel_type",
            CarSpecificationsTransformer("fuel_type", ["make", "model"], "mode"),
        ),
        (
            "engine_cap",
            CarSpecificationsTransformer(
                "engine_cap", ["make", "model", "type_of_vehicle", "manufactured"]
            ),
        ),
        ("fuel_type_missing", CarSpecsMissingWithTypeOfVehicle(["fuel_type"], "mode")),
        (
            "car_spec_missing",
            CarSpecsMissingWithTypeOfVehicle(
                ["curb_weight", "power", "engine_cap"], "mean"
            ),
        ),
        (
            # Presumably bins `engine_cap` into the new `engine_cap_range` column
            # using the edges/labels above — confirm in src.transformers.
            "convert_value_to_category",
            ColumnValuesToCategory(
                "engine_cap",
                "engine_cap_range",
                ENGINE_CAP_BIN_EDGES,
                ENGINE_CAP_BIN_LABELS,
            ),
        ),
        # Columns 15 and above
        ("coe_start_date", CoeStartDateFeatureCreator()),
        ("vehicle_age", AgeFeatureCreator()),
        (
            # Hierarchical imputer: tries each grouping in turn, most specific first
            # (the INFO log reports the nulls left after each grouping).
            "omv",
            HierarchicalGroupImputer(
                "omv",
                [["make", "model", "vehicle_age"], ["make", "model"]],
                "mean",
                True,
            ),
        ),
        ("coe", CoeTransformer()),
        ("arf", ArfTransformer()),
        ("parf", ParfFeatureCreator()),
        ("coe_rebate", CoeRebateFeatureCreator()),
        ("dereg_value_computed", DeregValueComputedFeatureCreator()),
        ("dereg_value", DeregValueTransformer()),
        ("depreciation", DepreciationTransformer()),
        (
            "mileage",
            HierarchicalGroupImputer(
                "mileage",
                [["make", "model", "vehicle_age"], ["vehicle_age"]],
                "mean",
                True,
            ),
        ),
        ("opc_scheme", OpcSchemeTransformer()),
        # TODO: transform engine_cap as ranges as defined in https://www.sgcarmart.com/services/roadtax_calculator.php
        # This will further improve the accuracy of imputed road_tax values
        (
            "road_tax",
            HierarchicalGroupImputer(
                "road_tax",
                [
                    ["engine_cap", "opc_scheme", "vehicle_age", "fuel_type"],
                    ["engine_cap", "opc_scheme"],
                    ["opc_scheme"],
                ],
                "mean",
                True,
            ),
        ),
        ("lifespan_restriction", LifespanRestrictionFeatureCreator()),
        (
            "features_count",
            CountUniqueItemsFeatureCreator("features", "features_count"),
        ),
        (
            "accessories_count",
            CountUniqueItemsFeatureCreator("accessories", "accessories_count"),
        ),
    ]
)

In [6]:
# Fit every pipeline step on the training data and transform it in the same pass.
cleaned_train = data_pipeline.fit_transform(train)
# Preview the first rows of the cleaned frame.
display(cleaned_train.head())
# Cell output: (rows, columns) before and after cleaning — a quick check of the
# row count and of how many columns the pipeline added.
train.shape, cleaned_train.shape

INFO:root:HierarchicalGroupImputer - total 41 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 33 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 12 null values left for omv after imputing with group ['make', 'model']
INFO:root:DeregValueTransformer - replacing 233 null values with 0
INFO:root:DepreciationTransformer - replacing 433 null values with 0
INFO:root:HierarchicalGroupImputer - total 3721 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 570 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 17 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 2152 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 1557 null values left for road_tax after imputing with group ['engine_cap', 'opc_scheme', 'vehicle_age', 'fuel_ty

Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_year,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365241,rover 216 cabriolet (coe till 04/2029),rover,216,new 10 years coe! affordable rare beautiful ca...,1993.0,,1993-06-22,sports car,coe car,auto,...,2019,92.217765,28.0,0,0.0,20115.0,20115.0,1,4,0
397461,audi tt roadster 1.8a tfsi s-tronic (new 10-yr...,audi,tt,138000,2016.0,,2013-01-09,sports car,"coe car, low mileage car",auto,...,2013,15.850017,8.0,0,0.0,8005.859045,8005.859045,1,2,6
488792,mercedes-benz sl-class sl280 (coe till 09/2025),mercedes-benz,sl280,34,2016.0,,1995-09-07,sports car,coe car,auto,...,1995,0.0,26.0,0,0.0,,,1,1,4
540570,aston martin db9 cabriolet 6.0a (coe till 04/2...,aston martin,db9,new number plate! low mileage! very well maint...,2009.0,01-feb-2009,2011-01-27,sports car,"imported used vehicle, coe car, low mileage car",auto,...,2019,92.218802,12.0,0,0.0,27213.0,27213.0,1,4,5
592916,isuzu cyz52r (coe till 07/2027),isuzu,cyz52r,28 tons lorry with palfinger pk18500 crane.,2007.0,,2007-11-12,truck,coe car,manual,...,2017,71.211036,14.0,0,0.0,20218.0,20218.0,-1,1,0


((16780, 32), (16780, 43))

In [7]:
# Apply the pipeline that was fitted on `train` to the test set. `transform`
# (not `fit_transform`) is used so that nothing is refit on the test data.
cleaned_test = data_pipeline.transform(test)
# Preview the first rows of the cleaned frame.
display(cleaned_test.head())
# Cell output: (rows, columns) before and after cleaning.
test.shape, cleaned_test.shape

INFO:root:HierarchicalGroupImputer - total 12 null values to impute for omv
INFO:root:HierarchicalGroupImputer - 10 null values left for omv after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 2 null values left for omv after imputing with group ['make', 'model']
INFO:root:DeregValueTransformer - replacing 67 null values with 0
INFO:root:DepreciationTransformer - replacing 132 null values with 0
INFO:root:HierarchicalGroupImputer - total 1144 null values to impute for mileage
INFO:root:HierarchicalGroupImputer - 154 null values left for mileage after imputing with group ['make', 'model', 'vehicle_age']
INFO:root:HierarchicalGroupImputer - 9 null values left for mileage after imputing with group ['vehicle_age']
INFO:root:HierarchicalGroupImputer - total 634 null values to impute for road_tax
INFO:root:HierarchicalGroupImputer - 445 null values left for road_tax after imputing with group ['engine_cap', 'opc_scheme', 'vehicle_age', 'fuel_type']


Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,coe_start_year,coe_expiry_months,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,lifespan_restriction,features_count,accessories_count
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
728100,toyota hiace 3.0m (coe till 06/2022),toyota,hiace,very well maintained by previous owner. most i...,2007.0,,2007-06-06,van,coe car,manual,...,2013,18.674628,14.0,0,0.0,2919.0,2919.0,-1,2,2
767227,bmw 5 series 528i se,bmw,528i,extended warranty by mbm wheelpower pte ltd un...,2013.0,,2014-06-19,luxury sedan,"parf car, consignment car",auto,...,2014,33.131686,8.0,1,37576.55,18582.458558,56159.008558,1,3,6
772738,mercedes-benz 250s (coe till 04/2029),mercedes-benz,250,beautiful classic w108 mercedes 250s. call now...,1967.0,,1967-05-26,luxury sedan,"coe car, rare & exotic, vintage cars",auto,...,1967,0.0,54.0,0,0.0,,,1,0,0
786155,mercedes-benz 200 (new 10-yr coe),mercedes-benz,200,6,1984.0,,1984-05-02,luxury sedan,"coe car, premium ad car",auto,...,1984,0.0,37.0,0,0.0,,,1,0,1
792071,mercedes-benz 200e (coe till 03/2029),mercedes-benz,200,fully restored. all done. excellence condition...,1991.0,,1991-04-30,luxury sedan,coe car,auto,...,2019,91.233872,30.0,0,0.0,25103.0,25103.0,1,0,0


((5000, 31), (5000, 42))

## Write to disk

In [9]:
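# Writing is disabled in this notebook; uncomment the lines below to save the
# cleaned frames under ../data/processed/ (existing files would be overwritten).
# cleaned_train.to_csv("../data/processed/train.csv")
# cleaned_test.to_csv("../data/processed/test.csv")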