In [1]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import scipy

In [2]:
# Read train data and set index
train = pd.read_csv("../data/raw/train.csv")
train.set_index("listing_id", inplace=True)
train.drop_duplicates(inplace=True)

# Separate x and y columns
x_cols = list(train.columns)
x_cols.remove("price")
train_features = train[x_cols]
train_label = pd.DataFrame(train.price)

# Set default logging level
# Change to logging.INFO to see related output
logging.basicConfig(level=logging.WARN, force=True)

In [3]:
print('Missing type_of_vehicle: ',len(train[train.type_of_vehicle.isnull()]))
print('Missing category : ',len(train[train.category .isnull()]))
print('Missing transmission  : ',len(train[train.transmission  .isnull()]))
print('Missing curb_weight: ', len(train[train.curb_weight.isnull()]))
print('Missing power: ', len(train[train.power.isnull()]))
print('Missing engine_cap: ',len(train[train.engine_cap.isnull()]))
print('Missing fuel_type: ',len(train[train.fuel_type.isnull()]))
print()

Missing type_of_vehicle:  0
Missing category :  0
Missing transmission  :  0
Missing curb_weight:  579
Missing power:  2335
Missing engine_cap:  53
Missing fuel_type:  13292



In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMissingValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self, col, group_cols, agg='mean'):
        self.group_mapping = {}
        self.group_cols = group_cols
        self.agg = agg
        self.col = col
    
    def fit(self, df):
        col = self.col
        if self.agg=='first':
            self.group_mapping = df[~df[col].isnull()].groupby(self.group_cols).first()[col].to_dict()
        elif self.agg=='mean':
            self.group_mapping = df[~df[col].isnull()].groupby(self.group_cols).mean()[col].to_dict()
        elif self.agg=='median':
            self.group_mapping = df[~df[col].isnull()].groupby(self.group_cols).median()[col].to_dict()
        else:
            raise Exception("Unknown Agg type")
        return self
        
    def transform(self, input_df):
        col = self.col
        df = input_df.copy()
        if col is not None and col in df.columns:
            key = tuple(self.group_cols) 
            if df[col].dtype == np.object_:
                unknown_value = 'unknown'
            else:
                if self.agg=='mean':
                    unknown_value=df[col].mean()
                else:
                    unknown_value=df[col].median()
            result = df.apply(lambda row: self.group_mapping.get(key, unknown_value) if pd.isnull(row[col]) else row[col],
            axis=1)
            df.loc[:, col] = result
            return df
        return df

In [5]:
def get_make_from_title(make_list, title):
    title = title.split(" ")
    for i in range(len(title)):
        if " ".join(title[0 : i + 1]) in make_list:
            return " ".join(title[0 : i + 1])
    return "unknown"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.make_list = []
        pass

    def fit(self, df):
        self.make_list = df.make.unique()
        return self

    def transform(self, df):
        df.loc[:, "reg_date"] = np.where(
            df["reg_date"].isnull(), df["original_reg_date"], df["reg_date"]
        )
        df.loc[:, "reg_date"] = pd.to_datetime(df.reg_date)
        df.loc[:, "reg_date_year"] = df.loc[:, "reg_date"].dt.year
        df.loc[:, "reg_date_month"] = (
            datetime.now() - df.loc[:, "reg_date"]
        ) / np.timedelta64(1, "M")
        df.loc[:, "no_of_owners"] = df["no_of_owners"].fillna(1)
        df.loc[:, "title"] = df["title"].str.lower()
        df.loc[:, "make"] = df.apply(
            lambda row: get_make_from_title(self.make_list, row["title"])
            if pd.isnull(row["make"])
            else row["make"],
            axis=1,
        )
        df.loc[:, "make_model"] = df.make + "-" + "df.model"
        return df
    
class CarSpecificationsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, col, group_cols, agg='mean'):
        self.group_mapping_list = []
        self.group_cols = group_cols
        self.col = col
        self.agg = agg 
        
    def get_key(self, row, group_columns):
        lst = []
        if len(group_columns) > 1:
            for c in group_columns:
                lst.append(row[c])
            return tuple(lst)
        else:
            return row[group_columns[0]]
    
    def fit(self, df):
        group_cols = self.group_cols.copy()
        col = self.col
        for i in range(len(group_cols)):
            if self.agg == 'mean':
                group_mapping = df[~df[col].isnull()].groupby(group_cols).mean()[col].to_dict()
            elif self.agg == 'mode':
                group_mapping = df[~df[col].isnull()].groupby(group_cols).agg(lambda x: scipy.stats.mode(x)[0])[col].to_dict()
            self.group_mapping_list.append(group_mapping)
            group_cols.pop()
        return self

    def transform(self, input_df):
        group_cols = self.group_cols.copy()
        df = input_df.copy()
        for group_mapping in self.group_mapping_list:
            col = self.col
            
            if col is not None and col in df.columns:
                result = df.apply(lambda row: group_mapping.get(self.get_key(row, group_cols)) if pd.isnull(row[col]) else row[col],
                axis=1)
                df.loc[:, col] = result
            group_cols.pop()
        return df
    
class CarSpecsMissingWithTypeOfVehicle(BaseEstimator, TransformerMixin):
    def __init__(self, cols, agg='mean'):
        self.group_mapping = {}
        self.group_mapping_list = []
        self.cols = cols
        self.agg = agg 
        
    def fit(self, df):
        for col in self.cols:
            if self.agg == 'mean':
                group_mapping = df[~df[col].isnull()].groupby('type_of_vehicle').mean()[col].to_dict()
            elif self.agg == 'mode':
                group_mapping = df[~df[col].isnull()].groupby('type_of_vehicle').agg(lambda x: scipy.stats.mode(x)[0])[col].to_dict()
            self.group_mapping_list.append(group_mapping)
        return self
        
    def transform(self, input_df):
        cols = self.cols
        df = input_df.copy()
        group_mapping_list = self.group_mapping_list
        
        for i in range(len(group_mapping_list)):
            col = cols[i]
            if col is not None and col in df.columns:
                result = df.apply(lambda row: group_mapping_list[i].get(row['type_of_vehicle']) if pd.isnull(row[col]) else row[col],
                axis=1)
                df.loc[:, col] = result
        return df

In [29]:
class ColumnValuesToCategory(BaseEstimator, TransformerMixin):
    def __init__(self, col, new_col, bins, names):
        self.bins = bins
        self.names = names
        self.col = col
        self.new_col = new_col
        
    def fit(self, df):
        return self
    
    def transform(self, input_df):
        df = input_df.copy()
        df[self.new_col] = pd.cut(df[self.col], bins, labels=names)
        return df

In [30]:
pipeline_for_columns_9_to_15 = Pipeline(
    steps=[
        ("common_ops", CommonPreProcessing()),
        ('imp_manufactured', GroupMissingValueImputer('manufactured', ['make', 'model','type_of_vehicle'], 'first')),
        ("curb_weight", CarSpecificationsTransformer('curb_weight', ['make', 'model', 'type_of_vehicle', 'manufactured'])),
        ("power", CarSpecificationsTransformer('power', ['make', 'model', 'type_of_vehicle', 'manufactured'])),
        ("fuel_type", CarSpecificationsTransformer('fuel_type', ['make', 'model'], 'mode')),
        ("engine_cap", CarSpecificationsTransformer('engine_cap', ['make', 'model', 'type_of_vehicle', 'manufactured'])),
        ("fuel_type_missing", CarSpecsMissingWithTypeOfVehicle(['fuel_type'], 'mode')),
        ("car_spec_missing", CarSpecsMissingWithTypeOfVehicle(['curb_weight', 'power', 'engine_cap'], 'mean')),
        ("convert_value_to_category", ColumnValuesToCategory('engine_cap', 'engine_cap_range',
                                                             [0, 600, 1000, 1600, 3000, np.inf], 
                                                             ['EC<=600 cc', '600 cc < EC <= 1000 cc ', '1000 cc < EC <= 1600 cc',
                                                              '1600 cc < EC <= 3000 cc', 'EC > 3000 cc']))
    ]
)

In [31]:
cleaned_df = pipeline_for_columns_9_to_15.fit_transform(train)
# cleaned_df.head().iloc[:,0:15]


In [33]:
cleaned_df.loc[:, ['engine_cap', 'engine_cap_range']]

Unnamed: 0_level_0,engine_cap,engine_cap_range
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1030324,1997.0,1600 cc < EC <= 3000 cc
1021510,2982.0,1600 cc < EC <= 3000 cc
1026909,1595.0,1000 cc < EC <= 1600 cc
1019371,1497.0,1000 cc < EC <= 1600 cc
1031014,1597.0,1000 cc < EC <= 1600 cc
...,...,...
1030181,1969.0,1600 cc < EC <= 3000 cc
1027041,1580.0,1000 cc < EC <= 1600 cc
1021099,1598.0,1000 cc < EC <= 1600 cc
1019473,1317.0,1000 cc < EC <= 1600 cc


In [8]:
print('Missing curb_weight: ', len(cleaned_df[cleaned_df.curb_weight.isnull()]))
print('Missing power: ', len(cleaned_df[cleaned_df.power.isnull()]))
print('Missing engine_cap: ',len(cleaned_df[cleaned_df.engine_cap.isnull()]))
print('Missing fuel_type: ',len(cleaned_df[cleaned_df.fuel_type.isnull()]))
print()

Missing curb_weight:  0
Missing power:  0
Missing engine_cap:  0
Missing fuel_type:  0



In [26]:
class ColumnValuesToCategory(BaseEstimator, TransformerMixin):
    def __init__(self, col, new_col, bins, names):
        self.bins = bins
        self.names = names
        self.col = col
        self.new_col = new_col
        
    def fit(self, df):
        return self
    
    def transform(self, input_df):
        df = input_df.copy()
        df[self.new_col] = pd.cut(cleaned_df[self.col], bins, labels=names)
        return df

In [9]:
bins = [0, 600, 1000, 1600, 3000, np.inf]
names = ['EC<=600 cc', '600 cc < EC <= 1000 cc ', '1000 cc < EC <= 1600 cc', '1600 cc < EC <= 3000 cc', 'EC > 3000 cc']

cleaned_df['engine_cap_range'] = pd.cut(cleaned_df['engine_cap'], bins, labels=names)


In [25]:
cleaned_df[cleaned_df.engine_cap == 1600].loc[:, ['engine_cap_range', 'engine_cap']]
cleaned_df

Unnamed: 0_level_0,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,...,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_year,reg_date_month,make_model,engine_cap_range
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1030324,bmw 3 series 320i gran turismo m-sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,2013-12-09,luxury sedan,"parf car, premium ad car, low mileage car",auto,...,,uncategorized,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",,71300.0,2013,94.444050,bmw-df.model,1600 cc < EC <= 3000 cc
1021510,toyota hiace 3.0m,toyota,hiace,high loan available! low mileage unit. wear an...,2014.0,,2015-01-26,van,premium ad car,manual,...,25-jan-2035,uncategorized,low mileage unit. well maintained vehicle. vie...,factory radio setting. front recording camera....,,43800.0,2015,80.874983,toyota-df.model,1600 cc < EC <= 3000 cc
1026909,mercedes-benz cla-class cla180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,2016-07-25,luxury sedan,"parf car, premium ad car",auto,...,,uncategorized,responsive and fuel efficient 1.6l inline 4 cy...,dual electric/memory seats. factory fitted aud...,,95500.0,2016,62.936216,mercedes-benz-df.model,1000 cc < EC <= 1600 cc
1019371,mercedes-benz e-class e180 avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,2020-11-17,luxury sedan,"parf car, almost new car, consignment car",auto,...,,uncategorized,"1.5l inline-4 twin scroll turbocharged engine,...",64 colour ambient lighting. active parking ass...,,197900.0,2020,11.156919,mercedes-benz-df.model,1000 cc < EC <= 1600 cc
1031014,honda civic 1.6a vti,honda,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,2019-09-20,mid-sized sedan,parf car,auto,...,,uncategorized,"1.6l i-vtec engine, 123 bhp, earth dreams cvt ...","s/rims, premium leather seats, factory touch s...",,103200.0,2019,25.087390,honda-df.model,1000 cc < EC <= 1600 cc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030181,volvo xc90 t5 momentum,volvo,xc90,"just arrived, serviced and maintained by wearn...",2015.0,,2016-01-22,suv,"parf car, premium ad car",auto,...,,uncategorized,"250bhp, 350nm, 4 cylinder inline 16 valve turb...","city safety with pedestrian/cyclist detection,...",,144400.0,2016,69.014370,volvo-df.model,1600 cc < EC <= 3000 cc
1027041,hyundai ioniq hybrid 1.6a dct sunroof,hyundai,ioniq,one owner unit! 10 years hybrid battery till 0...,2017.0,,2017-05-11,hatchback,"parf car, premium ad car, hybrid cars",auto,...,,uncategorized,"1.6l hybrid dohc inline-4, 16v dual-cvvt, 138b...","17"" rims, leather upholstery, reverse camera/s...",,70200.0,2017,53.408300,hyundai-df.model,1000 cc < EC <= 1600 cc
1021099,mini cooper cabriolet 1.6a (coe till 08/2030),mini,cooper,all wear and tear has been done up with receip...,2010.0,,2010-08-27,sports car,"coe car, premium ad car",auto,...,,uncategorized,1.6l responsive turbocharged engine. 6 speed a...,multifunction steering. auto headlights. rever...,,71300.0,2010,133.869911,mini-df.model,1000 cc < EC <= 1600 cc
1019473,honda fit 1.3a g f-package,honda,fit,"1 owner unit, 100% loan available! well-mainta...",2019.0,,2020-07-30,hatchback,"parf car, premium ad car",auto,...,,uncategorized,"4 cylinders dohc i-vtec engine, 97 bhp, cvt au...","sports rims, leather seats, reverse sensors, r...",,81200.0,2020,14.770956,honda-df.model,1000 cc < EC <= 1600 cc


In [None]:
EC<=600 cc
600 cc < EC <= 1000 
1000 cc < EC <= 1600 
1600 cc < EC <= 3000 
EC > 3000 cc