In [13]:
from datetime import datetime
import logging
import numpy as np
import pandas as pd
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import scipy

In [14]:
# Load the raw training data, index it by listing id and discard duplicate rows.
train = (
    pd.read_csv("../data/raw/train.csv")
    .set_index("listing_id")
    .drop_duplicates()
)

# Every column except the target ("price") is a feature.
x_cols = train.columns.tolist()
x_cols.remove("price")
train_features = train[x_cols]
train_label = train["price"].to_frame()

# Default logging level for the notebook; switch to logging.INFO to see
# the related output. force=True replaces any handlers configured earlier.
logging.basicConfig(level=logging.WARNING, force=True)

In [16]:
# (label, column) pairs; the labels are kept verbatim so the printed report
# is unchanged.
missing_report = [
    ('Missing type_of_vehicle: ', 'type_of_vehicle'),
    ('Missing category : ', 'category'),
    ('Missing transmission  : ', 'transmission'),
    ('Missing curb_weight: ', 'curb_weight'),
    ('Missing power: ', 'power'),
    ('Missing engine_cap: ', 'engine_cap'),
    ('Missing fuel_type: ', 'fuel_type'),
]
for label, column in missing_report:
    # Count the rows of `train` whose value in `column` is null.
    print(label, len(train[train[column].isnull()]))
print()

Missing type_of_vehicle:  0
Missing category :  0
Missing transmission  :  0
Missing curb_weight:  579
Missing power:  2335
Missing engine_cap:  53
Missing fuel_type:  13292



In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMissingValueImputer(BaseEstimator, TransformerMixin):
    """Impute missing values of one column from a per-group statistic.

    ``fit`` learns, for every combination of ``group_cols`` values, the
    ``agg`` statistic of ``col`` over the rows where ``col`` is present.
    ``transform`` fills each missing ``col`` value with the statistic of the
    row's own group, and falls back to a column-wide default when that group
    was not seen during ``fit``.

    Parameters
    ----------
    col : str
        Column whose missing values are filled.
    group_cols : str or list of str
        Column(s) that define the groups.
    agg : {'first', 'mean', 'median'}, default 'mean'
        Statistic computed per group.
    """

    _SUPPORTED_AGGS = ('first', 'mean', 'median')

    def __init__(self, col, group_cols, agg='mean'):
        self.group_mapping = {}
        self.group_cols = group_cols
        self.agg = agg
        self.col = col

    def fit(self, df, y=None):
        """Learn the group -> fill value mapping from ``df``.

        ``y`` is accepted (and ignored) so the transformer can be used in a
        pipeline that is fitted with a target.

        Raises
        ------
        ValueError
            If ``agg`` is not one of the supported statistics.
        """
        if self.agg not in self._SUPPORTED_AGGS:
            raise ValueError("Unknown Agg type")
        observed = df[df[self.col].notnull()]
        # Select the column *before* aggregating so that unrelated
        # (possibly non-numeric) columns are never aggregated.
        grouped = observed.groupby(self.group_cols)[self.col]
        self.group_mapping = getattr(grouped, self.agg)().to_dict()
        return self

    @staticmethod
    def _group_key(row, group_cols):
        """Build the ``group_mapping`` key for one row.

        A single grouping column yields scalar keys, several columns yield
        tuples - the same shapes ``groupby(...).to_dict()`` produces.
        """
        if len(group_cols) == 1:
            return row[group_cols[0]]
        return tuple(row[name] for name in group_cols)

    def transform(self, input_df):
        """Return a copy of ``input_df`` with missing ``col`` values filled.

        The frame is returned unchanged (as a copy) when ``col`` is ``None``
        or not present in ``input_df``.
        """
        col = self.col
        df = input_df.copy()
        if col is None or col not in df.columns:
            return df

        missing = df[col].isnull().to_numpy()
        if not missing.any():
            return df

        # Fallback for groups that were not seen during fit.
        if df[col].dtype == np.object_:
            unknown_value = 'unknown'
        elif self.agg == 'mean':
            unknown_value = df[col].mean()
        else:
            unknown_value = df[col].median()

        if isinstance(self.group_cols, str):
            group_cols = [self.group_cols]
        else:
            group_cols = list(self.group_cols)

        # Look up each incomplete row by the *values* of its grouping columns
        # (not by the column names), visiting only the rows that need a fill.
        filled = df.loc[missing, group_cols].apply(
            lambda row: self.group_mapping.get(
                self._group_key(row, group_cols), unknown_value
            ),
            axis=1,
        )
        # Assign positionally through the boolean mask to avoid index alignment.
        df.loc[missing, col] = filled.to_numpy()
        return df

In [9]:
def get_make_from_title(make_list, title):
    """Return the shortest leading run of words of ``title`` found in ``make_list``.

    ``title`` is split on single spaces and its prefixes are tried from the
    shortest (first word) to the longest (whole title). The first prefix that
    is a member of ``make_list`` is returned; ``"unknown"`` is returned when
    no prefix matches.
    """
    words = title.split(" ")
    for end in range(1, len(words) + 1):
        candidate = " ".join(words[:end])
        if candidate in make_list:
            return candidate
    return "unknown"


class CommonPreProcessing(BaseEstimator, TransformerMixin):
    """Shared clean-up applied before the column-specific imputers.

    ``transform`` works on a copy of its input and

    * fills ``reg_date`` from ``original_reg_date`` and parses it as datetime,
    * derives ``reg_date_year`` and ``reg_date_month`` (time elapsed between
      ``reg_date`` and the reference date, expressed in months),
    * fills missing ``no_of_owners`` with 1,
    * lower-cases ``title``,
    * fills missing ``make`` from the leading words of ``title`` using the
      makes seen during ``fit``,
    * adds ``make_model`` as ``"<make>-<model>"``.

    Parameters
    ----------
    reference_date : datetime-like, optional
        Date the registration age is measured against. Defaults to the
        current time when ``transform`` runs; pass a fixed value to make the
        result reproducible.
    """

    # Mean Gregorian month length in days; used instead of the ambiguous
    # numpy month unit, which recent pandas versions refuse to divide by.
    _DAYS_PER_MONTH = 365.2425 / 12

    def __init__(self, reference_date=None):
        self.make_list = []
        self.reference_date = reference_date

    def fit(self, df, y=None):
        """Remember the known (non-null) makes present in ``df``."""
        self.make_list = df["make"].dropna().unique()
        return self

    def transform(self, df):
        """Return a cleaned copy of ``df``; the input frame is left untouched."""
        df = df.copy()

        # Plain column assignment (not ``.loc[:, col] =``) so the column
        # really takes the datetime dtype required by the ``.dt`` accessor.
        df["reg_date"] = pd.to_datetime(
            df["reg_date"].fillna(df["original_reg_date"])
        )
        df["reg_date_year"] = df["reg_date"].dt.year

        if self.reference_date is None:
            reference = datetime.now()
        else:
            reference = pd.Timestamp(self.reference_date)
        df["reg_date_month"] = (reference - df["reg_date"]) / pd.Timedelta(
            days=self._DAYS_PER_MONTH
        )

        df["no_of_owners"] = df["no_of_owners"].fillna(1)
        df["title"] = df["title"].str.lower()

        # Only rows without a make need the title lookup; a missing title
        # cannot be parsed and maps to "unknown".
        missing_make = df["make"].isnull().to_numpy()
        if missing_make.any():
            df.loc[missing_make, "make"] = [
                get_make_from_title(self.make_list, title)
                if isinstance(title, str)
                else "unknown"
                for title in df.loc[missing_make, "title"]
            ]

        # Concatenate the two columns (the model column itself, not the
        # literal text "df.model").
        df["make_model"] = df["make"] + "-" + df["model"]
        return df
    
class CarSpecificationsTransformer(BaseEstimator, TransformerMixin):
    """Impute one column from progressively coarser groups.

    ``fit`` builds one mapping per grouping level: first over all of
    ``group_cols``, then with the last column dropped, and so on down to the
    first column alone. ``transform`` walks the levels in the same order and
    fills each still-missing value from the most specific group that was seen
    during ``fit``; values whose groups are unknown at every level stay
    missing.

    Parameters
    ----------
    col : str
        Column whose missing values are filled.
    group_cols : list of str
        Grouping columns, ordered from the one kept longest to the one
        dropped first.
    agg : {'mean', 'mode'}, default 'mean'
        Statistic computed per group.
    """

    _SUPPORTED_AGGS = ('mean', 'mode')

    def __init__(self, col, group_cols, agg='mean'):
        self.group_mapping_list = []
        self.group_cols = group_cols
        self.col = col
        self.agg = agg

    def get_key(self, row, group_columns):
        """Return the mapping key of ``row`` for ``group_columns``.

        Several columns give a tuple, a single column gives its scalar value
        (the key shapes produced by ``groupby(...).to_dict()``).
        """
        if len(group_columns) > 1:
            return tuple(row[name] for name in group_columns)
        return row[group_columns[0]]

    def fit(self, df, y=None):
        """Learn one group -> value mapping per grouping level.

        ``y`` is accepted and ignored for pipeline compatibility.

        Raises
        ------
        ValueError
            If ``agg`` is not 'mean' or 'mode'.
        """
        if self.agg not in self._SUPPORTED_AGGS:
            raise ValueError("Unknown Agg type")

        col = self.col
        group_cols = list(self.group_cols)
        observed = df[df[col].notnull()]

        # Start from scratch so that refitting does not stack stale mappings
        # in front of the new ones.
        self.group_mapping_list = []
        for depth in range(len(group_cols), 0, -1):
            # Aggregate only the target column, never the whole frame.
            grouped = observed.groupby(group_cols[:depth])[col]
            if self.agg == 'mean':
                per_group = grouped.mean()
            else:
                # Series.mode() is sorted, so ties resolve to the smallest
                # value; it also works for string columns.
                per_group = grouped.agg(lambda values: values.mode().iloc[0])
            self.group_mapping_list.append(per_group.to_dict())
        return self

    def transform(self, input_df):
        """Return a copy of ``input_df`` with missing ``col`` values filled.

        The frame is returned unchanged (as a copy) when ``col`` is ``None``
        or not present in ``input_df``.
        """
        df = input_df.copy()
        col = self.col
        if col is None or col not in df.columns:
            return df

        group_cols = list(self.group_cols)
        for level, group_mapping in enumerate(self.group_mapping_list):
            keys = group_cols[:len(group_cols) - level]
            if not keys:
                break
            missing = df[col].isnull().to_numpy()
            if not missing.any():
                break
            # Only rows that are still incomplete are looked up; groups that
            # were not seen during fit yield NaN and are retried at the next,
            # coarser level.
            filled = df.loc[missing, keys].apply(
                lambda row: group_mapping.get(self.get_key(row, keys), np.nan),
                axis=1,
            )
            # Assign positionally through the boolean mask to avoid index
            # alignment.
            df.loc[missing, col] = filled.to_numpy()
        return df
    
class CarSpecsMissingWithTypeOfVehicle(BaseEstimator, TransformerMixin):
    """Impute several columns from a per-``type_of_vehicle`` statistic.

    For every column in ``cols``, ``fit`` learns the ``agg`` statistic of the
    non-missing values per ``type_of_vehicle``; ``transform`` fills missing
    values with the statistic of the row's vehicle type. Values whose vehicle
    type was not seen during ``fit`` stay missing.

    Parameters
    ----------
    cols : list of str
        Columns whose missing values are filled.
    agg : {'mean', 'mode'}, default 'mean'
        Statistic computed per vehicle type.
    """

    _SUPPORTED_AGGS = ('mean', 'mode')

    def __init__(self, cols, agg='mean'):
        self.group_mapping = {}
        self.group_mapping_list = []
        self.cols = cols
        self.agg = agg

    def fit(self, df, y=None):
        """Learn one vehicle type -> value mapping per column in ``cols``.

        ``y`` is accepted and ignored for pipeline compatibility.

        Raises
        ------
        ValueError
            If ``agg`` is not 'mean' or 'mode'.
        """
        if self.agg not in self._SUPPORTED_AGGS:
            raise ValueError("Unknown Agg type")

        # Start from scratch so that refitting does not leave stale mappings
        # paired with the wrong columns.
        self.group_mapping_list = []
        for col in self.cols:
            # Aggregate only the target column, never the whole frame.
            grouped = df[df[col].notnull()].groupby('type_of_vehicle')[col]
            if self.agg == 'mean':
                per_type = grouped.mean()
            else:
                # Series.mode() is sorted, so ties resolve to the smallest
                # value; it also works for string columns.
                per_type = grouped.agg(lambda values: values.mode().iloc[0])
            self.group_mapping_list.append(per_type.to_dict())
        return self

    def transform(self, input_df):
        """Return a copy of ``input_df`` with the configured columns filled.

        Columns that are ``None`` or absent from ``input_df`` are skipped.
        """
        df = input_df.copy()
        # Mappings are stored in the same order as ``cols``.
        for col, group_mapping in zip(self.cols, self.group_mapping_list):
            if col is None or col not in df.columns:
                continue
            missing = df[col].isnull().to_numpy()
            if not missing.any():
                continue
            # Vectorised lookup; unknown vehicle types map to NaN.
            filled = df.loc[missing, 'type_of_vehicle'].map(group_mapping)
            # Assign positionally through the boolean mask to avoid index
            # alignment.
            df.loc[missing, col] = filled.to_numpy()
        return df

In [10]:
# Grouping hierarchy shared by the numeric car-specification imputers,
# ordered from the column kept longest to the one dropped first.
spec_group_cols = ['make', 'model', 'type_of_vehicle', 'manufactured']

# Steps run in order: common clean-up, 'manufactured' imputation, per-group
# imputation of each specification column, then a per-vehicle-type fallback
# for whatever is still missing.
imputation_steps = [
    ("common_ops", CommonPreProcessing()),
    (
        'imp_manufactured',
        GroupMissingValueImputer(
            'manufactured', ['make', 'model', 'type_of_vehicle'], 'first'
        ),
    ),
    ("curb_weight", CarSpecificationsTransformer('curb_weight', list(spec_group_cols))),
    ("power", CarSpecificationsTransformer('power', list(spec_group_cols))),
    ("fuel_type", CarSpecificationsTransformer('fuel_type', ['make', 'model'], 'mode')),
    ("engine_cap", CarSpecificationsTransformer('engine_cap', list(spec_group_cols))),
    ("fuel_type_missing", CarSpecsMissingWithTypeOfVehicle(['fuel_type'], 'mode')),
    (
        "car_spec_missing",
        CarSpecsMissingWithTypeOfVehicle(['curb_weight', 'power', 'engine_cap'], 'mean'),
    ),
]

pipeline_for_columns_9_to_15 = Pipeline(steps=imputation_steps)

In [11]:
# Fit every pipeline step on the training frame and keep the transformed result.
cleaned_df = pipeline_for_columns_9_to_15.fit_transform(train)
# Optional peek at the first 15 columns of the result:
# cleaned_df.head().iloc[:,0:15]


In [12]:
# After the pipeline, report how many rows of each imputed column are still null.
for column in ('curb_weight', 'power', 'engine_cap', 'fuel_type'):
    print(f'Missing {column}: ', len(cleaned_df[cleaned_df[column].isnull()]))
print()

Missing curb_weight:  0
Missing power:  0
Missing engine_cap:  0
Missing fuel_type:  0

