# Import modules

In [389]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, RobustScaler
from ImportanceImputer import ImportanceImputer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score

# Global plot styling. The figure size is a matplotlib rc parameter, so it is
# passed through ``rc``. Supplying the dict as the first positional argument
# makes seaborn treat it as the plotting *context*, which skips the context
# defaults and leaves ``font_scale`` without effect.
sns.set(
    style="ticks",
    palette=sns.color_palette("Set2"),
    color_codes=True,
    font_scale=0.8,
    rc={"figure.figsize": (17, 7)},
)
%config InlineBackend.figure_format = 'retina'
import warnings
# Suppress all warnings for the rest of the session (no category or module
# filter is given, so every warning is ignored).
warnings.filterwarnings('ignore')

# Load data

In [390]:
# Load the dataset from 'adverts.csv' (path relative to the working
# directory) into the DataFrame `auto`, which the cleaning cells below modify.
auto = pd.read_csv('adverts.csv')

In [391]:
# Fixed random seed, passed as `random_state` to the train/test split so the
# split is reproducible.
seed = 42

# Identify erroneous data

### Price

In [392]:
# Keep every row except those priced at 9999999, which is treated as an
# erroneous value.
auto = auto[auto["price"].ne(9999999)]

### Mileage

In [393]:
# A mileage of zero (or below) is implausible for a used car registered before
# 2018, so mark it as missing instead of keeping it as a real reading.
# The comparison is `<= 0`: a strict `< 0` never matches the zero-mileage rows
# this clean-up is meant to catch.
auto.loc[
    (auto["mileage"] <= 0)
    & (auto["vehicle_condition"] == "USED")
    & (auto["year_of_registration"] < 2018),
    "mileage",
] = np.nan

### Year of reg

In [394]:
# A vehicle listed as NEW with no registration year is assigned 2020.
auto.loc[
    auto["vehicle_condition"].eq("NEW") & auto["year_of_registration"].isna(),
    "year_of_registration",
] = 2020

In [395]:
# Blank out (set to NaN) registration years before 1950 unless the make is
# Austin or Morris. The "~" operator inverts the boolean mask.
auto.loc[
    auto["year_of_registration"].lt(1950)
    & ~auto["standard_make"].isin(["Austin", "Morris"]),
    "year_of_registration",
] = np.nan

### Reg code

In [396]:
class RegCodeToYear(BaseEstimator, TransformerMixin):
    """Translate a registration code into one or two candidate years.

    Each value of ``column`` is mapped into ``target_column`` as follows:

    * numeric codes 0-20 and 50-71 become ``2000 + code % 50`` (so ``20`` and
      ``70`` both give 2020); any other number gives ``NaN``;
    * ``"V"`` and ``"W"`` become ``(1979, 1999)`` and ``(1980, 2000)``;
    * any other single letter from ``ABCDEFGHJKLMNPRSTXY`` becomes the tuple
      ``(1983 + i, 1963 + i)``, where ``i`` is the letter's position in that
      string (presumably the two plate eras in which the letter was issued --
      TODO confirm);
    * everything else (missing values, unknown letters, empty or
      multi-character strings) becomes ``NaN``.

    Parameters
    ----------
    column : str
        Name of the column holding the registration code.
    target_column : str
        Name of the column that receives the derived year(s).
    """

    # Letters with a regular (1983 + i, 1963 + i) mapping; "V" and "W" are
    # handled explicitly in ``_reg_to_year``.
    _LETTERS = "ABCDEFGHJKLMNPRSTXY"

    def __init__(self, column="reg_code", target_column="reg_code_year"):
        self.column = column
        self.target_column = target_column

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    @classmethod
    def _reg_to_year(cls, reg_code):
        """Return the year, the two-year tuple, or NaN for one code."""
        try:
            number = int(reg_code)
        except (ValueError, TypeError, OverflowError):
            # Not a number (a letter code, NaN, None, ...): handled below.
            number = None

        if number is not None:
            # Only 0-20 and 50-71 are accepted; negatives used to slip
            # through and produce years such as 2049.
            if number < 0 or number > 71 or 20 < number < 50:
                return np.nan
            return 2000 + number % 50

        if not isinstance(reg_code, str):
            return np.nan
        if reg_code == "V":
            return (1979, 1999)
        if reg_code == "W":
            return (1980, 2000)
        # ``in`` on a string is a substring test, so the length must be
        # checked too: otherwise "" and "AB" would be read as the letter "A".
        if len(reg_code) != 1 or reg_code not in cls._LETTERS:
            return np.nan
        offset = cls._LETTERS.index(reg_code)
        return (1983 + offset, 1963 + offset)

    def transform(self, X):
        """Return a copy of ``X`` with ``target_column`` added.

        If ``column`` is absent, a message is printed and the copy is
        returned unchanged.
        """
        X = X.copy()

        if self.column not in X:
            print("Skipping reg code to year as no reg code column is present.")
            return X

        X[self.target_column] = X[self.column].map(self._reg_to_year)
        return X

In [397]:
class RegYearDisambiguator(BaseEstimator, TransformerMixin):
    """Collapse two-candidate ``reg_code_year`` tuples to a single year.

    A row is treated as ambiguous when its ``reg_code`` is a non-numeric
    string, its ``year_of_registration`` is missing and its ``reg_code_year``
    is a 2-tuple. For such rows the candidate closest to the mean known
    registration year of the same ``standard_model`` is kept. If that model
    has no known years, the mean over all rows is used instead; if no year is
    known at all, the later candidate is kept.

    Rows that are not ambiguous are left untouched, so applying the
    transformer twice is harmless.
    """

    # All of these columns are read below; the step is skipped unless every
    # one of them is present.
    _REQUIRED_COLUMNS = (
        "reg_code",
        "year_of_registration",
        "reg_code_year",
        "standard_model",
    )

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ambiguous ``reg_code_year`` resolved."""
        X = X.copy()

        if not all(name in X for name in self._REQUIRED_COLUMNS):
            print("Skipping reg year disambiguation as year is not present")
            return X

        year_known = X["year_of_registration"].notna()
        letter_code = X["reg_code"].map(
            lambda code: isinstance(code, str) and not code.isnumeric()
        ).astype(bool)
        two_candidates = X["reg_code_year"].map(
            lambda value: isinstance(value, tuple) and len(value) == 2
        ).astype(bool)
        ambiguous = (letter_code & ~year_known & two_candidates).to_numpy()
        if not ambiguous.any():
            return X

        # The reference means depend only on columns this step never changes,
        # so compute them once instead of re-scanning the frame per row.
        known = X.loc[year_known, ["standard_model", "year_of_registration"]]
        mean_by_model = (
            known.groupby("standard_model")["year_of_registration"].mean().to_dict()
        )
        overall_mean = known["year_of_registration"].mean()

        # Work positionally on plain lists so duplicate index labels cannot
        # cause the wrong row to be updated.
        candidates = X["reg_code_year"].tolist()
        models = X["standard_model"].tolist()
        for pos in np.flatnonzero(ambiguous):
            first, second = candidates[pos]
            reference = mean_by_model.get(models[pos], np.nan)
            if pd.isna(reference):
                reference = overall_mean
            if pd.isna(reference):
                # No known year anywhere: prefer the later candidate.
                candidates[pos] = max(first, second)
            elif abs(reference - first) < abs(reference - second):
                candidates[pos] = first
            else:
                candidates[pos] = second

        X["reg_code_year"] = pd.Series(candidates, index=X.index, dtype=object)
        return X

In [398]:
class FillYearWithReg(BaseEstimator, TransformerMixin):
    """Fill missing ``year_of_registration`` from ``reg_code_year``.

    Only rows whose ``year_of_registration`` is missing are filled, and only
    with scalar numeric ``reg_code_year`` values; anything else (for example
    an unresolved tuple of candidate years) is treated as missing. The helper
    columns ``reg_code`` and ``reg_code_year`` are dropped afterwards.
    """

    # The step is skipped unless every one of these columns is present.
    _REQUIRED_COLUMNS = ("reg_code", "year_of_registration", "reg_code_year")

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    @staticmethod
    def _as_year(value):
        """Return ``value`` as a float year, or NaN if it is not a number."""
        is_number = isinstance(value, (int, float, np.integer, np.floating))
        if is_number and not isinstance(value, bool):
            return float(value)
        return np.nan

    def transform(self, X):
        """Return a copy of ``X`` with years filled and helper columns removed."""
        X = X.copy()

        if not all(name in X for name in self._REQUIRED_COLUMNS):
            print("Skipping fill year with reg as columns are not present.")
            return X

        # Coerce to float first so the year column keeps a numeric dtype even
        # though ``reg_code_year`` is an object column.
        derived = X["reg_code_year"].map(self._as_year).astype(float)

        # The missing-value mask comes from X itself, never from a frame
        # outside the transformer, so this works on any input (train, test or
        # unseen data).
        year = X["year_of_registration"]
        X["year_of_registration"] = year.where(year.notna(), derived)

        return X.drop(columns=["reg_code", "reg_code_year"])

# Process data for model

In [399]:
class AutoEncodeBinary(BaseEstimator, TransformerMixin):
    """Encode every two-valued column as 0/1.

    The value-to-code mapping is learned in ``fit`` and reused in
    ``transform``, so training and test data always receive the same
    encoding. The two values are sorted before codes are assigned, which
    makes the result independent of row order. Missing values are not counted
    as a category and stay missing.

    As before, only columns whose *name* is a string are considered.

    Attributes
    ----------
    mappings_ : dict
        ``{column name: {value: 0 or 1}}`` for each column found to be binary
        during ``fit``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _learn_mappings(X):
        """Return the 0/1 mapping of every binary, string-named column of X."""
        mappings = {}
        for col in X:
            if not isinstance(col, str):
                continue
            values = X[col].dropna().unique().tolist()
            if len(values) != 2:
                continue
            try:
                ordered = sorted(values)
            except TypeError:
                # Values of different, non-comparable types: fall back to
                # their string form to keep the order deterministic.
                ordered = sorted(values, key=str)
            mappings[col] = {ordered[0]: 0, ordered[1]: 1}
        return mappings

    def fit(self, X, y=None):
        """Learn which columns are binary and how their values are coded."""
        self.mappings_ = self._learn_mappings(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the learned binary columns encoded.

        Values not seen during ``fit`` are left unchanged. If ``fit`` was
        never called, the mapping is derived from ``X`` itself so that direct
        ``transform`` calls keep working.
        """
        X = X.copy()

        mappings = getattr(self, "mappings_", None)
        if mappings is None:
            mappings = self._learn_mappings(X)

        for col, mapping in mappings.items():
            if col not in X:
                continue
            series = X[col]
            fully_known = (series.isna() | series.isin(list(mapping))).all()
            if fully_known:
                # Every value is mapped or missing: a dict map yields a
                # numeric column directly.
                X[col] = series.map(mapping)
            else:
                X[col] = series.map(lambda value, m=mapping: m.get(value, value))

        return X

In [400]:
class ParseReference(BaseEstimator, TransformerMixin):
    """Replace ``public_reference`` with an approximate day count.

    The first eight characters of the reference are read as year (4), month
    (2) and day (2) -- presumably the listing date, not confirmed -- and
    combined into ``round(year * 365.25 + month * 30.436875 + day)``. The
    column is then renamed to ``reference``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``public_reference`` converted and renamed."""
        X = X.copy()

        if "public_reference" not in X:
            print("Skipping ParseReference as no public reference found in frame.")
            return X

        # Slice the textual form of the reference into its date components.
        ref_text = X["public_reference"].astype(str)
        year = ref_text.str[:4].astype(int)
        month = ref_text.str[4:6].astype(int)
        day = ref_text.str[6:8].astype(int)

        approx_days = (year * 365.25 + month * 30.436875 + day).round().astype(int)
        X["public_reference"] = approx_days

        return X.rename(columns={"public_reference": "reference"})

In [401]:
class CombineMakeModel(BaseEstimator, TransformerMixin):
    """Merge the make and model columns into one ``"<make> <model>"`` column.

    The input is wrapped in a DataFrame with the given column names, which
    lets the step run on the bare array produced by a preceding imputer. The
    make and model columns are removed and the combined column is appended
    last.

    Parameters
    ----------
    columns : sequence of str, optional
        Column names, in order, of the incoming data. Defaults to
        ``DEFAULT_COLUMNS``.
    make_column, model_column : str
        Names of the two columns to combine.
    target_column : str
        Name of the combined column.
    """

    DEFAULT_COLUMNS = (
        "standard_colour",
        "standard_make",
        "standard_model",
        "vehicle_condition",
        "body_type",
        "crossover_car_and_van",
        "fuel_type",
    )

    def __init__(
        self,
        columns=None,
        make_column="standard_make",
        model_column="standard_model",
        target_column="make_model",
    ):
        self.columns = columns
        self.make_column = make_column
        self.model_column = model_column
        self.target_column = target_column

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a DataFrame with make and model replaced by their combination.

        Raises
        ------
        ValueError
            If ``make_column`` or ``model_column`` is not among ``columns``.
        """
        columns = list(self.DEFAULT_COLUMNS if self.columns is None else self.columns)
        for name in (self.make_column, self.model_column):
            if name not in columns:
                raise ValueError(f"Column {name!r} is not listed in columns: {columns}")

        X = X.copy()
        X = pd.DataFrame(X, columns=columns)
        X[self.target_column] = X[self.make_column] + ' ' + X[self.model_column]

        return X.drop(columns=[self.make_column, self.model_column])

In [402]:
# Registration-year clean-up, applied in order: derive candidate years from
# the reg code, reduce two-candidate results to one year, then use the result
# to fill missing registration years.
reg_year_pipeline = Pipeline(
    steps=[
        ("r2y", RegCodeToYear()),
        ("ryd", RegYearDisambiguator()),
        ("fyr", FillYearWithReg()),
    ]
)

In [403]:
# Frame-level preprocessing: registration-year clean-up, 0/1 encoding of
# binary columns, then conversion of the public reference.
general_process = Pipeline(
    steps=[
        ("ryp", reg_year_pipeline),
        ("aeb", AutoEncodeBinary()),
        ("par", ParseReference()),
    ]
)

# Split data

In [404]:
# Columns routed to the categorical and numeric branches of the preprocessor.
cat_cols = [
    "standard_colour",
    "standard_make",
    "standard_model",
    "vehicle_condition",
    "body_type",
    "crossover_car_and_van",
    "fuel_type",
]
num_cols = ["reference", "mileage", "year_of_registration"]

# Categorical branch: impute with the most frequent value, merge make and
# model, then one-hot encode (categories unseen during fit are ignored).
cat_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("cmm", CombineMakeModel()),
        ("ohenc", OneHotEncoder(sparse_output=False, handle_unknown="ignore", dtype=int)),
    ]
)

# Numeric branch: impute with the most frequent value, scale, then apply a
# power transform.
num_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("scl", RobustScaler()),
        ("tsf", PowerTransformer()),
    ]
)

col_transformer = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)

# Frame-level processing followed by the per-column transformations.
preprocessor = Pipeline(
    steps=[
        ("gen", general_process),
        ("pre", col_transformer),
    ]
)

# Full model: preprocessing followed by a 5-nearest-neighbours regressor.
auto_pipeline = Pipeline(
    steps=[
        ("pp", preprocessor),
        ("clf", KNeighborsRegressor(n_neighbors=5)),
    ]
)

In [None]:
# Separate the target (price, kept as a one-column frame) from the features,
# then hold out 10% of the rows for testing.
y = auto[["price"]]
X = auto.drop(columns="price")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=seed,
)

In [406]:
# Fit the preprocessing steps and the regressor on the training split.
auto_pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split, using the final
# estimator's own `score` method (R^2 for scikit-learn regressors).
auto_pipeline.score(X_test, y_test)

In [None]:
# Cross-validate the whole pipeline on the training split; with no `cv` or
# `scoring` argument, scikit-learn's defaults apply.
cross_val_score(auto_pipeline, X_train, y_train)