In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error as MAE, 
    mean_absolute_percentage_error as MAPE, 
    mean_squared_error as MSE) 
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn import set_config
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from src.data import load_data

# Notebook-wide display configuration.
# Show 4 digits of precision in pandas output.
pd.set_option('display.precision',4)
# Default every matplotlib figure to 20x10 inches with the fivethirtyeight theme.
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('fivethirtyeight')
# Echo the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Predicting the price of used cars
In the data file [ToyotaCorolla](../data/ToyotaCorolla.csv) we have data on used Toyotas from late 2004 in the Netherlands.  The goal is to predict a car's price based on its specifications.

Note there has been some significant EDA done ahead of time to reduce the number of columns and find interesting data.
- Cylinders: dropped because all the values are 4

In [None]:
# Load the raw data and sort the feature columns into three groups
# (categorical, numeric, boolean) for the analysis that follows.

# Equipment / option flags that are read as booleans
flag_columns = [
    'Automatic', 'ABS', 'Airbag_1', 'Airbag_2', 'BOVAG_Guarantee',
    'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player', 'Central_Lock',
    'Met_Color', 'Powered_Windows', 'Power_Steering', 'Radio', 'Mistlamps',
    'Mfr_Guarantee', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
    'Radio_cassette', 'Parking_Assistant', 'Tow_Bar',
]
cars_dtypes = {'Model': str, **dict.fromkeys(flag_columns, bool)}

# Cylinders is dropped straight after loading
raw_cars = load_data('ToyotaCorolla', dtype=cars_dtypes)
cars_df = raw_cars.drop(columns=['Cylinders'])

# Columns that hold a small set of discrete levels are treated as categoricals
cat_columns = ['Mfg_Month', 'Mfg_Year', 'Doors', 'Gears', 'Fuel_Type', 'Color']
for c in cat_columns:
    cars_df[c] = cars_df[c].astype('category')

# The target (Price) is removed first so it never ends up among the numeric features
feature_frame = cars_df.drop(columns='Price')
num_columns = feature_frame.select_dtypes(include='number').columns
bool_columns = cars_df.select_dtypes(include='bool').columns

cars_df.head()
num_columns

In [None]:
# Inspect the column names and the dtype assigned to each column
cars_df.columns
cars_df.dtypes

In [None]:
# Cumulative KDE of Price, annotated with the quartiles.
# The quartiles are requested by value rather than by slicing describe()
# positionally, so they do not depend on the row order of describe()'s output.
iq1, iq2, iq3 = cars_df.Price.quantile([0.25, 0.5, 0.75])
sns.kdeplot(cars_df.Price, cumulative=True)

# Horizontal guides at the 25th / 50th / 75th percentiles of the CDF
plt.axhline(y=0.5,color='red',linestyle='--')
plt.axhline(y=0.75,color='green',linestyle='--')
plt.axhline(y=0.25,color='green',linestyle='--')

# Labels sit just above each guide, 10% of the way along the price axis
label_x = 0.1 * cars_df.Price.max()
plt.text(x=label_x,y=0.51,s=f'Median: {iq2}')
plt.text(x=label_x,y=0.76,s=f'IQ3: {iq3}')
plt.text(x=label_x,y=0.26,s=f'IQ1: {iq1}')
plt.show()


# Summary statistics of the target for reference next to the plot
cars_df.Price.describe()

In [None]:
# True/False counts for every boolean flag (one row per flag after the transpose).
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
cars_df[bool_columns].apply(pd.Series.value_counts).T
# Number of distinct levels in each categorical column
cars_df[cat_columns].nunique()

In [None]:
# One Price boxplot per categorical feature, laid out on a 3x2 grid of axes
fig, axs = plt.subplots(3, 2, figsize=(20, 25))
for ax, c in zip(axs.ravel(), cat_columns):
    sns.boxplot(x=c, y='Price', data=cars_df, ax=ax)

In [None]:
# The boxplots above suggest these categorical features carry some signal
# (certainly, the newer the car, the more valuable it is).
# There seems to be only a small number of 2-door cars; let's count each Doors value.
cars_df['Doors'].value_counts()

In [None]:
# With only 2 samples of 2-door cars, we can safely drop these.
# Doors has been converted to an unordered Categorical, and pandas rejects
# ordering comparisons (>, <) on unordered categoricals, so the filter compares
# the underlying numeric values instead of evaluating 'Doors>2' on the column.
cars_df = cars_df[cars_df['Doors'].astype(float) > 2]

In [None]:
# Average price per colour, most expensive first
cars_df.groupby('Color')['Price'].mean().sort_values(ascending=False)

In [None]:
# Now let's look through our booleans and see whether having a feature is
# associated with a significantly different mean price.
# We can do this with a simple two-sample t-test per boolean column.
from scipy.stats import ttest_ind
from IPython.display import display

for c in bool_columns:
    has_feature = cars_df[c]
    _, p_val = ttest_ind(cars_df.Price[has_feature], cars_df.Price[~has_feature])
    # Expressions inside a loop body are never echoed by the notebook,
    # so the group means have to be displayed explicitly.
    display(cars_df.groupby(c).Price.mean())
    # True when the difference in means is significant at the 5% level
    print(f'{c=} Mean Diff:{p_val<0.05}')


In [None]:
# Split the frame into the feature matrix (everything except Price) and the target
X = cars_df.drop(columns='Price')
y = cars_df['Price']


In [None]:
# Numeric features go through two stages:
#   1. an imputer that fills any gaps with the column median
#   2. a scaler that standardizes the values
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features are one-hot encoded, which in essence dummies the columns;
# handle_unknown="ignore" is set for categories not seen during fit
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each group of columns to its transformer.
# NOTE(review): columns in neither list (the boolean flags, Model) are not
# passed to any transformer here - confirm that leaving them out is intended.
column_routes = [
    ("num", numeric_transformer, num_columns),
    ("cat", categorical_transformer, cat_columns),
]
column_transformer = ColumnTransformer(transformers=column_routes)

# Univariate selector keeping the 8 best features by f_regression score
kbest = SelectKBest(score_func=f_regression, k=8)
column_transformer.fit_transform(X, y)

# Not enabled yet: chain the column transformer with the selector
# pipe = Pipeline([('col_trans',column_transformer),('kbest',kbest)])
# pipe.fit_transform(X,y)
# best_filter = kbest.get_support()


In [None]:

# Show estimators as diagrams when they are displayed
set_config(display='diagram')

# Reduce data frame to the top 1000 rows and select columns for regression analysis
toyota_df = load_data('ToyotaCorolla',nrows=1000, 
            usecols=['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight', 'Price'])

outcome = 'Price'

# The outcome is removed before picking the numeric features;
# every non-numeric column is treated as categorical
num_columns = toyota_df.drop(columns=outcome).select_dtypes(include='number').columns
cat_columns = toyota_df.select_dtypes(exclude="number").columns

# Here we are going to apply two transforms to our numeric columns:
# an imputer, to fill any gaps in our dataset with the median value,
# and a scaler, which we can use to ensure our data is standardized
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# For our categorical data, we'll use the OneHotEncoder.
# In essence this will dummy the columns for us
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_columns),
        ("cat", categorical_transformer, cat_columns)])

toyota_reg = LinearRegression()

# Here we define the transformers to use and which columns to apply them to.
# SelectKBest defaults to f_classif, a classification score; Price is a
# continuous target, so the regression score function is passed explicitly.
pipeline = Pipeline([("col_transform",column_transformer)
            , ("feature_selection",SelectKBest(score_func=f_regression))
            , ('regression_model',toyota_reg)])
pipeline