In [3]:
# Dependencies
import json
import csv
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd
import numpy as np
import requests
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import linregress
import requests
import hvplot.pandas
from pathlib import Path
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import tensorflow as tf
import keras
import joblib
from tensorflow_addons.metrics import RSquare

In [4]:
# Paths of the saved artifacts (presumably written by the training notebook — confirm)
model_path = "./Models/UsedCarsPricePredict.h5"
scaler_path = "./Models/scalerfunction.sav"
encoder_path = "./Models/OHE.sav"

# Keras network used below to predict prices
nn = tf.keras.models.load_model(model_path)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
X_scaler = joblib.load(scaler_path)  # applied to the numeric feature columns below
OHE = joblib.load(encoder_path)  # applied to the categorical feature columns below

In [5]:
# Print the layer-by-layer architecture and parameter counts of the loaded model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                26000     
                                                                 
 dense_1 (Dense)             (None, 50)                4050      
                                                                 
 dense_2 (Dense)             (None, 30)                1530      
                                                                 
 dense_3 (Dense)             (None, 1)                 31        
                                                                 
Total params: 31611 (123.48 KB)
Trainable params: 31611 (123.48 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
# Relative path to the sample CSV of used-car listings that the loaded model is evaluated on
file_path = "Resources/usedcarsdata_sampletest.csv"

In [7]:
# Read the CSV into a pandas DataFrame; the file is decoded as ISO-8859-1 (Latin-1)
sample_df = pd.read_csv(file_path, encoding="ISO-8859-1")

In [8]:
# Preview the first rows of the raw data (all listing fields except year are still text here)
sample_df.head()

Unnamed: 0,name,year,miles,color,condition,price
0,Toyota Corolla,2021,"53,824 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$19,000"
1,Honda CR-V,2019,"42,419 miles","Red exterior, Black interior","No accidents reported, 2 Owners","$24,986"
2,Kia Soul,2021,"56,090 miles","Black exterior, Gray interior","1 accident reported, 1 Owner","$19,000"
3,Chevrolet Sonic,2018,"77,568 miles","Gray exterior, Gray interior","No accidents reported, 2 Owners","$17,156"
4,Toyota RAV4,2022,"31,689 miles","White exterior, Black interior","No accidents reported, 1 Owner","$25,887"


In [9]:
# Check the column dtypes to see which columns need converting to numbers
sample_df.dtypes

name         object
year          int64
miles        object
color        object
condition    object
price        object
dtype: object

In [10]:
# Remove duplicate rows; drop_duplicates() returns a new DataFrame, so sample_df is left untouched
usedcars_revise_df = sample_df.drop_duplicates()

In [11]:
# Convert price strings such as "$19,000" into plain integers
def _parse_price(raw_price):
    """Strip leading/trailing dollar signs and all commas from a price, then return it as an int."""
    digits = str(raw_price).strip("$").replace(",", "")
    return int(digits)


updated_price = list(map(_parse_price, usedcars_revise_df['price'].values))

In [12]:
# Convert mileage strings such as "53,824 miles" into plain integers
updated_mileage = []
for raw_miles in usedcars_revise_df["miles"].values:
    # remove the unit and the thousands separators, then trim the leftover whitespace
    cleaned_miles = str(raw_miles).replace("miles", "").replace(",", "").strip()
    updated_mileage.append(int(cleaned_miles))

In [13]:
# Split each full car name into its make (first word) and model (everything after the first space)

fullname = usedcars_revise_df["name"].copy()

# Split on the first space only, so a multi-word model stays in one piece
full_name = [str(name).split(" ", maxsplit=1) for name in fullname.values]

# make of the car
car_make = [parts[0] for parts in full_name]

# model of the car; a one-word name has no model part, so fall back to an empty string
# instead of raising IndexError
car_model = [parts[1] if len(parts) > 1 else "" for parts in full_name]

In [14]:
# Separate the car colour into 2 columns: exterior-color and interior-color.
# Each entry of car_colors is the list of comma-separated parts of one colour string,
# e.g. "Gray exterior, Black interior" -> ["Gray exterior", " Black interior"].
car_colors = [str(color).split(",") for color in usedcars_revise_df["color"].values]

# First part: drop the word "exterior" and trim whitespace
exterior_carcolor = [parts[0].replace("exterior", "").strip() for parts in car_colors]

# Second part: drop the word "interior" and trim whitespace; when the string has no comma
# there is no second part, so use an empty string instead of raising IndexError
interior_carcolor = [
    parts[1].replace("interior", "").strip() if len(parts) > 1 else ""
    for parts in car_colors
]

In [15]:
# Turn the condition text (e.g. "No accidents reported, 1 Owner") into numeric fields.
# Each entry of car_cond is the list of comma-separated parts of one condition string.
car_cond = [str(entry).split(",") for entry in usedcars_revise_df['condition'].values]


def _accident_count(cond_parts):
    """Return the accident count stated in the first part of a condition entry as an int."""
    text = str(cond_parts[0])
    # "No" becomes "0"; the descriptive wording is then removed so only the number is left
    for old, new in (("No", "0"), ("accidents reported", ""), ("accident reported", "")):
        text = text.replace(old, new)
    return int(text.strip())


accident_state = [_accident_count(cond_parts) for cond_parts in car_cond]

In [16]:
# Extract the owner count from the second part of each condition entry (e.g. " 1 Owner" -> 1)
ownership_status = []
for cond_parts in car_cond:
    # remove the plural form first so no stray "s" is left behind
    owners_text = str(cond_parts[1]).replace("Owners", "").replace("Owner", "").strip()
    ownership_status.append(int(owners_text))

In [17]:
# Assemble the cleaned columns into a single DataFrame

# model year as plain Python ints
year_of_make = list(map(int, usedcars_revise_df['year'].values))

cleaned_columns = {
    "make": car_make,
    "model": car_model,
    "year_of_make": year_of_make,
    "miles": updated_mileage,
    "exterior_color": exterior_carcolor,
    "interior_color": interior_carcolor,
    "accidents_reported": accident_state,
    "number_of_owners": ownership_status,
    "price": updated_price,
}

usedcars_newdf = pd.DataFrame(cleaned_columns)

In [18]:
# Display the cleaned DataFrame
usedcars_newdf

Unnamed: 0,make,model,year_of_make,miles,exterior_color,interior_color,accidents_reported,number_of_owners,price
0,Toyota,Corolla,2021,53824,Gray,Black,0,1,19000
1,Honda,CR-V,2019,42419,Red,Black,0,2,24986
2,Kia,Soul,2021,56090,Black,Gray,1,1,19000
3,Chevrolet,Sonic,2018,77568,Gray,Gray,0,2,17156
4,Toyota,RAV4,2022,31689,White,Black,0,1,25887


In [19]:
# Confirm that the converted columns (year, miles, accidents, owners, price) are now integers
usedcars_newdf.dtypes

make                  object
model                 object
year_of_make           int64
miles                  int64
exterior_color        object
interior_color        object
accidents_reported     int64
number_of_owners       int64
price                  int64
dtype: object

In [20]:
# Summary statistics of the numeric columns
usedcars_newdf.describe()

Unnamed: 0,year_of_make,miles,accidents_reported,number_of_owners,price
count,5.0,5.0,5.0,5.0,5.0
mean,2020.2,52318.0,0.2,1.4,21205.8
std,1.643168,17158.12025,0.447214,0.547723,3947.64274
min,2018.0,31689.0,0.0,1.0,17156.0
25%,2019.0,42419.0,0.0,1.0,19000.0
50%,2021.0,53824.0,0.0,1.0,19000.0
75%,2021.0,56090.0,0.0,2.0,24986.0
max,2022.0,77568.0,1.0,2.0,25887.0


In [21]:
# Outlier fences based on the Inter Quartile Range (IQR) rule
def return_bounds(q1: float, q3: float):
    """Return the (lower, upper) outlier bounds for the given quartiles.

    The bounds lie 1.5 times the inter-quartile range (q3 - q1) below q1
    and above q3.

    Parameters:
        q1: first quartile.
        q3: third quartile.

    Returns:
        A tuple ``(lower, upper)``.
    """
    fence_offset = 1.5 * (q3 - q1)
    return q1 - fence_offset, q3 + fence_offset

# Start from a copy so usedcars_newdf keeps every row
outliers_df = usedcars_newdf.copy()

# Price bounds from the IQR rule.
# NOTE(review): q1/q3 are hard-coded rather than computed from this sample —
# presumably the price quartiles of the training data; confirm.
lower_bound, upper_bound = return_bounds(q1=17991, q3=30999.25)

# Index labels of the rows priced below the lower bound / above the upper bound
prices = outliers_df['price']
ind_q1 = prices[prices < lower_bound].index
ind_q3 = prices[prices > upper_bound].index

# Drop both groups of outliers
outliers_df = outliers_df.drop(index=ind_q1).drop(index=ind_q3)

outliers_df

Unnamed: 0,make,model,year_of_make,miles,exterior_color,interior_color,accidents_reported,number_of_owners,price
0,Toyota,Corolla,2021,53824,Gray,Black,0,1,19000
1,Honda,CR-V,2019,42419,Red,Black,0,2,24986
2,Kia,Soul,2021,56090,Black,Gray,1,1,19000
3,Chevrolet,Sonic,2018,77568,Gray,Gray,0,2,17156
4,Toyota,RAV4,2022,31689,White,Black,0,1,25887


In [22]:
# Frame used for modelling; copying keeps outliers_df untouched
usedcars_final_df = outliers_df.copy()

# y variable: the listed price
y = usedcars_final_df['price']
# features: every other column (make, model, year of make, miles, exterior color,
# interior color, accidents reported, and number of owners)
X = usedcars_final_df.drop('price', axis=1)

In [23]:
# Prepare the new data for the model with the previously loaded transformers

# Split the features by dtype: object columns vs. all remaining columns
usedcars_cat_test = X.select_dtypes(include="object")
usedcars_num_test = X.select_dtypes(exclude="object")

# Apply the loaded encoder to the object columns and the loaded scaler to the rest
# (transform only; neither transformer is re-fitted here)
enc_cat_test = OHE.transform(usedcars_cat_test)
X_test_transformed = X_scaler.transform(usedcars_num_test)

# Model input: encoded columns first, then the scaled columns.
# NOTE(review): presumably this column order must match the one used in training — confirm.
X_test_scaled = np.concatenate((enc_cat_test, X_test_transformed), axis=1)

In [26]:
# Run the loaded network on the prepared feature matrix to get the predicted prices
y_pred = nn.predict(X_test_scaled)



In [29]:
# Copy the modelling frame and attach the predicted price values as a new column
df_carprice_predicted = usedcars_final_df.assign(price_predicted=y_pred)

# Display actual and predicted prices side by side
df_carprice_predicted

Unnamed: 0,make,model,year_of_make,miles,exterior_color,interior_color,accidents_reported,number_of_owners,price,price_predicted
0,Toyota,Corolla,2021,53824,Gray,Black,0,1,19000,17444.429688
1,Honda,CR-V,2019,42419,Red,Black,0,2,24986,22210.931641
2,Kia,Soul,2021,56090,Black,Gray,1,1,19000,16164.094727
3,Chevrolet,Sonic,2018,77568,Gray,Gray,0,2,17156,15560.459961
4,Toyota,RAV4,2022,31689,White,Black,0,1,25887,30172.041016


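From the sample dataset review above, the predicted price differs from the actual price by roughly 8% to 17% per car (about 12% on average). The model underestimates four of the five cars and overestimates the Toyota RAV4.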