In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

#Only the first 10,000 rows of the CSV are read
num_rows = 10_000
data = pd.read_csv(
    "Kaggle_Data/vehicles.csv",
    nrows=num_rows,
)

In [3]:
#Create a dataframe of the dataset
df = pd.DataFrame(data)

#Drop columns that will not be useful for the model
#(columns= already targets the column axis, so no axis argument is needed)
df = df.drop(columns=['id', 'url', 'region', 'region_url', 'VIN', 'size',
                      'image_url', 'description', 'county', 'lat', 'long',
                      'posting_date'])

#Remove rows that do not have at least 12 columns with data
df = df.dropna(axis=0, thresh=12)

#Deal with the cylinders column, just want the number.
#The pattern is a raw string so that \d reaches the regex engine untouched
#instead of being parsed as an invalid Python string escape. expand=False
#returns the single capture group as a Series, which is then made numeric.
df = df.assign(
    cylinders=lambda frame: pd.to_numeric(
        frame['cylinders'].str.extract(r'(\d+)', expand=False)
    )
)

#Remove rows where NaN value in float columns
df = df.dropna(subset=['year', 'cylinders', 'odometer'])

#Fill missing values in categorical columns
#(year, cylinders and odometer have no NaN left after the previous step)
df = df.fillna('')

#Rearrange the order of the columns
df = df[['manufacturer', 'model', 'year', 'condition', 'cylinders', 'fuel',
         'odometer', 'title_status', 'transmission', 'drive', 'type',
         'paint_color', 'state', 'price']]

#Data X,y split: every column except price is a feature, price is the target
X = pd.DataFrame(df, columns=['manufacturer', 'model', 'year',
                              'condition', 'cylinders', 'fuel',
                              'odometer', 'title_status',
                              'transmission', 'drive', 'type',
                              'paint_color', 'state'])
Y = pd.DataFrame(df, columns=['price'])

#Convert/encode the string columns to a number.
#drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)

#Split the dataset into training and test subsets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)

In [4]:
from sklearn.ensemble import RandomForestRegressor

#Create a random forest regressor and test its performance on the test data.
#random_state is fixed so that the fitted forest, its score and any later
#predictions are the same on every run; without it each fit differs.
regressor = RandomForestRegressor(random_state=1234)
regressor.fit(X_train, y_train.values.ravel())

#R^2 on the held-out test set (an earlier unseeded run recorded approximately 0.81)
print(regressor.score(X_test, y_test))

0.813351467049801


In [5]:
#Save the trained model
filename = 'usedcar_model.sav'
#The context manager closes the file handle even if pickle.dump raises
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [19]:
#Preview the first five rows of the encoded training features
X_train.head(n=5)

Unnamed: 0,year,cylinders,odometer,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,...,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,state_al,state_az
9485,2003.0,6.0,219101.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
8907,2014.0,6.0,99233.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5980,2014.0,8.0,130302.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8903,2008.0,6.0,94000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8151,1989.0,6.0,289268.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
#Preview the first five rows of the training target (price)
y_train.head(n=5)

Unnamed: 0,price
9485,2995
8907,18995
5980,36995
8903,15000
8151,1700


In [3]:
#Function to make predictions given info about a vehicle as input
def predict_price(manufacturer, model, year, condition, cylinders,
                  fuel, odometer, title_status, transmission,
                  drive, type, paint_color, state):
    """Estimate the price of one vehicle with the trained regressor.

    Builds a single feature row laid out like the one-hot encoded frame
    ``X`` and passes it to the module-level ``regressor``.

    Parameters
    ----------
    manufacturer, model, condition, fuel, title_status, transmission,
    drive, type, paint_color, state : str
        Categorical attributes. Each value is lower-cased and matched
        against the column named ``"<feature>_<value>"`` in ``X``. A value
        with no such column (an unseen category, or the empty string)
        leaves the row at 0 for that feature instead of raising.
    year, cylinders, odometer : float
        Numeric attributes, written to the columns of the same name.

    Returns
    -------
    int
        The predicted price, computed as ``int(prediction + 0.5)``.
    """
    #`type` shadows the builtin inside this function; the parameter name is
    #kept because callers may pass it by keyword.
    categorical_inputs = {
        "manufacturer": manufacturer,
        "model": model,
        "condition": condition,
        "fuel": fuel,
        "title_status": title_status,
        "transmission": transmission,
        "drive": drive,
        "type": type,
        "paint_color": paint_color,
        "state": state,
    }

    #Map every column label of X to its position once
    column_positions = {label: position
                        for position, label in enumerate(X.columns)}

    #Create an empty row for predicting, one slot per column of X
    x = np.zeros(len(X.columns))

    #Assign the categorical columns that correspond to the input to 1.
    #A plain dict lookup avoids taking the truth value of an empty
    #np.where() result, which newer NumPy versions reject with ValueError.
    for feature, value in categorical_inputs.items():
        position = column_positions.get(feature + "_" + value.lower())
        if position is not None:
            x[position] = 1

    #Populate these special columns (not categorical) by name rather than
    #by assuming they occupy positions 0, 1 and 2
    x[column_positions["year"]] = year
    x[column_positions["cylinders"]] = cylinders
    x[column_positions["odometer"]] = odometer

    #Label the row with X's columns so the regressor receives the same
    #feature names that X carries
    features = pd.DataFrame([x], columns=X.columns)

    #Return the predicted price as an int to correspond with the prices in y
    return int(regressor.predict(features)[0] + 0.5)

In [4]:
#Proof of concept: predict the price for one hardcoded vehicle.
#The actual price is 2995, and the predicted price is 3305
predict_price(
    manufacturer="honda",
    model="odyssey ex-l w/dvd",
    year=2003.0,
    condition="",
    cylinders=6.0,
    fuel="gas",
    odometer=219101.0,
    title_status="clean",
    transmission="automatic",
    drive="fwd",
    type="van",
    paint_color="silver",
    state="az",
)

NameError: name 'np' is not defined

In [24]:
#Console application to take user input
def _read_number(prompt):
    """Prompt repeatedly until the user enters a finite number.

    year, cylinders and odometer are fed to the model as numbers, so a blank
    or non-numeric entry cannot be passed through the way a blank category
    can; float('') would raise ValueError and abort the cell.
    """
    import math

    while True:
        raw = input(prompt)
        try:
            value = float(raw)
        except ValueError:
            print("Please enter a number for this field.")
            continue
        #float() also accepts 'nan' and 'inf', which are not usable values here
        if math.isfinite(value):
            return value
        print("Please enter a number for this field.")


print("Enter the information for a vehicle.")
print("If you do not know a piece of information, simply press enter.")

#Read in all of the vehicle information
#Convert year, cylinders, and odometer to float because they are not categorical
manufacturer_in = input("Enter the manufacturer: ")
model_in = input("Enter the model: ")
year_in = _read_number("Enter the year: ")
condition_in = input("Enter the condition: ")
cylinders_in = _read_number("Enter the number of cylinders: ")
fuel_in = input("Enter the fuel type: ")
odometer_in = _read_number("Enter the mileage: ")
title_status_in = input("Enter the title status: ")
transmission_in = input("Enter the transmission type: ")
drive_in = input("Enter the drive type: ")
type_in = input("Enter the type of the vehicle: ")
paint_color_in = input("Enter the paint color: ")
state_in = input("Enter the state where it will be sold: ")

#Run the prediction function and return a price
price_out = predict_price(manufacturer_in, model_in, year_in, condition_in, cylinders_in,
                          fuel_in, odometer_in, title_status_in, transmission_in, drive_in,
                          type_in, paint_color_in, state_in)

print("The estimated value of that vehicle is:", price_out)

Enter the information for a vehicle.
If you do not know a piece of information, simply press enter.
The estimated value of that vehicle is: 10326


The above prediction used the following input:

* manufacturer: honda
* model: odyssey ex-l w/dvd
* year: 2003
* condition: (empty)
* cylinders: 6
* fuel: gas
* odometer: 219101
* title_status: clean
* transmission: automatic
* drive: fwd
* type: van
* paint_color: silver
* state: az

This shows that the model can take user input to generate a price.