In [39]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

In [40]:
df = pd.read_csv("carDataClean.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32596 entries, 0 to 32595
Data columns (total 19 columns):
price             32596 non-null int64
miles             32596 non-null int64
fuel_type         32596 non-null object
exterior_color    32357 non-null object
interior_color    28987 non-null object
drivetrain        32596 non-null object
transmission      32596 non-null object
engine            32596 non-null object
VIN               32594 non-null object
name              32596 non-null object
sellerAddress     32596 non-null object
id                32596 non-null int64
modelName         32596 non-null object
diesel            32596 non-null bool
turbo             32596 non-null bool
hybrid            32596 non-null bool
used              32596 non-null bool
year              32596 non-null int64
make              32596 non-null object
dtypes: bool(4), int64(4), object(11)
memory usage: 3.9+ MB


In [41]:
print(df['drivetrain'].unique())
print(df['transmission'].unique())

['FWD' 'RWD' '4WD']
['CVT' 'Auto' 'Manual']


In [42]:
# Convert catagorial data into number
drivetrain_map = {'FWD' : 0, 'RWD' : 1, '4WD' : 2}
drivetrain_numeric = [drivetrain_map[dt] for dt in df['drivetrain']]

tranmission_map = {'CVT' : 0, 'Auto': 1, "Manual": 2}
transmission_numeric = [tranmission_map[tr] for tr in df['transmission']]

engine_map = {'V12' : 12, 'V10': 10, 'V8': 8, 'V6': 6, 'V5': 5, 'V4': 4, 'V3': 3}
engine_numeric = [engine_map[en] for en in df['engine']]

diesel_numeric = [1 if d else 0 for d in df['diesel']]

turbo_numeric = [1 if d else 0 for d in df['turbo']]

hybrid_numeric = [1 if d else 0 for d in df['hybrid']]

model_map = {
    "C-Max Energi":50243, "C-Max Hybrid":49085, "Crown Victoria":20906, "E150":21050,
    "E250":26506, "E350":26502, "E350 Super Duty":26507, "EcoSport":36284899,
    "Edge":21039, "Escape":21088, "Excursion":21102, "Expedition":21104,
    "Expedition EL":21085, "Expedition Max":36324071, "Explorer":21105, "Explorer Sport Trac":21107,
    "F-150":21095, "F-250":21115, "F-350":21097, "Fiesta":21146, "Five Hundred":21156, "Flex":21136,
    "Focus":21138, "Focus ST":48704, "Freestar":21169, "Freestyle":21144, "Fusion":21175, "Fusion Energi":53027,
    "Fusion Hybrid":27661, "Mustang":21712, "Probe":21752, "Ranger":21874, "Sedan Police Interceptor":57387,
    "Shelby GT350":30021281, "Taurus":22164, "Thunderbird":22263, "Transit Connect":28203,
    "Transit-150":56747, "Transit-250":56748, "Transit-350":56749, "Model Unknown":29629
}
model_numeric = [model_map[md] for md in df['modelName']]

In [43]:
# Build datapoints we 
X = df.loc[:,['year', 'miles']]
X['model'] = model_numeric
X['transmission'] = transmission_numeric
X['engine'] = engine_numeric
X['turbo'] = turbo_numeric
X['diesel'] = diesel_numeric
X['hybrid'] = hybrid_numeric
X['drivetrain'] = drivetrain_numeric
Y = df['price']

# Train model with 80-20 test split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2)
lasso_reg = LassoLarsCV(cv=3, max_n_alphas = 10).fit(X_Train, Y_Train)
# Pedict Y for each X_Test
Y_Pred = lasso_reg.predict(X_Test)

In [49]:
# Find the scores of the regression
r2_score = lasso_reg.score(X_Test, Y_Test)
ma_error = mean_absolute_error
ms_error = mean_squared_error(Y_Test, Y_Pred)

print('R^2 Score: ' + str(r2_score))
print('Mean Squared Error: ' + str(mean_absolute_error))

R^2 Score: 0.7657756977955485
Mean Squared Error: 56270956.09071867


In [15]:
year = float(input('What year?: '))
miles = float(input('How many miles?: '))
model = float(model_map[input('What model?: ')])
trans = float(tranmission_map[input('What transmission type (CVT/Auto/Manual?: ')])
engine = float(engine_map[input('What engine size (V#)?: ')])
turbo = float(input('Does it have a turbo (0:false/1:true)?: '))
diesel = float(input('Is it diesel (0:false/1:true)?: '))
hybrid = float(input('Is it hybrid (0:false/1:true)?: '))
drivetrain = float(drivetrain_map[input('What drivetrain (FWD/RWD/4WD)?: ')])

predicted_price = lasso_reg.predict([[year, miles, model, trans, engine, turbo, diesel, hybrid, drivetrain]])
print('The predicted price of this car is: ' + str(predicted_price[0]))

What year?: 2011
How many miles?: 100000
What model?: Fusion
What transmission type (CVT/Auto/Manual?: Auto
What engine size (V#)?: V6
Does it have a turbo (0:false/1:true)?: 0
Is it diesel (0:false/1:true)?: 0
Is it hybrid (0:false/1:true)?: 0
What drivetrain (FWD/RWD/4WD)?: FWD
The predicted price of this car is: 9268.825412868755
