# Car Price Prediction

In [1]:
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import seaborn as sns
from bs4 import BeautifulSoup as bs
import requests
import re
import time
from datetime import date
import numpy as np
import pickle
from uszipcode import SearchEngine
%matplotlib inline



## Getting scraped data from csv file

In [2]:
df = pd.read_csv('data/car_info_extended.csv', index_col=0)
df = df.dropna()
df = df[df.zipcode != 0]
# train_df, test_df = train_test_split(df, test_size=0.2)
# train_df.to_csv('data/train_df.csv', encoding='utf-8')
# test_df.to_csv('data/test_df.csv', encoding='utf-8')

In [3]:
# train_df = pd.read_csv('data/train_df.csv', index_col=0)
# test_df = pd.read_csv('data/test_df.csv', index_col=0)

In [4]:
# train_df.head()

## Data cleaning

### Fuel type

In [5]:
def is_gas(row):
    regex1 = re.compile('.*Gas.*')
    regex2 = re.compile('.*Flex.*')
    if regex1.match(row.fuel_type) or \
    regex2.match(row.fuel_type) or \
    row.fuel_type == 'Hybrid ':
        return 1
    else:
        return 0
    
def is_diesel(row):
    regex1 = re.compile('Diesel.*')
    regex2 = re.compile('.*Flex Fuel.*')
    if regex1.match(row.fuel_type) or \
    regex2.match(row.fuel_type):
        return 1
    else:
        return 0
    
def is_electric(row):
    regex = re.compile('.*Electric.*')
    if regex.match(row.fuel_type) or \
    row.fuel_type == 'Hybrid ':
        return 1
    else:
        return 0

In [6]:
df['is_gas'] = df.apply(lambda row : is_gas(row), axis=1)
df['is_diesel'] = df.apply(lambda row : is_diesel(row), axis=1)
df['is_electric'] = df.apply(lambda row : is_electric(row), axis=1)

### Year

In [7]:
def get_year_make_model(row):
    i = row['name']
    w = i.split()
    return w

In [8]:
df['age'] = df.apply(lambda row : date.today().year - int(get_year_make_model(row)[0]), axis=1)\
    .astype('int')
df['make_model'] = df.apply(lambda row : get_year_make_model(row)[1] + get_year_make_model(row)[2], axis=1)
# df['model'] = df.apply(lambda row : get_year_make_model(row)[2], axis=1)

### MPG

In [9]:
# drop rows with blank mpg
df = df[df['mpg'] != '–']
df = df[df['mpg'] != 'None']

In [10]:
def get_low_mpg(row):
    i = row.mpg.find('–')
    if i != -1:
        return row.mpg[0:i]
    else:
        return row.mpg

def get_high_mpg(row):
    i = row.mpg.find('–')
    if i != -1:
        return row.mpg[i+1:]
    else:
        return 0

In [11]:
df['mpg'] = df.apply(lambda row : int(get_low_mpg(row)) + int(get_high_mpg(row)), axis=1).astype('int')

### Transmission

In [12]:
def is_auto(row):
    regex = re.compile('.*([A|a]uto|A/T|Dual Shift).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0

def is_cvt(row):
    regex = re.compile('.*(CVT|Variable).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0
    
def is_manual(row):
    regex = re.compile('.*(Manual|M/T).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0
    
def get_speed(row):
    tmp = row.transmission
    tmp = re.sub('[^0-9]', '', tmp)
    if tmp == '':
        return 0
    else:
        return int(tmp)

In [13]:
df['is_auto'] = df.apply(lambda row : is_auto(row), axis=1)
df['is_cvt'] = df.apply(lambda row : is_cvt(row), axis=1)
df['is_manual'] = df.apply(lambda row : is_manual(row), axis=1)
df['speed'] = df.apply(lambda row : get_speed(row), axis=1)

### Engine

In [14]:
def get_L(row):
    i = row.engine.find('L')
    if i == -1:
        return 0
    else:
        return float(row.engine[i-3:i])

def get_V(row):
    i = row.engine
    a = re.compile('.*16V.*')
    b = re.compile('.*24V.*')
    c = re.compile('.*32V.*')
    d = re.compile('.*48V.*')
    e = re.compile('.*12V.*')
    if a.match(i):
        return 16
    elif b.match(i):
        return 24
    elif c.match(i):
        return 32
    elif d.match(i):
        return 48
    elif e.match(i):
        return 12
    else:
        return 0

In [15]:
df['engine_L'] = df.apply(lambda row : get_L(row), axis=1).astype('float')
df['engine_V'] = df.apply(lambda row : get_V(row), axis=1).astype('float')

### Drivetrain

In [16]:
# drop rows with blank drivetrain
df.drivetrain = df.drivetrain.apply(lambda x : x[1:-1])
df = df[df['drivetrain'] != '–']

In [17]:
def get_drivetrain(row):
    i = row.drivetrain
    four = re.compile('.*(Front|Rear|FWD).*')
    two = re.compile('.*(All|Four|4WD|AWD).*')
    if four.match(i):
        return 4
    elif two.match(i):
        return 2
    else:
        return 0

In [18]:
df['drivetrain'] = df.apply(lambda row : get_drivetrain(row), axis=1)

## Zipcode

In [19]:
zipcode_to_crime_rate = pd.DataFrame(columns=['zipcode', 'crime_rate'])
zipcode_to_crime_rate['zipcode'] = df.zipcode.unique()
zipcode_to_crime_rate.crime_rate =\
    zipcode_to_crime_rate.crime_rate.apply(lambda x : 0)

In [20]:
# get zip code
def get_major_city(row):
    engine = SearchEngine()
    zipcode = engine.by_zipcode(row.zipcode)
    return zipcode.major_city

def get_population(row):
    engine = SearchEngine()
    zipcode = engine.by_zipcode(row.zipcode)
    return zipcode.population

In [21]:
df['major_city'] = df.apply(lambda row : get_major_city(row), axis=1)
df['population'] = df.apply(lambda row : get_population(row), axis=1)

## Car make and model

In [22]:
df = pd.get_dummies(df, columns=['make_model'], \
                    prefix='mm', prefix_sep='_')
df = pd.get_dummies(df, columns=['drivetrain'], \
                    prefix='dt', prefix_sep='_')
df = pd.get_dummies(df, columns=['major_city'], \
                    prefix='is_in', prefix_sep='_')

### Drop useless columns

In [23]:
df = df.drop(columns=\
             ['name', 'mpg', 'fuel_type', 'transmission', 'engine', 'personal_use'])
df = df.dropna()

In [24]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df.to_csv('data/train_df.csv', encoding='utf-8')
test_df.to_csv('data/test_df.csv', encoding='utf-8')

## Building regression model

### Lasso Regression model

In [25]:
# train_df, test_df = train_test_split(df, test_size=0.2)
X = train_df.drop(columns=['price'], axis=1)
y = train_df.price

In [26]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
poly = PolynomialFeatures(degree=2)

X_scaled_poly = poly.fit_transform(X_scaled)

alphas = 10**np.linspace(-4, 4, 50)

lasso = LassoCV(alphas=alphas, cv=5, tol=0.03)
lasso.fit(X_scaled_poly, y)
lasso.score(X_scaled_poly, y)

0.9893090675882391

In [27]:
X_test = test_df.drop(columns=['price'], axis=1)
y_test = test_df.price

X_test_scaled = scaler.transform(X_test)
X_test_scaled_poly = poly.transform(X_test_scaled)

lasso.score(X_test_scaled_poly, y_test)

0.8037841122433163

In [28]:
# save the model
with open('models/lasso_model_pkl', 'wb') as file:
    pickle.dump(lasso, file)