# Car Price Prediction

In [283]:
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.linear_model import LinearRegression
import seaborn as sns
from bs4 import BeautifulSoup as bs
import requests
import re
import time
import numpy as np

## Getting scraped data from csv file

In [284]:
df = pd.read_csv('car_info.csv', index_col=0)

In [285]:
df.head()

Unnamed: 0,index,name,price,mileage,ex_color,in_color,drivetrain,mpg,fuel_type,transmission,engine
0,0,2019 Honda Accord Sport,27995.0,27995.0,White,Red,Front-wheel Drive,29–35,Gasoline,Automatic CVT,1.5L I4 16V GDI DOHC Turbo
1,1,2018 Porsche Macan,41998.0,41998.0,Gray,Black,All-wheel Drive,20–25,Gasoline,7-Speed Automatic with Auto-Shift,2.0L I4 16V GDI DOHC Turbo
2,2,2015 Chevrolet Corvette Stingray Z51,52000.0,52000.0,Velocity Yellow Tinted Clearcoat,Black,Rear-wheel Drive,17–29,Gasoline,8-Speed Automatic,6.2L V8 16V GDI OHV
3,3,2020 Jeep Grand Cherokee Overland,41998.0,41998.0,Ivory,Sepia / Black,Four-wheel Drive,18–25,Gasoline,8-Speed Automatic,3.6L V6 24V MPFI DOHC
4,4,2017 Ford Mustang GT Premium,29995.0,29995.0,Ingot Silver Metallic,Ebony w/Metal Gray Stitch,Rear-wheel Drive,15–25,Gasoline,Automatic,Engine: 5.0L Ti-VCT V8


## Data cleaning

### Fuel type

In [286]:
def is_gas(row):
    regex1 = re.compile('.*Gas.*')
    regex2 = re.compile('.*Flex.*')
    if regex1.match(row.fuel_type) or \
    regex2.match(row.fuel_type) or \
    row.fuel_type == 'Hybrid ':
        return 1
    else:
        return 0

In [287]:
def is_diesel(row):
    regex1 = re.compile('Diesel.*')
    regex2 = re.compile('.*Flex Fuel.*')
    if regex1.match(row.fuel_type) or \
    regex2.match(row.fuel_type):
        return 1
    else:
        return 0

In [288]:
def is_electric(row):
    regex = re.compile('.*Electric.*')
    if regex.match(row.fuel_type) or \
    row.fuel_type == 'Hybrid ':
        return 1
    else:
        return 0

In [289]:
df['is_gas'] = df.apply(lambda row : is_gas(row), axis=1)
df['is_diesel'] = df.apply(lambda row : is_diesel(row), axis=1)
df['is_electric'] = df.apply(lambda row : is_electric(row), axis=1)

### Year

In [290]:
def get_year_make_module(row):
    i = row['name']
    w = i.split()
    return w

In [291]:
df['year'] = df.apply(lambda row : get_year_make_module(row)[0], axis=1)
df['make'] = df.apply(lambda row : get_year_make_module(row)[1], axis=1)
df['module'] = df.apply(lambda row : get_year_make_module(row)[2], axis=1)

### MPG

In [292]:
# drop rows with blank mpg
df = df[df['mpg'] != '-']

In [293]:
def get_low_mpg(row):
    return row.mpg[0:2]

def get_high_mpg(row):
    return row.mpg[3:5]

In [294]:
df['low_mpg'] = df.apply(lambda row : get_low_mpg(row), axis=1)
df['high_mpg'] = df.apply(lambda row : get_high_mpg(row), axis=1)

### Transmission

In [295]:
def is_auto(row):
    regex = re.compile('.*([A|a]uto|A/T|Dual Shift).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0

def is_cvt(row):
    regex = re.compile('.*(CVT|Variable).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0
    
def is_manual(row):
    regex = re.compile('.*(Manual|M/T).*')
    if regex.match(row.transmission):
        return 1
    else:
        return 0
    
def get_speed(row):
    tmp = row.transmission
    tmp = re.sub('[^0-9]', '', tmp)
    if tmp == '':
        return 0
    else:
        return int(tmp)

In [296]:
df['is_auto'] = df.apply(lambda row : is_auto(row), axis=1)
df['is_cvt'] = df.apply(lambda row : is_cvt(row), axis=1)
df['is_manual'] = df.apply(lambda row : is_manual(row), axis=1)
df['speed'] = df.apply(lambda row : get_speed(row), axis=1)

### Engine

In [297]:
def get_L(row):
    i = row.engine.find('L')
    return row.engine[i-3:i]

def get_V(row):
    i = row.engine
    a = re.compile('.*16V.*')
    b = re.compile('.*24V.*')
    c = re.compile('.*32V.*')
    d = re.compile('.*48V.*')
    e = re.compile('.*12V.*')
    if a.match(i):
        return 16
    elif b.match(i):
        return 24
    elif c.match(i):
        return 32
    elif d.match(i):
        return 48
    elif e.match(i):
        return 12
    else:
        return 0

In [298]:
df['engine_L'] = df.apply(lambda row : get_L(row), axis=1)
df['engine_V'] = df.apply(lambda row : get_V(row), axis=1)

### Drive train

In [299]:
# drop rows with blank drivetrain
df.drivetrain = df.drivetrain.apply(lambda x : x[1:-1])
df = df[df['drivetrain'] != '–']

In [300]:
def get_drivetrain(row):
    i = row.drivetrain
    four = re.compile('.*(Front|Rear|FWD).*')
    two = re.compile('.*(All|Four|4WD|AWD).*')
    if four.match(i):
        return 4
    elif two.match(i):
        return 2
    else:
        return 0

In [301]:
df['drivetrain'] = df.apply(lambda row : get_drivetrain(row), axis=1)

### Make

In [302]:
def get_make_module(row):
    i = row['name']
    w = i.split()
    return w

In [303]:
df['make'] = df.apply(lambda row : get_make_module(row)[1], axis=1)
df['module'] = df.apply(lambda row : get_make_module(row)[2], axis=1)

### Drop useless columns

In [306]:
df = df.drop(columns=\
             ['name', 'ex_color', 'in_color', \
              'mpg', 'fuel_type', 'transmission', 'engine'])

In [308]:
df.head()

Unnamed: 0,index,price,mileage,drivetrain,is_gas,is_diesel,is_electric,year,make,module,low_mpg,high_mpg,is_auto,is_cvt,is_manual,speed,engine_L,engine_V
0,0,27995.0,27995.0,4,1,0,0,2019,Honda,Accord,29,35,1,1,0,0,1.5,16
1,1,41998.0,41998.0,2,1,0,0,2018,Porsche,Macan,20,25,1,0,0,7,2.0,16
2,2,52000.0,52000.0,4,1,0,0,2015,Chevrolet,Corvette,17,29,1,0,0,8,6.2,16
3,3,41998.0,41998.0,2,1,0,0,2020,Jeep,Grand,18,25,1,0,0,8,3.6,24
4,4,29995.0,29995.0,4,1,0,0,2017,Ford,Mustang,15,25,1,0,0,0,5.0,0


## Build regression module