# Car Price Prediction - Web Scraping Part

In [2]:
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from bs4 import BeautifulSoup as bs
import requests
import re
import time
import numpy as np

In [3]:
import warnings
# NumPy 2.0 removed the top-level ``np.VisibleDeprecationWarning`` alias; the
# class lives in ``np.exceptions`` (available since NumPy 1.25). Resolve it
# from whichever location exists so this cell runs on old and new NumPy alike.
VisibleDeprecationWarning = getattr(np, "exceptions", np).VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

In [4]:
# Used-car search results: a page URL is url1 + <page number> + url2.
url1 = (
    'https://www.cars.com/shopping/results/'
    '?page='
)
url2 = (
    '&page_size=20&list_price_max=&makes[]='
    '&maximum_distance=all&models[]=&stock_type=used&zip='
)

# Same two pieces with stock_type=new in the query string.
url3 = (
    'https://www.cars.com/shopping/results/'
    '?page='
)
url4 = (
    '&page_size=20&list_price_max=&makes[]='
    '&maximum_distance=all&models[]=&stock_type=new&zip='
)

# Result pages are numbered from 1; collect the first 120 used-car pages.
url_list = [url1 + str(page) + url2 for page in range(1, 121)]

# Columns of the final dataset, in output order.
column_names = [
    'name',
    'price',
    'mileage',
    'drivetrain',
    'mpg',
    'fuel_type',
    'transmission',
    'engine',
    'zipcode',
    'oneowner',
    'personal_use',
]

# Accumulator for every scraped listing (starts with the columns, no rows).
df = pd.DataFrame(columns=column_names)

response_list = []

# Scratch frame holding the listings of the page currently being scraped.
tmp_df = pd.DataFrame()

def _spec_value(page, label):
    """Return the text that follows the ``<dt>`` whose text is *label*.

    ``page`` is a parsed listing-detail document. When the page has no such
    ``<dt>``, the empty string is returned so that every listing still
    contributes exactly one value to the column being filled.
    """
    term = page.find(name='dt', text=label)
    if term is None:
        return ''
    # Three ``.next`` hops from the <dt>; presumably this lands on the paired
    # <dd> value node -- confirm against the live page markup.
    return term.next.next.next.text


def _yes_no_flag(page, data_qa):
    """Encode the ``<dd data-qa=...>`` entry as 1 ('Yes'), 0 (other), 3 (absent)."""
    node = page.find(name='dd', attrs={'data-qa': data_qa})
    if node is None:
        return 3
    return 1 if node.text == 'Yes' else 0


# Scrape each search-result page: read name/price/mileage from the result
# list itself, then visit every listing's detail page for the other columns.
for url in url_list:
    # Per-page accumulators, one entry appended per detail page visited.
    drivetrains = []
    mpgs = []
    fuel_types = []
    transmissions = []
    engines = []
    zipcodes = []
    oneowners = []
    personal_uses = []
    res = requests.get(url).text
    soup = bs(res)

    # getting names
    names = soup.find_all(name='h2', attrs={'class': 'title'})
    names = pd.DataFrame(names)
    tmp_df['name'] = names

    # getting prices (every non-digit character is stripped)
    prices = soup.find_all(name='span', attrs={'class': 'primary-price'})
    prices = pd.DataFrame(prices)
    prices = prices.replace(r'\D', '', regex=True)
    tmp_df['price'] = prices

    # getting mileages
    mileages = soup.find_all(name='div', attrs={'class': 'mileage'})
    mileages = pd.DataFrame(mileages)
    # The first match is discarded; presumably it is not one of the listing
    # cards -- TODO confirm against the page markup.
    mileages = mileages.iloc[1:]
    mileages = mileages.reset_index(drop=True)
    # Mileage is stored unprocessed here; its text extraction and digit
    # stripping are done once, after all pages have been collected.
    tmp_df['mileage'] = mileages

    # getting links to the detail pages
    link_list = soup.find_all(name='a', attrs={'class': 'vehicle-card-link js-gallery-click-link'})
    links = ['https://www.cars.com' + link.get('href') for link in link_list]

    for link in links:
        res = requests.get(link)
        # Fixed pause after every detail request (presumably to throttle).
        time.sleep(2)
        sp = bs(res.text)

        # adding drivetrain
        drivetrains.append(_spec_value(sp, 'Drivetrain'))

        # adding mpg: the value sits two siblings before the tooltip trigger
        spf = sp.find(name='a', attrs={'class': 'sds-tooltip__trigger'})
        if spf is not None:
            mpgs.append(spf.previous_sibling.previous_sibling.text)
        else:
            mpgs.append('None')

        # adding fuel type
        fuel_types.append(_spec_value(sp, 'Fuel type'))

        # adding transmission and engine; a listing lacking either entry now
        # yields '' (like drivetrain / fuel type) instead of raising
        # AttributeError on None and aborting the whole scrape.
        transmissions.append(_spec_value(sp, 'Transmission'))
        engines.append(_spec_value(sp, 'Engine'))

        # adding zip code: last five characters of the dealer address text
        zipcode = sp.find(name='div', attrs={'class': 'dealer-address'})
        if zipcode is None:
            zipcodes.append(0)
        else:
            zipcodes.append(zipcode.text[-5:])

        # adding one_owner and personal use (1 = 'Yes', 0 = other, 3 = absent)
        oneowners.append(_yes_no_flag(sp, 'one-owner-value'))
        personal_uses.append(_yes_no_flag(sp, 'personal-use-value'))

    tmp_df['drivetrain'] = drivetrains
    tmp_df['mpg'] = mpgs
    tmp_df['fuel_type'] = fuel_types
    tmp_df['transmission'] = transmissions
    tmp_df['engine'] = engines
    tmp_df['zipcode'] = zipcodes
    tmp_df['oneowner'] = oneowners
    tmp_df['personal_use'] = personal_uses

    df = pd.concat([df, tmp_df], axis=0)

    # Empty the scratch frame in place so the next page starts with no rows.
    tmp_df.drop(tmp_df.index, inplace=True)

# Renumber rows 0..n-1 (page-local indices repeat after concatenation) and
# discard the old index instead of keeping it as a column.
df = df.reset_index(drop=True)
# Keep only listings for which every column has a value.
df = df.dropna()
# Mileage cells still hold parsed nodes: take their text, then keep digits only.
# The pattern is a raw string so ``\D`` reaches the regex engine verbatim
# rather than being an invalid string escape.
df['mileage'] = df['mileage'].apply(lambda cell: cell.text)
df['mileage'] = df['mileage'].replace(r'\D', '', regex=True)

In [19]:
# Persist the scraped dataset as UTF-8 CSV in the working directory.
df.to_csv(
    path_or_buf='car_info_extended.csv',
    encoding='utf-8',
)