In [12]:
import argparse
import logging
import pathlib
import sys
import os
sys.path.append('../')
import json

import pandas as pd

from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC

from src.scrape.scraper import Scraper

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Importing and Working with Filters from JSON

In [2]:
with open('../data/filters.json') as f:
        filters = json.load(f)

In [3]:
filters

{'price_range': {'min': 0, 'max': 40000},
 'make': ['Alfa Romeo', 'Saab'],
 'body_type': ['Suv', 'Sedan', 'Hatchback', 'Truck', 'Wagon'],
 'year': {'min': 2015, 'max': 2022}}

### Test Results

In [4]:
with open("../data/processed/available_cars.json") as f:
    test_data = json.load(f)

for key in test_data.keys():
    print(key ,": ",len(test_data[key]))

make :  2040
model :  2040
year :  2040
trim :  2040
mileage :  2040
price :  2040
monthly_payment :  2040
status :  0
link :  2040


## Determining Filter Hierarchy
We need to determine what order we want to loop through the filters:
1. **Price and Year**: Since these are unchanging, we set these first
2. **Make**
3. **Body Type**

In [5]:
filters["price_range"]["min"]

0

In [7]:
filters["make"]

['Alfa Romeo', 'Saab']

# Getting Vehicle Details with BeautifulSoup

In [8]:
from bs4 import BeautifulSoup

In [13]:
scraper = Scraper()
scraper.driver.get("https://www.carvana.com/cars/chevrolet?email-capture=&page=7")
html = scraper.driver.page_source # get the html source
soup = BeautifulSoup(html, 'html.parser') # save as a BS object

In [32]:
year_makes = soup.find_all("div",{"class","year-make"})
#print(year_makes)
trim_mileages = soup.find_all("div",{"class","trim-mileage"})
#print(trim_mileages)
prices = soup.find_all("div",{"class","flex items-end font-bold mb-4 text-2xl"})
print(prices)
monthly_payments = soup.find_all("div",{"class","monthly-payment"})
#print(monthly_payments)
cars = soup.find_all("div",{"class","result-tile"})
#print(cars)

results = {
    "make":[],
    "model":[],
    "year":[],
    "trim":[],
    "mileage":[],
    "price":[],
    "monthly_payment":[],
    "link":[]
}
for year_make, trim_mileage, price, monthly_payment, car in zip(year_makes, trim_mileages, prices, monthly_payments, cars):

    print(year_make)
    simplified_year_make = [item for item in year_make.contents if item != " "]
    print(simplified_year_make)
    for key, val in zip(["year","make","model"],[simplified_year_make[0],simplified_year_make[1],simplified_year_make[2]]):
        print(key, val)
        results[key].append(val)

    trim = trim_mileage.findChildren("span")[0].contents[0]
    mileage = trim_mileage.findChildren("span")[1].contents[0]
    for key, val in zip(["trim","mileage"],[trim,mileage]):
        results[key].append(val)

    results["price"].append(price.contents[-1])

    payment = monthly_payment.findChildren("span")[0].contents
    results["monthly_payment"].append([item for item in payment if item != " "][1])

    hrefs = car.find_all("a",href=True)
    for ref in hrefs:
        results["link"].append(f"www.carvana.com{ref['href']}")
    
pd.DataFrame(results)

[<div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$17,990</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$14,990</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$14,590</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$21,590</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$17,990</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$14,590</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$18,590</div>, <div class="flex items-end font-bold mb-4 text-2xl" data-qa="price" data-test="Price" data-testid="price">$17,590</div>, <div class="flex items-end font

Unnamed: 0,make,model,year,trim,mileage,price,monthly_payment,link
0,Chevrolet,Volt,2011,base,14818,"$17,990",355,www.carvana.com/vehicle/2545288
1,Chevrolet,Cruze,2013,LT,35283,"$14,990",305,www.carvana.com/vehicle/2539220
2,Chevrolet,Spark,2013,LT,62195,"$14,590",298,www.carvana.com/vehicle/2535892
3,Chevrolet,Malibu,2020,LT,45817,"$21,590",419,www.carvana.com/vehicle/2418142
4,Chevrolet,Cruze,2016,LS,11871,"$17,990",355,www.carvana.com/vehicle/2535995
5,Chevrolet,Spark EV,2015,2LT,40086,"$14,590",282,www.carvana.com/vehicle/2536352
6,Chevrolet,Trax,2017,LT,40080,"$18,590",348,www.carvana.com/vehicle/2466311
7,Chevrolet,Trax,2015,LT,35008,"$17,590",330,www.carvana.com/vehicle/2483584
8,Chevrolet,Equinox,2019,Premier,53290,"$22,990",446,www.carvana.com/vehicle/2576440
9,Chevrolet,Volt,2014,base,80009,"$14,990",293,www.carvana.com/vehicle/2535020


In [19]:
year_makes = soup.find_all("div",{"class","year-make"})
print(year_makes[0].contents)

['2011', ' ', ' ', ' ', 'Chevrolet', ' ', ' ', ' ', 'Volt']


In [20]:
for trim_mileage in trim_mileages:
    trim = trim_mileage.findChildren("span")[0].contents
    mileage = trim_mileage.findChildren("span")[1].contents

In [21]:
pages = soup.find_all("span",{"class":"paginationstyles__PaginationText-mpry3x-5 iXXOCI"})
nav = pages[0].contents[0].split(" ")
nav[1] == nav[-1]

False

In [34]:
cars = soup.find_all("div",{"class","result-tile"})
for car in cars:
    hrefs = car.find_all("a",href=True)
    for ref in hrefs:
        print(f"www.carvana.com{ref['href']}")
    status_message = car.find_all("div",{"class","days-to-delivery days-to-delivery"})
    print(status_message[0].contents)


www.carvana.com/vehicle/2420612
['Get it by Thursday']
www.carvana.com/vehicle/2375292
['Get it by Thursday']
www.carvana.com/vehicle/2254728
['Get it by Thursday']
www.carvana.com/vehicle/2457433
['Get it by Thursday']
www.carvana.com/vehicle/2424996
['Get it by Thursday']
www.carvana.com/vehicle/2466397
['Get it by Thursday']
www.carvana.com/vehicle/2469624
['Get it by Thursday']
www.carvana.com/vehicle/2336254
['Get it by Thursday']
www.carvana.com/vehicle/2475544
['Get it by Thursday']
www.carvana.com/vehicle/2465075
['Get it by Thursday']
www.carvana.com/vehicle/2474543
['Get it by Thursday']
www.carvana.com/vehicle/2368806
['Get it by Thursday']
www.carvana.com/vehicle/2377063
['Get it by Thursday']
www.carvana.com/vehicle/2296827
['Get it by Thursday']
www.carvana.com/vehicle/2331217
['Get it by Thursday']
www.carvana.com/vehicle/2435996
['Get it by Thursday']
www.carvana.com/vehicle/2432943
['Get it by Thursday']
www.carvana.com/vehicle/2231399
['Get it by Thursday']
www.carvan

In [28]:

# days to delivery
statuses = soup.find_all("div",{"class","days-to-delivery days-to-delivery"})
for status in statuses:
    print(status.contents)

['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']
['Get it by Thursday']


In [29]:
# purchase in progress
statuses = soup.find_all("div",{"class","purchase-callout text-only locked"})
for status in statuses:
    print(status.contents)

In [24]:
for payment in monthly_payments:
    print(payment.findChildren("span")[0].contents)

['$', ' ', '415', ' ', '/mo']
['$', ' ', '420', ' ', '/mo']
['$', ' ', '369', ' ', '/mo']
['$', ' ', '336', ' ', '/mo']
['$', ' ', '443', ' ', '/mo']
['$', ' ', '446', ' ', '/mo']
['$', ' ', '355', ' ', '/mo']
['$', ' ', '460', ' ', '/mo']
['$', ' ', '380', ' ', '/mo']
['$', ' ', '449', ' ', '/mo']
['$', ' ', '427', ' ', '/mo']
['$', ' ', '341', ' ', '/mo']
['$', ' ', '415', ' ', '/mo']
['$', ' ', '450', ' ', '/mo']
['$', ' ', '321', ' ', '/mo']
['$', ' ', '360', ' ', '/mo']
['$', ' ', '340', ' ', '/mo']
['$', ' ', '328', ' ', '/mo']
['$', ' ', '288', ' ', '/mo']
['$', ' ', '320', ' ', '/mo']
