### Importing Necessary Libraries

In [52]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Website for the data extraction

In [53]:
website = 'https://www.cars.com/shopping/results/?page=1&page_size=250&body_style_slugs[]=suv&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=250&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=47408'

In [54]:
## Get request
response = requests.get(website)

In [55]:
## Checking the response from website
response.status_code

200

In [59]:
## Creating soup object with html parser
soup = BeautifulSoup(response.content,'html.parser')

In [60]:
soup

<!DOCTYPE html>

<html class="ep-theme-cars" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title data-suffix=" | Cars.com">Used vehicles for Sale Near Me | Cars.com</title>
<meta content="Shop used vehicles for sale at Cars.com. Research, compare, and save listings, or contact sellers directly from 10,000+ vehicles nationwide." name="description"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="Cars.com" property="og:site_name"/>
<meta content="website" property="og:type"/>
<meta content="Used vehicles for Sale Near Me | Cars.com" property="og:title"/>
<meta content="https://www.cars.com/shopping/results/" property="og:url"/>
<meta content="Shop used vehicles for sale at Cars.com. Research, compare, and save listings, or contact sellers directly from 10,000+ vehicles nationwide." property="og:description"/>
<meta content="https://graphics.cars.co

In [61]:
## Getting the web element from the website 
results = soup.find_all('div' ,{'class' : 'vehicle-card' })

In [47]:
results[0].find('a',href=True)

<a class="vehicle-card-visited-tracking-link" href="/vehicledetail/3af551e6-0da1-444c-aab1-702323661887/?results_page_number=1&amp;search_instance_id=c2dbef79-d8b2-484e-a38d-692d38227952&amp;search_zipcode=47408&amp;vertical_position=1&amp;web_page_type=shopping%2Fsearch-results" rel="nofollow" tabindex="-1"></a>

In [62]:
## Getting the web element from the website 
results = soup.find_all('div' ,{'class' : 'vehicle-card' })

In [63]:
## Extracting the Name 
results[0].find('h2').get_text()

'2019 Lexus RX 350 '

In [9]:
## Extracting the Milage 
results[0].find('div',{'class' : 'mileage'}).get_text()

'29,014 mi.'

In [15]:
## Extracting the Reviews
results[0].find('span',{'class' : 'sds-rating__link'}).get_text()

'(0 reviews)'

In [16]:
## Extracting the Price
results[0].find('span',{'class' : 'primary-price'}).get_text()

'$36,989'

In [17]:
## Extracting the Dealer
results[0].find('div',{'class' : 'dealer-name'}).get_text().strip()

'EchoPark Automotive Louisville Delivery Center'

In [18]:
name = []
mileage = []
price = []
ratings = []
reviews = []
dealer = []

for i in results:
    
    try:
        name.append(i.find('h2').get_text())
    except:
        name.append('NaN')
        
    try:
        mileage.append(i.find('div',{'class' : 'mileage'}).get_text())
    except:
        mileage.append('NaN')
    
    try:
        price.append(i.find('span',{'class' : 'primary-price'}).get_text())
    except:
        price.append('NaN')
        
    try:
        ratings.append(i.find('span',{'class' : 'sds-rating__count'}).get_text())
    except:
        ratings.append('NaN')
        
    try:
        reviews.append(i.find('span',{'class' : 'sds-rating__link'}).get_text())
    except:
        reviews.append('NaN')
        
    try:
        dealer.append(i.find('div',{'class' : 'dealer-name'}).get_text().strip())
    except:
        dealer.append('NaN')

In [19]:
used_car = pd.DataFrame({'Name' : name ,
                        'Mileage' : mileage,
                        'Price' : price,
                        'Dealer' : dealer,
                        'Ratings' : ratings,
                        'Reviews' : reviews})

In [20]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews
0,2020 Acura RDX Base,"29,014 mi.","$36,989",EchoPark Automotive Louisville Delivery Center,,(0 reviews)
1,2020 GMC Yukon SLT,"65,291 mi.","$49,402",Hertz Car Sales Indianapolis,3.8,(4 reviews)
2,2019 Porsche Macan Base,"20,817 mi.","$55,989",Motor Werks - Barrington Auto Mall,4.8,"(4,663 reviews)"
3,2010 Ford Expedition Limited,"40,264 mi.","$20,990",Carvana-Touchless Delivery To Your Home,4.3,"(8,507 reviews)"
4,2020 Tesla Model X Long Range Plus,"43,059 mi.","$94,989",EchoPark Automotive Louisville Delivery Center,,(0 reviews)
...,...,...,...,...,...,...
244,2021 Subaru Outback Premium,"14,752 mi.","$36,875",Twin City Dodge Chrysler Jeep RAM,3.7,(60 reviews)
245,2019 Subaru Forester Touring,"19,862 mi.","$36,874",Dean Team Volkswagen Subaru of Ballwin,4.9,"(4,686 reviews)"
246,2020 Jeep Grand Cherokee Trailhawk,"15,478 mi.","$45,000",Lexus of Cool Springs,4.7,(294 reviews)
247,2020 Dodge Durango R/T,"16,973 mi.","$50,481",Monster Motors,4.3,(125 reviews)


#### Performing data cleaning

In [21]:
used_car['Reviews'] = used_car['Reviews'].apply(lambda x:x.strip('reviews)').strip('('))

In [22]:
used_car['Made Year'] = used_car['Name'].str.extract('(^\d*)')

In [23]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews,Made Year
0,2020 Acura RDX Base,"29,014 mi.","$36,989",EchoPark Automotive Louisville Delivery Center,,0,2020
1,2020 GMC Yukon SLT,"65,291 mi.","$49,402",Hertz Car Sales Indianapolis,3.8,4,2020
2,2019 Porsche Macan Base,"20,817 mi.","$55,989",Motor Werks - Barrington Auto Mall,4.8,4663,2019
3,2010 Ford Expedition Limited,"40,264 mi.","$20,990",Carvana-Touchless Delivery To Your Home,4.3,8507,2010
4,2020 Tesla Model X Long Range Plus,"43,059 mi.","$94,989",EchoPark Automotive Louisville Delivery Center,,0,2020
...,...,...,...,...,...,...,...
244,2021 Subaru Outback Premium,"14,752 mi.","$36,875",Twin City Dodge Chrysler Jeep RAM,3.7,60,2021
245,2019 Subaru Forester Touring,"19,862 mi.","$36,874",Dean Team Volkswagen Subaru of Ballwin,4.9,4686,2019
246,2020 Jeep Grand Cherokee Trailhawk,"15,478 mi.","$45,000",Lexus of Cool Springs,4.7,294,2020
247,2020 Dodge Durango R/T,"16,973 mi.","$50,481",Monster Motors,4.3,125,2020


In [24]:
used_car['Name'] = used_car['Name'].apply(lambda x: str(x)[4:])

In [25]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews,Made Year
0,Acura RDX Base,"29,014 mi.","$36,989",EchoPark Automotive Louisville Delivery Center,,0,2020
1,GMC Yukon SLT,"65,291 mi.","$49,402",Hertz Car Sales Indianapolis,3.8,4,2020
2,Porsche Macan Base,"20,817 mi.","$55,989",Motor Werks - Barrington Auto Mall,4.8,4663,2019
3,Ford Expedition Limited,"40,264 mi.","$20,990",Carvana-Touchless Delivery To Your Home,4.3,8507,2010
4,Tesla Model X Long Range Plus,"43,059 mi.","$94,989",EchoPark Automotive Louisville Delivery Center,,0,2020
...,...,...,...,...,...,...,...
244,Subaru Outback Premium,"14,752 mi.","$36,875",Twin City Dodge Chrysler Jeep RAM,3.7,60,2021
245,Subaru Forester Touring,"19,862 mi.","$36,874",Dean Team Volkswagen Subaru of Ballwin,4.9,4686,2019
246,Jeep Grand Cherokee Trailhawk,"15,478 mi.","$45,000",Lexus of Cool Springs,4.7,294,2020
247,Dodge Durango R/T,"16,973 mi.","$50,481",Monster Motors,4.3,125,2020


In [26]:
used_car['Manufacturer'] = used_car['Name'].str.split(' ').str[1]

In [27]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews,Made Year,Manufacturer
0,Acura RDX Base,"29,014 mi.","$36,989",EchoPark Automotive Louisville Delivery Center,,0,2020,Acura
1,GMC Yukon SLT,"65,291 mi.","$49,402",Hertz Car Sales Indianapolis,3.8,4,2020,GMC
2,Porsche Macan Base,"20,817 mi.","$55,989",Motor Werks - Barrington Auto Mall,4.8,4663,2019,Porsche
3,Ford Expedition Limited,"40,264 mi.","$20,990",Carvana-Touchless Delivery To Your Home,4.3,8507,2010,Ford
4,Tesla Model X Long Range Plus,"43,059 mi.","$94,989",EchoPark Automotive Louisville Delivery Center,,0,2020,Tesla
...,...,...,...,...,...,...,...,...
244,Subaru Outback Premium,"14,752 mi.","$36,875",Twin City Dodge Chrysler Jeep RAM,3.7,60,2021,Subaru
245,Subaru Forester Touring,"19,862 mi.","$36,874",Dean Team Volkswagen Subaru of Ballwin,4.9,4686,2019,Subaru
246,Jeep Grand Cherokee Trailhawk,"15,478 mi.","$45,000",Lexus of Cool Springs,4.7,294,2020,Jeep
247,Dodge Durango R/T,"16,973 mi.","$50,481",Monster Motors,4.3,125,2020,Dodge


In [28]:
used_car['Model'] = used_car['Name'].str.split(' ').str[2]

In [29]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews,Made Year,Manufacturer,Model
0,Acura RDX Base,"29,014 mi.","$36,989",EchoPark Automotive Louisville Delivery Center,,0,2020,Acura,RDX
1,GMC Yukon SLT,"65,291 mi.","$49,402",Hertz Car Sales Indianapolis,3.8,4,2020,GMC,Yukon
2,Porsche Macan Base,"20,817 mi.","$55,989",Motor Werks - Barrington Auto Mall,4.8,4663,2019,Porsche,Macan
3,Ford Expedition Limited,"40,264 mi.","$20,990",Carvana-Touchless Delivery To Your Home,4.3,8507,2010,Ford,Expedition
4,Tesla Model X Long Range Plus,"43,059 mi.","$94,989",EchoPark Automotive Louisville Delivery Center,,0,2020,Tesla,Model
...,...,...,...,...,...,...,...,...,...
244,Subaru Outback Premium,"14,752 mi.","$36,875",Twin City Dodge Chrysler Jeep RAM,3.7,60,2021,Subaru,Outback
245,Subaru Forester Touring,"19,862 mi.","$36,874",Dean Team Volkswagen Subaru of Ballwin,4.9,4686,2019,Subaru,Forester
246,Jeep Grand Cherokee Trailhawk,"15,478 mi.","$45,000",Lexus of Cool Springs,4.7,294,2020,Jeep,Grand
247,Dodge Durango R/T,"16,973 mi.","$50,481",Monster Motors,4.3,125,2020,Dodge,Durango


In [25]:
used_car['Name'] = used_car['Model']

In [26]:
used_car

Unnamed: 0,Name,Mileage,Price,Dealer,Ratings,Reviews,Made Year,Manufacturer,Model
0,CR-V,"31,360 mi.","$33,489",Motor Werks - Barrington Auto Mall,4.8,4656,2018,Honda,CR-V
1,Telluride,"5,665 mi.","$58,278",Dean Team Volkswagen Subaru of Ballwin,4.9,4685,2021,Kia,Telluride
2,SRX,"99,736 mi.","$14,985",Hendersonville Auto Brokers,3.9,20,2012,Cadillac,SRX
3,Traverse,"48,705 mi.","$35,999",Vroom - Get It Delivered Nationwide,3.6,7870,2020,Chevrolet,Traverse
4,CR-V,"13,221 mi.","$34,995",Valley Honda,4.9,8750,2018,Honda,CR-V
...,...,...,...,...,...,...,...,...,...
244,XC90,"89,722 mi.","$27,996",Germain Honda of Dublin,4.8,4047,2016,Volvo,XC90
245,Grand,"44,587 mi.","$39,990",Mathews Chrysler Jeep Dodge Ram,4.0,34,2020,Jeep,Grand
246,Tahoe,"46,615 mi.","$54,987",Dan Cummins Chevrolet Buick of Paris,4.8,2628,2018,Chevrolet,Tahoe
247,Durango,"33,351 mi.","$37,500",Napleton's Mid Rivers Chrysler Jeep Dodge RAM ...,4.4,152,2018,Dodge,Durango


In [27]:
used_car = used_car[['Model','Manufacturer','Made Year','Mileage','Price','Dealer','Ratings','Reviews']]

In [28]:
used_car = used_car[['Manufacturer','Model','Made Year','Mileage','Price','Dealer','Ratings','Reviews']]

In [29]:
used_car

Unnamed: 0,Manufacturer,Model,Made Year,Mileage,Price,Dealer,Ratings,Reviews
0,Honda,CR-V,2018,"31,360 mi.","$33,489",Motor Werks - Barrington Auto Mall,4.8,4656
1,Kia,Telluride,2021,"5,665 mi.","$58,278",Dean Team Volkswagen Subaru of Ballwin,4.9,4685
2,Cadillac,SRX,2012,"99,736 mi.","$14,985",Hendersonville Auto Brokers,3.9,20
3,Chevrolet,Traverse,2020,"48,705 mi.","$35,999",Vroom - Get It Delivered Nationwide,3.6,7870
4,Honda,CR-V,2018,"13,221 mi.","$34,995",Valley Honda,4.9,8750
...,...,...,...,...,...,...,...,...
244,Volvo,XC90,2016,"89,722 mi.","$27,996",Germain Honda of Dublin,4.8,4047
245,Jeep,Grand,2020,"44,587 mi.","$39,990",Mathews Chrysler Jeep Dodge Ram,4.0,34
246,Chevrolet,Tahoe,2018,"46,615 mi.","$54,987",Dan Cummins Chevrolet Buick of Paris,4.8,2628
247,Dodge,Durango,2018,"33,351 mi.","$37,500",Napleton's Mid Rivers Chrysler Jeep Dodge RAM ...,4.4,152


In [31]:
used_car.to_excel("test.xlsx" , index = False)

#### Extracting the data in bulk

In [32]:

name = []
mileage = []
price = []
ratings = []
reviews = []
dealer = []

for j in range(1,101):
    
    website_new = 'https://www.cars.com/shopping/results/?page=' + str(j) + '&page_size=250&body_style_slugs[]=suv&dealer_id=&keyword=&list_price_max=&list_price_min=&makes[]=&maximum_distance=250&mileage_max=&sort=best_match_desc&stock_type=used&year_max=&year_min=&zip=47408'
    
    response = requests.get(website)
    
    soup = BeautifulSoup(response.content,'html.parser')
    
    results = soup.find_all('div' ,{'class' : 'vehicle-card' })
    
    for i in results:
    
        try:
            name.append(i.find('h2').get_text())
        except:
            name.append('NaN')
        
        try:
            mileage.append(i.find('div',{'class' : 'mileage'}).get_text())
        except:
            mileage.append('NaN')
    
        try:
            price.append(i.find('span',{'class' : 'primary-price'}).get_text())
        except:
            price.append('NaN')
        
        try:
            ratings.append(i.find('span',{'class' : 'sds-rating__count'}).get_text())
        except:
            ratings.append('NaN')
        
        try:
            reviews.append(i.find('span',{'class' : 'sds-rating__link'}).get_text())
        except:
            reviews.append('NaN')
        
        try:
            dealer.append(i.find('div',{'class' : 'dealer-name'}).get_text().strip())
        except:
            dealer.append('NaN')
    

In [33]:
used_car_new = pd.DataFrame({'Name' : name ,
                        'Mileage' : mileage,
                        'Price' : price,
                        'Dealer' : dealer,
                        'Ratings' : ratings,
                        'Reviews' : reviews})

In [34]:
len(used_car_new) ## Length of the data

24900

#### Data Cleaning

In [35]:
used_car_new['Reviews'] = used_car_new['Reviews'].apply(lambda x:x.strip('reviews)').strip('('))

In [37]:
used_car_new['Made Year'] = used_car_new['Name'].str.extract('(^\d*)')

In [38]:
used_car_new['Name'] = used_car_new['Name'].apply(lambda x: str(x)[4:])

In [40]:
used_car_new['Manufacturer'] = used_car_new['Name'].str.split(' ').str[1]

In [41]:
used_car_new['Model'] = used_car_new['Name'].str.split(' ').str[2]

In [42]:
used_car_new['Name'] = used_car_new['Model']

In [43]:
used_car_new = used_car_new[['Manufacturer','Model','Made Year','Mileage','Price','Dealer','Ratings','Reviews']]

In [44]:
used_car_new

Unnamed: 0,Manufacturer,Model,Made Year,Mileage,Price,Dealer,Ratings,Reviews
0,Honda,CR-V,2018,"31,360 mi.","$33,489",Motor Werks - Barrington Auto Mall,4.8,4656
1,Kia,Telluride,2021,"5,665 mi.","$58,278",Dean Team Volkswagen Subaru of Ballwin,4.9,4685
2,Cadillac,SRX,2012,"99,736 mi.","$14,985",Hendersonville Auto Brokers,3.9,20
3,Chevrolet,Traverse,2020,"48,705 mi.","$35,999",Vroom - Get It Delivered Nationwide,3.6,7870
4,Honda,CR-V,2018,"13,221 mi.","$34,995",Valley Honda,4.9,8750
...,...,...,...,...,...,...,...,...
24895,Volvo,XC90,2016,"89,722 mi.","$27,996",Germain Honda of Dublin,4.8,4047
24896,Jeep,Grand,2020,"44,587 mi.","$39,990",Mathews Chrysler Jeep Dodge Ram,4.0,34
24897,Chevrolet,Tahoe,2018,"46,615 mi.","$54,987",Dan Cummins Chevrolet Buick of Paris,4.8,2628
24898,Dodge,Durango,2018,"33,351 mi.","$37,500",Napleton's Mid Rivers Chrysler Jeep Dodge RAM ...,4.4,152


#### Saving the data into excel file

In [46]:
used_car_new.to_excel("used_car_dataset.xlsx" , index = False)