# # Web scrapping data from website "cars.com"

In [2]:
# we need to use the urlopen module inside the urllib.request package
from urllib.request import urlopen as uReq
# we are importing the BeautifulSoup module from the bs4 package
from bs4 import BeautifulSoup as soup
# variable my_url is where we are storing the url
my_url = "https://www.cars.com/research/sedan/"
#variable uClient is used to pass the url into the uReq(urlopen)
uClient = uReq(my_url)
# variable page_html is used to open the html web page
page_html = uClient.read()
# closing the html webpage after it has been read
uClient.close()
# page soup variable is storing the parser for the html file
page_soup = soup(page_html, "html.parser")

# navigating through the html file to find all h4 tags with class name cui-delta
containers_1= page_soup.findAll("h4", {"class": "cui-delta"})

# navigating through the html file to find all "p" tags with class name "msrp" and "no-msrp-range"
containers_2= page_soup.findAll("p", {"class":["msrp","no-msrp-range"]})

# navigating through the html file to find all "p" tags and "cars-star-rating" with class name "no-cr" and "mlp" respectively
containers_3= page_soup.findAll(["p","cars-star-rating"], {"class":["no-cr","mlp"]})

# navigating through the html file to find all "p" tags with class name "mpg"
containers_4= page_soup.findAll("p", {"class":"mpg"})

# variable file_name is storing the csv file where the dataset will be saved
file_name = "cars.csv"
# variable f is used to open file cars.csv in the write mode
f = open(file_name, "w")
# variable headers stores the headers for our dataset
headers = "BRAND NAME, PRICE, CAR RATING, MILE PER GALLON\n"
# we write or save our headers first before the rest data
f.write(headers)
# create a counter
i = 0
# using a for loop, looping through data in containers_1
for data in containers_1:
	# exception handling of indexerror
	try:
		# increment of counter
		i = i + 1
		# price of cars stored here
		price = containers_2[i].text
		# car rating is stored here
		car_rating = containers_3[i].text.strip()
		# mile per gallon of car is stored here
		mile_per_gallon = containers_4[i].text
		# the car name are stored here
		car_name = containers_1[i].text
		
		#print("brand name: " + car_name)
		#print("MSRP: " + price)
		#print("car rating: " + car_rating)
		#print("MPG: " + mile_per_gallon + "\n")

		# we will write the data from the variables above into our csv file, we concatenate a delimeter "," as seperator for our csv file 
		# note: concatenate "\n" at the end to ensure a new line is started after writing first line of data
		f.write(car_name + "," + price.replace(","," ") + "," + car_rating + "," + mile_per_gallon + "\n")
	except IndexError:
		# if an index error occurs exception handles it by pass(ignoring it) and continue
		pass 
		continue





# Using pandas library, read scrapped csv file

In [6]:
import pandas as pd
data = pd.read_csv("cars.csv")

# Visualisation of scrapped data

In [8]:
df = pd.DataFrame(data)
df

Unnamed: 0,BRAND NAME,PRICE,CAR RATING,MILE PER GALLON
0,2021 BMW 750,$103 000,Not Available,19
1,2021 Audi S8,$130 900,Not Available,16
2,2021 BMW M5,No current listings. Get resale value.,Not Available,17
3,2021 Bentley Flying Spur,$216 700,Not Available,15
4,2021 Honda Clarity Plug-In Hybrid,$33 400,Not Available,42
5,2021 Maserati Ghibli,$72 190,Not Available,19
6,2021 Polestar 2,$59 900,Not Available,Coming Soon
7,2021 Subaru Legacy,$22 895,Not Available,30
8,2021 Volvo S90,$51 550,Not Available,25
9,2021 Audi A4,$39 100,Not Available,30


# Visualise the first 5 rows on the data

In [10]:
df1 = df.head()
df1

Unnamed: 0,BRAND NAME,PRICE,CAR RATING,MILE PER GALLON
0,2021 BMW 750,$103 000,Not Available,19
1,2021 Audi S8,$130 900,Not Available,16
2,2021 BMW M5,No current listings. Get resale value.,Not Available,17
3,2021 Bentley Flying Spur,$216 700,Not Available,15
4,2021 Honda Clarity Plug-In Hybrid,$33 400,Not Available,42


In [11]:
print(df1.describe)


<bound method NDFrame.describe of                           BRAND NAME                                   PRICE  \
0                       2021 BMW 750                                $103 000   
1                       2021 Audi S8                                $130 900   
2                        2021 BMW M5  No current listings. Get resale value.   
3           2021 Bentley Flying Spur                                $216 700   
4  2021 Honda Clarity Plug-In Hybrid                                 $33 400   

      CAR RATING  MILE PER GALLON  
0  Not Available               19  
1  Not Available               16  
2  Not Available               17  
3  Not Available               15  
4  Not Available               42  >
