# Scraping Project

The aim of this project is to build a database of the items on sale in an online store, in order to have relevant data when publishing future listings. In this case I am interested in selling my car, so I scrape the "peugeot 207" results page, but the same code could be used for other items with slight changes.

In [271]:
from bs4 import BeautifulSoup
import requests
from requests import get
import time 
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from random import randint
from time import sleep
from concurrent.futures import Future

In [272]:
# Listing page of the product to scrape; the search term is the last path
# segment of the URL ("peugeot-207" here).
url= "https://listado.mercadolibre.com.uy/peugeot-207"
# Download that page and parse its HTML.
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

In [273]:
# Build the list of every results-page URL by following the "next page" button.
def get_all_urls(url):
    """Return the URLs of all result pages reachable from ``url``.

    Starting at ``url``, each page is downloaded and searched for the
    pagination "next" link (``.andes-pagination__button--next > a``). The
    link's ``href`` is recorded and followed until a page has no such link.

    Args:
        url: URL of the first results page.

    Returns:
        list: ``url`` followed by the URL of every later page, in the order
        the pages were visited.
    """
    next_selector = ".andes-pagination__button--next > a"

    # The first page is known up front; the loop only discovers later ones.
    url_page_list = [url]
    # URLs already queued, so a "next" link that points back to one of them
    # ends the crawl instead of looping forever.
    seen = {url}

    while True:
        inner_response = requests.get(url)
        inner_soup = BeautifulSoup(inner_response.text, 'html.parser')

        # Query the pagination link once per page and reuse the result.
        next_links = inner_soup.select(next_selector)
        if not next_links:
            # No "next" button: this was the last page.
            break

        url = next_links[0]['href']
        if url in seen:
            break
        seen.add(url)
        url_page_list.append(url)

    return url_page_list

In [274]:
# Collect the URL of every results page, starting from the first listing URL.
url_page_list=get_all_urls(url)

In [275]:
# For every results page, collect the link of each listed item (here, each car).

def get_car_links(url_page_list):
    """Return the item links found on the given result pages.

    Each page in ``url_page_list`` is downloaded and parsed, and the ``href``
    of every ``<a>`` tag carrying the item-link class is collected.

    Args:
        url_page_list: iterable of results-page URLs.

    Returns:
        list: the ``href`` values, in page order and then document order.
    """
    # The class string is matched exactly as written, trailing space included.
    item_link_attrs = {'class': "item__info-link item__js-link "}

    link_list = []
    for page_url in url_page_list:
        page_soup = BeautifulSoup(get(page_url).text, 'html.parser')
        link_list.extend(
            anchor.get('href')
            for anchor in page_soup.find_all('a', attrs=item_link_attrs)
        )

    return link_list

In [276]:
# Gather the link of each individual car listing across all result pages.
link_list=get_car_links(url_page_list)

In [277]:
# Check how many car links were found.
print(len(link_list))

231


In [278]:
# Scrape the attributes of interest from each car's page.

def _scrape_car(url):
    """Download one car page and return its attributes keyed by column name."""
    html_soup = BeautifulSoup(get(url).text, 'html.parser')
    record = {'Link': url}

    # Title and price: the first <h1> and the first price fraction on the page.
    record['Car Title'] = html_soup.find_all('h1')[0].text
    record['Price'] = html_soup.find_all('span', class_="price-tag-fraction")[0].text

    # Spec values are read by their position in the page's "specs-item" list.
    # NOTE(review): this assumes every listing shows the same eight specs in
    # the same order - confirm against the site.
    spec_columns = ('Year', 'Km', 'Brand', 'Model',
                    'Version', 'Tipo', 'Motor', 'Potencia')
    detail_containers = html_soup.find_all('li', class_="specs-item")
    for position, column in enumerate(spec_columns):
        record[column] = detail_containers[position].span.text

    # Listings without a description get a bare newline, which the later
    # lstrip('\n') clean-up turns into an empty string. (The previous "/n"
    # placeholder was a typo and ended up in the data as literal text.)
    description_box = html_soup.find('div', class_="item-description__text")
    record['Description'] = description_box.text if description_box is not None else "\n"

    return record


def get_attributes(link_list):
    """Scrape every car page in ``link_list`` into a DataFrame.

    Args:
        link_list: iterable of car-page URLs.

    Returns:
        pandas.DataFrame: one row per URL, in input order, with the columns
        'Car Title', 'Year', 'Price', 'Km', 'Brand', 'Model', 'Version',
        'Tipo', 'Motor', 'Potencia', 'Link' and 'Description'. An empty
        ``link_list`` gives an empty DataFrame with those columns.

    Raises:
        IndexError: if a page has no <h1>, no price tag, or fewer than eight
            spec items.
    """
    columns = ['Car Title', 'Year', 'Price', 'Km', 'Brand', 'Model',
               'Version', 'Tipo', 'Motor', 'Potencia', 'Link', 'Description']

    records = [_scrape_car(url) for url in link_list]

    # Build the frame once, after the loop; passing the columns explicitly
    # fixes their order and keeps them present when there are no rows.
    return pd.DataFrame(records, columns=columns)


In [279]:
# Scrape every car page sequentially into a DataFrame.
test_cars=get_attributes(link_list)

In [280]:
# Tidy the scraped text: trim the newline/tab padding around titles, drop
# leading newlines from descriptions, and prefix prices with the currency.
test_cars['Car Title'] = test_cars['Car Title'].map(lambda title: title.strip('\n\t'))
test_cars['Description'] = test_cars['Description'].map(lambda text: text.lstrip('\n'))
test_cars['Price'] = 'US$ ' + test_cars['Price']

In [281]:
# Preview the first row of the cleaned DataFrame.
test_cars.head(1)

Unnamed: 0,Car Title,Year,Price,Km,Brand,Model,Version,Tipo,Motor,Potencia,Link,Description
0,Peugeot 207 1.6 Coupe Turbo 150 Cv,2009,US$ 30.500,69.000 km,Peugeot,207,1.6 Coupe Turbo 150 Cv,Coupé,1.6,150hp,https://auto.mercadolibre.com.uy/MLU-457540561...,2do dueño SIEMPRE ATENDIDO EN SADAR\n


In [282]:
# Export the scraped data to CSV.
test_cars.to_csv('Peugeot_Database.csv')

In [283]:
# Baseline for the speed-up below: time the scrape without threading.
import time

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() follows the wall clock and can jump if that is adjusted.
start_time = time.perf_counter()

# NOTE: this rebinds test_cars to a fresh, uncleaned DataFrame.
test_cars = get_attributes(link_list)

end_time = time.perf_counter()

print("Process time: ", round(end_time - start_time), " seconds")

Process time:  160  seconds


In [292]:
# Threaded variant: the per-car loop body of get_attributes becomes a function
# that a ThreadPoolExecutor can run on many links at once.

import threading

# Shared result columns filled by get_attributes2. Position i of every list
# belongs to the same car.
years = []
kms = []
brands = []
models = []
versions = []
types = []
motors = []
powers = []
descriptions = []
titles = []
prices = []
urls = []

# Guards the lists above so that one car's values are appended as a unit.
_results_lock = threading.Lock()


def get_attributes2(link):
    """Scrape one car page and append its attributes to the shared lists.

    All values are parsed first and stored afterwards under a lock. Without
    that, two worker threads could interleave their appends (leaving the
    lists misaligned), and an error halfway through a page would leave the
    lists with different lengths.

    Args:
        link: URL of the car page.

    Returns:
        None. Results are appended to the module-level lists.

    Raises:
        IndexError: if the page has no <h1>, no price tag, or fewer than
            eight spec items. Nothing is appended in that case.
    """
    response = get(link)
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # Title and price: the first <h1> and the first price fraction on the page.
    title = html_soup.find_all('h1')[0].text
    price = html_soup.find_all('span', class_="price-tag-fraction")[0].text

    # Spec values are read by their position in the page's "specs-item" list.
    # NOTE(review): this assumes every listing shows the same eight specs in
    # the same order - confirm against the site.
    detail_containers = html_soup.find_all('li', class_="specs-item")
    year, km, brand, model, version, tipo, motor, power = [
        detail_containers[position].span.text for position in range(8)
    ]

    # Listings without a description get a bare newline, which the later
    # lstrip('\n') clean-up turns into an empty string. (The previous "/n"
    # placeholder was a typo and ended up in the data as literal text.)
    description_box = html_soup.find('div', class_="item-description__text")
    description = description_box.text if description_box is not None else "\n"

    with _results_lock:
        titles.append(title)
        prices.append(price)
        years.append(year)
        kms.append(km)
        brands.append(brand)
        models.append(model)
        versions.append(version)
        types.append(tipo)
        motors.append(motor)
        powers.append(power)
        urls.append(link)
        descriptions.append(description)
        


In [293]:
# Time the same scrape with a thread pool. In the recorded runs the pool cuts
# the process time from about 160 seconds to about 31 seconds.
# NOTE: get_attributes2 appends to module-level lists, so re-run the cell that
# defines them before re-running this one, otherwise rows are duplicated.
import time

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()

# list() drains the map iterator, so the timing covers every page and an
# exception raised in a worker is re-raised here.
with ThreadPoolExecutor() as pool:
    evs = list(pool.map(get_attributes2, link_list))

end_time = time.perf_counter()

print("Process time: ", round(end_time - start_time), " seconds")

Process time:  31  seconds


In [294]:
# Sanity check: both lists should hold one entry per scraped car.
print(len(titles))
print(len(prices))

231
231


In [295]:
# Placeholder binding; test_cars2 is rebound to a DataFrame in the next cell.
test_cars2=[]

In [296]:
# Assemble the lists filled by the threaded scrape into a second DataFrame,
# one column per list.
test_cars2 = pd.DataFrame({
    'Car Title': titles,
    'Year': years,
    'Price': prices,
    'Km': kms,
    'Brand': brands,
    'Model': models,
    'Version': versions,
    'Tipo': types,
    'Motor': motors,
    'Potencia': powers,
    'Link': urls,
    'Description': descriptions,
})

In [300]:
# Preview the first two rows of the threaded results before cleaning.
test_cars2.head(2)

Unnamed: 0,Car Title,Year,Price,Km,Brand,Model,Version,Tipo,Motor,Potencia,Link,Description
0,\n\t\tPeugeot 207 Compact 1.4\n\t,2010,9.45,85.000 km,Peugeot,207,207 compact,Nafta,5,,https://auto.mercadolibre.com.uy/MLU-457743368...,\n# PEUGEOT 207 COMPACT 1.4 ## año 2010# 86mil...
1,\n\t\tPeugeot 207 Full Frances 2013 Impecable\n\t,2013,10.3,100.000 km,Peugeot,207,1.4,Nafta,4,,https://auto.mercadolibre.com.uy/MLU-457128661...,\n207 frances full 2 dueños service oficiales ...


In [301]:
# Apply the same clean-up to the threaded results, then preview the first row.
test_cars2['Car Title'] = test_cars2['Car Title'].map(lambda title: title.strip('\n\t'))
test_cars2['Description'] = test_cars2['Description'].map(lambda text: text.lstrip('\n'))
test_cars2['Price'] = 'US$ ' + test_cars2['Price']
test_cars2.head(1)

Unnamed: 0,Car Title,Year,Price,Km,Brand,Model,Version,Tipo,Motor,Potencia,Link,Description
0,Peugeot 207 Compact 1.4,2010,US$ 9.450,85.000 km,Peugeot,207,207 compact,Nafta,5,,https://auto.mercadolibre.com.uy/MLU-457743368...,# PEUGEOT 207 COMPACT 1.4 ## año 2010# 86mil k...


In [302]:
# Number of car links that were passed to the scraper.
len(link_list)

231

In [303]:
# Export the threaded results as a new file, to check that all the lists were
# appended in order.
test_cars2.to_csv('Peugeot_Database_2.csv')

#Done!!