In [1]:
import os, time
import requests
import bs4
import pandas as pd

from PIL import Image
from pathlib import Path

## Exploring page layout

In [2]:
def get_soup(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    return soup

In [3]:
# url to page that has adverts without photos
no_photo_url = 'https://auto.ria.com/uk/legkovie/?page=16583'
soup = get_soup(no_photo_url)

In [4]:
# image URL
for idx, item in enumerate(soup.select('img[width="380"]'), 1):
    print(idx, item.get('src'))

1 https://cdn2.riastatic.com/photosnew/auto/photo/nissan_x-trail__317144487bx.jpg
2 https://cdn4.riastatic.com/photosnew/auto/photo/mazda_mpv__316879619bx.jpg
3 https://cdn0.riastatic.com/photosnew/auto/photo/vaz_2121__313667485bx.jpg
4 https://cdn3.riastatic.com/photosnew/auto/photo/subaru_outback__316637748bx.jpg
5 https://cdn0.riastatic.com/photosnew/auto/photo/bmw_520__316346255bx.jpg
6 https://cdn1.riastatic.com/photosnew/auto/photo/vaz_2101__312900731bx.jpg


In [5]:
elements = soup.select('.content-bar')
len(elements)

9

In [6]:
# Seems like some cars have no photo and as placeholders used blank images
# https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg
for elem in elements:
    print(elem.div.img.get('src'))

https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg
https://cdn2.riastatic.com/photosnew/auto/photo/nissan_x-trail__317144487bx.jpg
https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg
https://cdn4.riastatic.com/photosnew/auto/photo/mazda_mpv__316879619bx.jpg
https://cdn0.riastatic.com/photosnew/auto/photo/vaz_2121__313667485bx.jpg
https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg
https://cdn3.riastatic.com/photosnew/auto/photo/subaru_outback__316637748bx.jpg
https://cdn0.riastatic.com/photosnew/auto/photo/bmw_520__316346255bx.jpg
https://cdn1.riastatic.com/photosnew/auto/photo/vaz_2101__312900731bx.jpg


In [7]:
element = elements[0]
element

<div class="content-bar"> <a class="m-link-ticket" href="https://auto.ria.com/uk/auto_ford_transit_gruz_26234669.html"></a> <div class="ticket-photo"> <a class="photo-135x90 bg-no-photo" href="https://auto.ria.com/uk/auto_ford_transit_gruz_26234669.html" onclick="_gaq.push(['_trackEvent', 'BuSearch', 'ClickOn_ad_photo', 'go_to_ad_page'])" target="_self" title="Ford Transit груз. 2006 в Харкові"> <img alt="Ford Transit груз. 2006 в Харкові" src="https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg" title="Ford Transit груз. 2006 в Харкові"/> </a> </div> <div class="content"> <div class="head-ticket"> <div class="item ticket-title"> <a class="address" data-template-v="6" href="https://auto.ria.com/uk/auto_ford_transit_gruz_26234669.html" target="_self" title="Ford Transit груз. 2006 в Харкові"> <span class="blue bold">Ford Transit груз. </span> 2006 </a> </div> </div> <div class="price-ticket" data-main-currency="USD" data-main-price="5800"> <span class="size15"> <span class="bo

In [8]:
# image URL
element.div.img.get('src')

'https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg'

In [9]:
# price, currency
(element.select('.price-ticket')[0].get('data-main-price'), 
 element.select('.price-ticket')[0].get('data-main-currency'))

('5800', 'USD')

In [10]:
# brand
element.select('.address')[0].get_text()

' Ford Transit груз.  2006 '

In [11]:
# mileage
element.select('.definition-data li i[title="Пробіг"]')[0].find_parent().get_text()

' 290 тис. км '

In [12]:
# fuel type & engine volume
element.select('.definition-data li i[title="Тип палива"]')[0].find_parent().get_text()

' Дизель, 2 л. '

In [13]:
# transmission type
element.select('.definition-data li i[title="Тип коробки передач"]')[0].find_parent().get_text()

' Ручна / Механіка '

In [14]:
# publication date
element.select('.footer_ticket')[0].get_text()

'   26.04.2021          '

In [15]:
blank_photo = 'https://img6.auto.ria.com/images/nophoto/no-photo-295x195.jpg'
for elem in elements:
    img_url = elem.div.img.get('src')
    if img_url == blank_photo:
        continue
    
    brand = elem.select('.address')[0].get_text().strip()
    price = elem.select('.price-ticket')[0].get('data-main-price')
    currency = elem.select('.price-ticket')[0].get('data-main-currency')
    mileage = elem.select('.definition-data li i[title="Пробіг"]')[0].find_parent().get_text()
    f_type = elem.select('.definition-data li i[title="Тип палива"]')[0].find_parent().get_text()
    t_type = elem.select('.definition-data li i[title="Тип коробки передач"]')[0].find_parent().get_text()
    pub_date = elem.select('.footer_ticket')[0].get_text()
    
    print(img_url)
    print(brand, price, currency, mileage, f_type, t_type, pub_date, sep='|')
    print()
    

https://cdn2.riastatic.com/photosnew/auto/photo/nissan_x-trail__317144487bx.jpg
Nissan X-Trail  2002|6900|USD| 290 тис. км | Дизель, 2.2 л. | Ручна / Механіка |   26.04.2021          

https://cdn4.riastatic.com/photosnew/auto/photo/mazda_mpv__316879619bx.jpg
Mazda MPV  2005|7500|USD| 290 тис. км | 2 л. | Ручна / Механіка |   26.04.2021          

https://cdn0.riastatic.com/photosnew/auto/photo/vaz_2121__313667485bx.jpg
ВАЗ 2121  1989|3200|USD| 10 тис. км | Дизель, 1.9 л. | Ручна / Механіка |   26.04.2021          

https://cdn3.riastatic.com/photosnew/auto/photo/subaru_outback__316637748bx.jpg
Subaru Outback Active 2009|10400|USD| 241 тис. км | Дизель, 2 л. | Ручна / Механіка |   26.04.2021          

https://cdn0.riastatic.com/photosnew/auto/photo/bmw_520__316346255bx.jpg
BMW 520  2002|7500|USD| 269 тис. км | Бензин, 2.2 л. | Автомат |   26.04.2021          

https://cdn1.riastatic.com/photosnew/auto/photo/vaz_2101__312900731bx.jpg
ВАЗ 2101  1977|600|USD| 510 тис. км | Бензин | Ручна

In [16]:
# Detecting empty page
not_found_url = 'https://auto.ria.com/uk/legkovie/?page=16866'
nf_soup = get_soup(not_found_url)

nf_soup.select('.content-bar')

[]

## Test image loading

In [17]:
image_url = soup.select('img[width="380"]')[0].get('src')
image_url

'https://cdn2.riastatic.com/photosnew/auto/photo/nissan_x-trail__317144487bx.jpg'

In [18]:
!ls | grep *.jpg

In [19]:
img = Image.open(requests.get(image_url, stream=True).raw)
image_path = os.path.basename(image_url)
img.save(image_path)

In [20]:
!ls | grep *.jpg

nissan_x-trail__317144487bx.jpg


In [21]:
os.remove(image_path)

In [22]:
assert image_path not in os.listdir()

## Design scraper and ...scrape )

In [23]:
def get_soup(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    return soup


def download_image(image_url, image_id, upload_dir):
    img = Image.open(requests.get(image_url, stream=True).raw)
    image_name = f"{image_id}_{os.path.basename(image_url)}"
    image_path = os.path.join(upload_dir, image_name)
    img.save(image_path)

    
def autoria_scraper(dataset_dir):

    cars_counter = 0
    pages_counter = 0
    blank_images = 0
    empty_pages = 0
    seperator = '|'
    url = 'https://auto.ria.com/uk/legkovie/?page='
    
    with open(os.path.join(dataset_dir, 'autoria_data.csv'), 'w') as file:
        columns = ['id', 'brand', 'price', 'currency', 'mileage', 'fuel_type', 
                   'transmission_type', 'pub_date']
        file.write(seperator.join(columns) + '\n')
        
        img_upload_dir = os.path.join(dataset_dir, 'images')
        os.makedirs(img_upload_dir, exist_ok=True)
        
        for page_id in range(1, 16000):
            
            try:
                time.sleep(0.1)
                next_page_url = url + str(page_id)
                soup = get_soup(next_page_url)
                elements = soup.select('.content-bar')
                
                if not elements:
                    empty_pages += 1
                    continue

                for elem in elements:
                    img_url = elem.div.img.get('src')
                    if img_url == blank_photo:
                        blank_images += 1
                        continue

                    brand = elem.select('.address')[0].get_text().strip()
                    price = elem.select('.price-ticket')[0].get('data-main-price')
                    currency = elem.select('.price-ticket')[0].get('data-main-currency')
                    mileage = elem.select('.definition-data li i[title="Пробіг"]')[0].find_parent().get_text()
                    f_type = elem.select('.definition-data li i[title="Тип палива"]')[0].find_parent().get_text()
                    t_type = elem.select('.definition-data li i[title="Тип коробки передач"]')[0].find_parent().get_text()
                    pub_date = elem.select('.footer_ticket')[0].get_text()


                    download_image(img_url, cars_counter, img_upload_dir)
                    row = seperator.join([str(cars_counter), brand, price, 
                                          currency, mileage, f_type, t_type, pub_date])
                    file.write(row + '\n')

                    cars_counter += 1

                pages_counter += 1

                print(f"Pages scraped: {pages_counter}   cars_loaded: {cars_counter}", end='\r')
                
            except:
                pass
        # duplicate print without end='\r', to see final output
        print(f"Pages scraped: {pages_counter};   cars_loaded: {cars_counter}")
        print(f"Empty pages: {empty_pages};   blank images: {blank_images}")


In [24]:
"""
Files structure:

root
 |
  - datasets
     |
      - autoria
        |
         - autoria_data.csv
         - images
  - scrapers
    |
     - autoria_scraper.ipynb (current location)
"""

cwd = Path(os.getcwd())
dataset_dir = cwd.parent/'datasets'/'autoria'
dataset_dir.exists()

True

In [222]:
# Output after scraping 16000* pages:

# Pages scraped: 15511;   cars_loaded: 146814
# Empty pages: 14;   blank images: 337
# Time taken: 40958.705729 seconds


# Uncomment lines below to start scraping

# start = time.perf_counter()
# autoria_scraper(dataset_dir)
# stop = time.perf_counter()

print(f"Time taken: {stop - start} seconds")

Pages scraped: 15511;   cars_loaded: 146814
Empty pages: 14;   blank images: 337
Time taken: 40958.705729 seconds


## Quick check of scraped data

In [25]:
data_csv = dataset_dir/'autoria_data.csv'
data_csv.exists()

True

In [26]:
df = pd.read_csv(data_csv, sep='|')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146814 entries, 0 to 146813
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 146814 non-null  int64 
 1   brand              146814 non-null  object
 2   price              146814 non-null  int64 
 3   currency           146814 non-null  object
 4   mileage            146814 non-null  object
 5   fuel_type          146814 non-null  object
 6   transmission_type  146814 non-null  object
 7   pub_date           146814 non-null  object
dtypes: int64(2), object(6)
memory usage: 9.0+ MB
