# OTOMOTO.pl - Web Scraping
<b>Małgorzata Stolarska</b><br>
malgorzata.stolarska@gmail.com

Purpose: obtaining data from www.OtoMoto.pl to analyze car offers. A wide range of data is collected. 

For performance reasons, the script is run on Google Colab and the data is saved in stages to Google Drive in case the session is broken.

## Imports, setting parameters and options

In [None]:
# Set True for a quick test run, set False for the full web scraping run
TEST_MODE = True

In [None]:
# Mount Google Drive so that the collected data can be saved there
import os

from google.colab import drive
drive.mount('/content/gdrive')

# Directory for saving data; test runs go to a separate "Test" sub-folder.
# The path is built without a trailing slash, so joins such as
# "/".join([path, "car_brands.obj"]) never produce a double slash.
path = "/content/gdrive/MyDrive/ForColab/OtoMoto"
if TEST_MODE:
    path = "/".join([path, "Test"])

# open(..., "wb") does not create missing folders, so create the directory now
os.makedirs(path, exist_ok=True)
path

Mounted at /content/gdrive


'/content/gdrive/MyDrive/ForColab/OtoMoto/Test'

In [None]:
# Install Chromium, its driver (chromedriver) and Selenium.
# Lines starting with "!" are shell commands executed by the notebook, not Python.
!apt update
!apt install chromium-chromedriver
!pip install selenium

[33m0% [Working][0m            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u[0m                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u[0m[33m0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
[3

In [None]:
# Other imports
import requests
import json
import time
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm # progress bar
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from requests.exceptions import TooManyRedirects

# Chrome configuration passed to every driver that is launched
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Show all columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Waiting times (in seconds, used with time.sleep) and retry limit
PAUSE_TIME = 0.1              # pause after a page action
ACCEPT_BUTTON_PAUSE_TIME = 1  # pause that lets the GDPR clause show up
NUMBER_OF_TRIES = 20          # attempts at accepting the GDPR clause

In [None]:
# Addresses used by the cells below
link = r"https://www.otomoto.pl/osobowe" # the main listing page; a brand name is appended to it for brand searches
ajax_link = r"https://www.otomoto.pl/ajax/jsdata/params/" # response from which the list of car brands is parsed

## Gathering links to offers

In [None]:
def gather_hrefs(link, options=None, max_pages=None):
    """
    Collect links to offers, starting from the listing page 'link'.

    Because the website presents max. 500 pages, one call can collect max. 16,000 offers:
    - if 'link' = "https://www.otomoto.pl/osobowe", it returns offers of various brands,
    - if 'link' = "https://www.otomoto.pl/osobowe/jeep", it returns offers of the given brand.

    Parameters:
    - link: address of the first listing page,
    - options: Chrome options passed to webdriver.Chrome (e.g. headless mode),
    - max_pages: maximum number of listing pages to visit; None means
      "continue until no next-page button is found".

    Returns a NumPy array of strings holding the "data-href" attribute of
    every <article> element found (duplicates are not removed here).
    """
    # Local import: the locator constants are only needed by this function.
    # The find_element_by_* helpers used previously were removed in Selenium 4.
    from selenium.webdriver.common.by import By

    print("Starting page: %s" % link)

    # Launch driver
    driver = webdriver.Chrome(options=options)
    hrefs = []  # a plain list; appending to a NumPy array copies it every time

    try:
        driver.get(link)
        time.sleep(ACCEPT_BUTTON_PAUSE_TIME)

        # Accept the clause (the Accept button is shown with delay)
        for _ in range(NUMBER_OF_TRIES):
            try:
                button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
                button.click()
            except (NoSuchElementException, ElementClickInterceptedException):
                # Button not present yet, or still covered by another element
                time.sleep(PAUSE_TIME)
                continue
            print("Progress... GDPR clause was accepted")
            time.sleep(PAUSE_TIME)
            break

        page = 1
        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            hrefs.extend(article["data-href"]
                         for article in soup.find_all("article")
                         if article.has_attr("data-href"))

            # Progress
            if (max_pages is not None and page < 10) or (page % 20 == 0):
                print("Progress... Page: %i, number of hrefs: %i" % (page, len(hrefs)))

            # Next page
            try:
                button = driver.find_element(By.XPATH, r'//span[@class="icon-arrow_right"]')
            except NoSuchElementException:
                # No next-page button: the last page has been processed
                print("Number of hrefs: %i" % len(hrefs))
                break

            # Click the button by JavaScript, which also works if it is hidden
            driver.execute_script("arguments[0].click();", button)
            time.sleep(PAUSE_TIME)

            page += 1
            if max_pages is not None and page > max_pages:
                break
    finally:
        # quit() ends the whole browser session, also when an error interrupted the run
        driver.quit()

    return np.array(hrefs, dtype=str)

In [None]:
# Brands of cars: the response contains "var searchConditions = {...}",
# whose JSON part is cut out and parsed
params = requests.get(ajax_link, timeout=30)
# Fail with a clear HTTP error instead of an IndexError in the split below
params.raise_for_status()
params = json.loads(params.text.split('var searchConditions = ')[1].split(';var searchConditionsAdding')[0])

# Brand names that have to be replaced to obtain a working link:
# - the word "warszawa" indicates a city, not a brand,
# - the link with "other" does not work properly.
brand_replacements = {"warszawa": "marka_warszawa", "other": "inny"}
car_brands = [brand_replacements.get(p, p) for p in params['values']['573']['571']]

print("Total number of brands: %i" % len(car_brands))

# Dump brands to pickle
with open("/".join([path, "car_brands.obj"]), "wb") as fp:
    pickle.dump(car_brands, fp)

Total number of brands: 111


In [None]:
# OR:
# Read the brand list back from the pickle file instead of downloading it again
# (pickle files should only be loaded from a trusted location)
with open("/".join([path, "car_brands.obj"]), 'rb') as fp:
    car_brands = pickle.load(fp)

In [None]:
# Show every brand together with its position in the list
dict(enumerate(car_brands))

{0: 'abarth',
 1: 'acura',
 2: 'aixam',
 3: 'alfa-romeo',
 4: 'alpine',
 5: 'aston-martin',
 6: 'audi',
 7: 'austin',
 8: 'autobianchi',
 9: 'bac',
 10: 'bentley',
 11: 'bmw',
 12: 'bollore',
 13: 'brilliance',
 14: 'buick',
 15: 'cadillac',
 16: 'casalini',
 17: 'chatenet',
 18: 'chevrolet',
 19: 'chrysler',
 20: 'citroen',
 21: 'cupra',
 22: 'dacia',
 23: 'daewoo',
 24: 'daihatsu',
 25: 'de-lorean',
 26: 'dfsk',
 27: 'dkw',
 28: 'dodge',
 29: 'ds-automobiles',
 30: 'faw',
 31: 'ferrari',
 32: 'fiat',
 33: 'ford',
 34: 'gaz',
 35: 'gmc',
 36: 'grecav',
 37: 'holden',
 38: 'honda',
 39: 'hummer',
 40: 'hyundai',
 41: 'infiniti',
 42: 'isuzu',
 43: 'iveco',
 44: 'jaguar',
 45: 'jeep',
 46: 'kia',
 47: 'lada',
 48: 'lamborghini',
 49: 'lancia',
 50: 'land-rover',
 51: 'lexus',
 52: 'ligier',
 53: 'lincoln',
 54: 'lotus',
 55: 'maserati',
 56: 'maybach',
 57: 'mazda',
 58: 'mclaren',
 59: 'mercedes-benz',
 60: 'mercury',
 61: 'mg',
 62: 'microcar',
 63: 'mini',
 64: 'mitsubishi',
 65: 'mo

In [None]:
# ----------------------------------------------
# Test mode: restrict the run to the first three brands
if TEST_MODE:
    car_brands = car_brands[:3]
# ----------------------------------------------

In [None]:
# Collect offer links one brand at a time; every brand gets its own dump file
start = 0  # raise this index to continue a loop that was broken
for brand in car_brands[start:]:
    brand_link = "/".join([link, brand])

    # Passing the links through a set removes duplicates, if any exist
    hrefs = list(set(gather_hrefs(brand_link, options, max_pages=None)))

    # Store the links of this brand in a pickle file
    hrefs_file = "".join([path, "/hrefs_", brand, ".obj"])
    with open(hrefs_file, "wb") as fp:
        pickle.dump(hrefs, fp)

    print("-" * 30)

Starting page: https://www.otomoto.pl/osobowe/abarth
Progress... GDPR clause was accepted
Number of hrefs: 42
------------------------------
Starting page: https://www.otomoto.pl/osobowe/acura
Progress... GDPR clause was accepted
Number of hrefs: 15
------------------------------
Starting page: https://www.otomoto.pl/osobowe/aixam
Progress... GDPR clause was accepted
Number of hrefs: 102
------------------------------


## Obtaining data from offers

In [None]:
def _text_or_nan(tag):
    """Return the stripped text of a parsed HTML tag, or NaN when the tag is missing."""
    return tag.get_text().strip() if tag is not None else np.nan


def gather_offerts(hrefs, with_description=False):
    """
    Collect the data of the offers located at the links in 'hrefs'.

    Collected data (one row per offer):
    - link (href),
    - price details,
    - seller details,
    - car parameters,
    - additional features (value 1 when the feature is listed).
    The parameter and feature sets differ between offers, so the resulting
    table has the union of all columns, with NaN where a value is absent.

    If with_description=True, the id, description and link of each offer are
    additionally collected in a second table.

    If the id is not found on the page (the offer has been removed), the
    offer is skipped. Offers whose page cannot be downloaded are skipped too,
    with a message.

    Returns a tuple (df, df_desc):
    - df: DataFrame with the offers,
    - df_desc: DataFrame with columns "ad_id", "description", "href",
      or None when with_description=False.
    """
    col = ["href", "price_evaluation", "price", "currency", "price_details",
           "seller_type", "seller_name"]

    # Rows are gathered in lists and turned into DataFrames once at the end:
    # DataFrame.append copied the whole table per row and was removed in pandas 2.0.
    rows = []
    desc_rows = []

    page = 0
    total = len(hrefs)

    for href in hrefs:
        try:
            www = requests.get(href, timeout=30)
        except requests.exceptions.RequestException as exception:
            # Prints e.g. "TooManyRedirects for href: ..." and goes to the next offer
            print("%s for href: %s" % (type(exception).__name__, href))
            continue

        soup = BeautifulSoup(www.text, 'html.parser')

        # ID of article
        ad_id = soup.find("span", id="ad_id", class_="offer-meta__value")
        if ad_id is None:
            continue  # if the offer is removed, go to the next offer
        ad_id = ad_id.get_text().strip()

        # Price is below/above/in average
        price_evaluation = soup.find("div", {"data-widget": "PriceEvaluation/Display"})
        if price_evaluation is not None:
            price_evaluation = json.loads(price_evaluation.get("data-props"))["indicator"]
        else:
            price_evaluation = np.nan

        # The price is the node directly following the opening tag; guard
        # against a missing tag before asking for its next element.
        price_tag = soup.find("span", class_="offer-price__number")
        price = price_tag.next_element if price_tag is not None else None

        val = [
            href,
            price_evaluation,
            price.strip() if isinstance(price, str) else np.nan,
            _text_or_nan(soup.find("span", class_="offer-price__currency")),
            _text_or_nan(soup.find("span", class_="offer-price__details")),
            _text_or_nan(soup.find("small", class_="seller-box__seller-type")),
            _text_or_nan(soup.find("h2", class_="seller-box__seller-name")),
        ]

        # The parameter sets differ for offers
        parm_col = [p.get_text().strip() for p in soup.find_all("span", class_="offer-params__label")]
        parm_val = [p.get_text().strip() for p in soup.find_all("div", class_="offer-params__value")]
        if len(parm_col) != len(parm_val):
            # Labels and values cannot be paired reliably for this page
            print("ValueError [%i parameter labels, %i parameter values] for href: %s"
                  % (len(parm_col), len(parm_val), href))
            continue

        # The features sets differ for offers
        feat_col = [p.get_text().strip() for p in soup.find_all("li", class_="offer-features__item")]

        names = col + parm_col + feat_col
        values = val + parm_val + [1] * len(feat_col)

        # Remove duplicated columns (duplicated columns are the result of an
        # error on the page): every column whose name occurs more than once
        # is left out of the row entirely.
        occurrences = {}
        for name in names:
            occurrences[name] = occurrences.get(name, 0) + 1
        rows.append({name: value for name, value in zip(names, values)
                     if occurrences[name] == 1})

        if with_description:
            description = _text_or_nan(soup.find("div", class_="offer-description__description"))
            desc_rows.append([ad_id, description, href])

        page += 1
        if page % 50 == 0:
            print("Progress... Page: %i / %i" % (page, total))

    df = pd.DataFrame(rows)
    if with_description:
        # Naming the columns here also works when no offer was collected
        df_desc = pd.DataFrame(desc_rows, columns=["ad_id", "description", "href"])
    else:
        df_desc = None

    return df, df_desc

In [None]:
# Download the offers one brand at a time, using the links dumped earlier
start = 0  # raise this index to continue a loop that was broken
for brand in car_brands[start:]:

    # Read the links of this brand from their pickle file
    hrefs_file = "".join([path, "/hrefs_", brand, ".obj"])
    with open(hrefs_file, 'rb') as fp:
        hrefs = pickle.load(fp)
    print("Brand: %s, number of hrefs: %i" % (brand, len(hrefs)))

    offers, desc = gather_offerts(hrefs, with_description=False)

    # Offers are always stored; descriptions only when they were collected
    dumps = [("/offers_", offers)]
    if desc is not None:
        dumps.append(("/desc_", desc))
    for prefix, payload in dumps:
        with open("".join([path, prefix, brand, ".obj"]), "wb") as fp:
            pickle.dump(payload, fp)

    print("Brand: %s, number of offers downloaded: %i" % (brand, len(offers)))

Brand: abarth, number of hrefs: 42
Brand: abarth, number of offers downloaded: 42
Brand: acura, number of hrefs: 15
Brand: acura, number of offers downloaded: 15
Brand: aixam, number of hrefs: 102
Progress... Page: 50 / 102
Progress... Page: 100 / 102
Brand: aixam, number of offers downloaded: 102


## Summary and merging offers into one file

In [None]:
def _load_dump(file_name, default):
    """Return the unpickled content of 'file_name', or 'default' if the file does not exist."""
    try:
        with open(file_name, 'rb') as fp:
            return pickle.load(fp)
    except FileNotFoundError:
        return default


total_number_hrefs = 0
total_number_offers = 0
total_number_desc = 0
offer_frames = []

for index, brand in enumerate(car_brands):

    # Load data from dumps; each file is read independently, so a missing
    # file (e.g. no descriptions were collected) does not hide the others
    h = _load_dump("".join([path, "/hrefs_", brand, ".obj"]), [])
    o = _load_dump("".join([path, "/offers_", brand, ".obj"]), [])
    d = _load_dump("".join([path, "/desc_", brand, ".obj"]), [])
    number_desc = len(d) if d is not None else 0

    if len(o) > 0:
        offer_frames.append(o)

    print ("{:<15} index:{:>3} | HREFS:{:>6} | OFFERS:{:>6} | STATUS:{:>5} | DESC:{:>6}"\
           .format(brand, index, len(h), len(o), \
                  ("OK" if len(o) > 0 else "None"), number_desc))
    total_number_hrefs += len(h)
    total_number_offers += len(o)
    total_number_desc += number_desc

# One concatenation at the end replaces the repeated DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0)
offers = pd.concat(offer_frames, ignore_index=True) if offer_frames else pd.DataFrame()

# Differences between number of hrefs and offers are natural
# (some offers have been removed in the meantime)
ratio = round(total_number_offers/total_number_hrefs,1) if total_number_hrefs else float("nan")
print("{:<26}| HREFS:{:>6} | OFFERS:{:>6} | RATIO:{:>6} | DESC:{:>6}"\
      .format("TOTAL NUMBER:", total_number_hrefs, total_number_offers, \
              ratio, total_number_desc))

abarth          index:  0 | HREFS:    41 | OFFERS:    40 | STATUS:   OK | DESC:     0
acura           index:  1 | HREFS:    15 | OFFERS:    15 | STATUS:   OK | DESC:     0
aixam           index:  2 | HREFS:   103 | OFFERS:   103 | STATUS:   OK | DESC:     0
alfa-romeo      index:  3 | HREFS:  1328 | OFFERS:  1311 | STATUS:   OK | DESC:     0
alpine          index:  4 | HREFS:     6 | OFFERS:     6 | STATUS:   OK | DESC:     0
aston-martin    index:  5 | HREFS:    18 | OFFERS:    18 | STATUS:   OK | DESC:     0
audi            index:  6 | HREFS: 14872 | OFFERS: 14399 | STATUS:   OK | DESC:     0
austin          index:  7 | HREFS:     4 | OFFERS:     4 | STATUS:   OK | DESC:     0
autobianchi     index:  8 | HREFS:     2 | OFFERS:     2 | STATUS:   OK | DESC:     0
bac             index:  9 | HREFS:     1 | OFFERS:     1 | STATUS:   OK | DESC:     0
bentley         index: 10 | HREFS:    63 | OFFERS:    60 | STATUS:   OK | DESC:     0
bmw             index: 11 | HREFS: 15879 | OFFERS: 153

In [None]:
# Number of rows and columns of the combined offers table
offers.shape

(191866, 128)

In [None]:
# Preview the first rows of the combined offers table
offers.head()

Unnamed: 0,href,price_evaluation,price,currency,price_details,seller_type,seller_name,Oferta od,Kategoria,Marka pojazdu,Model pojazdu,Rok produkcji,Przebieg,Pojemność skokowa,Rodzaj paliwa,Moc,Skrzynia biegów,Napęd,Typ,Liczba drzwi,Liczba miejsc,Kolor,Faktura VAT,Kraj pochodzenia,Pierwsza rejestracja,Numer rejestracyjny pojazdu,Zarejestrowany w Polsce,Pierwszy właściciel,Bezwypadkowy,Serwisowany w ASO,Stan,Metalik,Możliwość finansowania,ABS,Bluetooth,Czujnik zmierzchu,Elektrycznie ustawiane lusterka,Gniazdo USB,Klimatyzacja automatyczna,MP3,Poduszka powietrzna chroniąca kolana,Poduszki boczne przednie,Radio niefabryczne,Światła do jazdy dziennej,Szyberdach,Alufelgi,Centralny zamek,Czujniki parkowania tylne,ESP (stabilizacja toru jazdy),Immobilizer,Komputer pokładowy,Ogranicznik prędkości,Poduszka powietrzna kierowcy,Poduszki boczne tylne,Radio fabryczne,Światła LED,Wielofunkcyjna kierownica,ASR (kontrola trakcji),Czujnik deszczu,Elektryczne szyby przednie,Gniazdo AUX,Isofix,Kurtyny powietrzne,Podgrzewane lusterka boczne,Poduszka powietrzna pasażera,Przyciemniane szyby,Światła Xenonowe,Światła przeciwmgielne,Wspomaganie kierownicy,Emisja CO2,Okres gwarancji producenta,Perłowy,Tuning,Alarm,CD,Tapicerka welurowa,Gniazdo SD,Tempomat,Dach panoramiczny,Podgrzewana przednia szyba,Klimatyzacja manualna,System Start-Stop,Leasing,Opłata początkowa,Miesięczna rata,Liczba pozostałych rat,Wartość wykupu,Matowy,Akryl (niemetalizowany),VAT marża,Asystent parkowania,Klimatyzacja dwustrefowa,Regulowane zawieszenie,Nawigacja GPS,Tapicerka skórzana,Elektrochromatyczne lusterko wsteczne,Uszkodzony,Czujniki parkowania przednie,Kamera cofania,Odtwarzacz DVD,Podgrzewane przednie siedzenia,Elektrochromatyczne lusterka boczne,Tuner TV,Wersja,Łopatki zmiany biegów,Elektryczne szyby tylne,lub do (przebieg km),Elektrycznie ustawiane fotele,Klimatyzacja czterostrefowa,Asystent pasa ruchu,Czujnik martwego pola,Zmieniarka CD,Relingi dachowe,Hak,Tempomat aktywny,Podgrzewane tylne 
siedzenia,Kierownica po prawej (Anglik),Gwarancja dealerska (w cenie),Ogrzewanie postojowe,Filtr cząstek stałych,Generacja,HUD (wyświetlacz przezierny),VIN,Homologacja ciężarowa,Zarejestrowany jako zabytek,Warranty if agreed with the buyer,Kod Silnika,Unnamed: 128
0,https://www.otomoto.pl/oferta/abarth-595-pista...,above,79 999,PLN,Faktura VAT,Osoba prywatna,Paweł,Osoby prywatnej,Osobowe,Abarth,595,2018,12 150 km,1 400 cm3,Benzyna,160 KM,Manualna,Na przednie koła,Auta małe,2,4.0,Szary,Tak,Polska,24/05/2019,WX8359A,Tak,Tak,Tak,Tak,Używane,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,https://www.otomoto.pl/oferta/abarth-695-1-4-1...,none,110 000,PLN,"Do negocjacji, Faktura VAT",Autoryzowany Dealer,"Grupa Gezet Fiat, Abarth, Alfa Romeo, Jeep, Ho...",Firmy,Osobowe,Abarth,695,2020,5 km,1 368 cm3,Benzyna,180 KM,Manualna,Na przednie koła,Kompakt,3,4.0,Zielony,Tak,Polska,,,,,Tak,,Nowe,Tak,Tak,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,https://www.otomoto.pl/oferta/abarth-595-abart...,in,63 500,PLN,Faktura VAT,Osoba prywatna,Piotr,Osoby prywatnej,Osobowe,Abarth,595,2017,39 000 km,1 400 cm3,Benzyna,145 KM,Manualna,Na przednie koła,Auta małe,3,4.0,Biały,Tak,Polska,19/07/2017,,Tak,,Tak,Tak,Używane,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,154 g/km,17/07/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,https://www.otomoto.pl/oferta/abarth-595-500-1...,above,36 900,PLN,"Do negocjacji, Faktura VAT",Osoba prywatna,Patryk,Osoby prywatnej,Osobowe,Abarth,595,2009,209 606 km,1 400 cm3,Benzyna,190 KM,Manualna,Na przednie koła,Auta małe,3,4.0,Inny kolor,Tak,,22/04/2009,SO113473,,,,Tak,Używane,Tak,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,,,1.0,1.0,,1.0,,,,1.0,1.0,,1.0,1.0,,1.0,,1.0,1.0,,1.0,1.0,155 g/km,,Tak,Tak,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,https://www.otomoto.pl/oferta/abarth-595-abart...,none,84 300,PLN,,Autoryzowany Dealer,Dukiewicz Sp Jawna Dealer Fiat Abarth Alfa Rom...,Firmy,Osobowe,Abarth,595,2020,1 km,1 368 cm3,Benzyna,145 KM,Manualna,,Auta małe,3,,Czerwony,,,,,,,Tak,,Nowe,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Dump the combined offers of all brands to a single pickle file
with open("".join([path, "/offers.obj"]), "wb") as fp:
    pickle.dump(offers, fp)  

In [None]:
# Write offers to CSV
# NOTE(review): to_csv also writes the DataFrame index by default, so the file
# gets an extra leading column without a name - confirm whether it is wanted.
offers.to_csv("/".join([path, "offers.csv"]), encoding="UTF-8")    
 
# For reading:
# pd.read_csv("/".join([path, "offers.csv"]), encoding="UTF-8")