<a href="https://colab.research.google.com/github/KrzysztofLin/olx_bikes_scrapper/blob/main/olx_bike_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Settings



In [1]:
from dataclasses import dataclass
from bs4 import BeautifulSoup as Soup
from requests import get
import csv
from tqdm import tqdm
import pandas as pd
import re
import multiprocessing as mp
import time
import itertools
from typing import List

In [2]:
# Base address of the site and the path of the bicycle category being scraped.
OLX_MAIN_ADDRESS = "https://www.olx.pl"
MAIN_CATEGORY = "/d/sport-hobby/rowery"
ROOT_URL = OLX_MAIN_ADDRESS + MAIN_CATEGORY
# Path fragment used to recognise links that point at a single offer.
OFFERT = "/d/oferta"

# Browser-like headers passed to the page requests made by the scraping helpers.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/102.0.5005.72 Safari/537.36 ",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
              "application/signed-exchange;v=b3;q=0.9 "
}

# Regex fragments (not plain strings) for manufacturer names; each entry is
# wrapped in word boundaries below. Texts are matched in lower case.
LIST_OF_MANUFACTURES = [
'unibike',
'kross',
'trek',
'scott',
'haibike',
'specialized',
'cann?ondale',
'ktm',
'romet',
'kellys',
'kands',
'lazzaro',
'merida',
'gt',
'dartmoor',
'giant',
'kona',
'author',
'cube',
'canyon',
'accent',
'orbea',  # was misspelled 'oreba', which never matched the brand name
'boa?rdman',
'ghost',
'focus',
'stevens',
'marin',
"b[i']?twin",
'ridley',
'triban']

COMPILED_LIST_OF_MANUFACTURES = re.compile("|".join(rf"\b{name}\b" for name in LIST_OF_MANUFACTURES))
# Wheel diameter in inches. The decimal separator is an explicit [.,] (an
# unescaped "." would match any character) and the digit lookarounds stop the
# pattern from firing inside longer numbers such as the year "2024".
LIST_OF_WHEEL_SIZES = r"(?<!\d)(24|26|27[.,]5|27|28|29)(?!\d)"
# Four-digit year 2010-2029 that is not part of a longer digit run (e.g. a phone number).
LIST_OF_PRODUCTION_YEAR = r"(?<!\d)20[12][0-9](?!\d)"
# Either "<number> kg" or "waga[:] <number>" ("waga" is Polish for "weight").
BIKE_WEIGHT_REGEX = r"([0-9]?[0-9][.,]?[0-9]?[0-9][ ]?kg|waga[:]?[ ]?[0-9]?[0-9][.,]?[0-9]?)"


## Function to find categories

In [3]:
def find_bikes_categories():
  """Build a numbered map of bicycle sub-categories from the root category page.

  Downloads ROOT_URL, keeps every anchor whose href contains
  MAIN_CATEGORY + "/rowery", and numbers the hits from 1 in page order.
  A hard-coded gravel-bike search entry is appended under the next number.

  Returns:
    dict mapping int -> (display name, absolute URL).
  """
  page_html = get(url=ROOT_URL, headers=HEADERS).text
  parsed_page = Soup(page_html, 'html.parser')
  subcategory_prefix = MAIN_CATEGORY+"/rowery"

  hrefs = (anchor.get('href') for anchor in parsed_page.find_all('a'))
  matching_hrefs = [href for href in hrefs if href and subcategory_prefix in href]

  bike_categories = {}
  for number, href in enumerate(matching_hrefs, start=1):
    # Turn the URL path into a readable label, e.g. dashes become spaces.
    label = href.replace('/d/sport-hobby/rowery/','').replace('/','').replace('-',' ')
    bike_categories[number] = label, OLX_MAIN_ADDRESS + href

  # Extra entry that is a search query rather than a listed sub-category.
  bike_categories[len(matching_hrefs) + 1] = 'rowery gravelowe', 'https://www.olx.pl/d/sport-hobby/rowery/q-gravel/'
  return bike_categories

In [4]:
def _find_all_ulrs_on_page(page, subcategory_html):
  """Collect the offer links found on one listing page of a sub-category.

  Args:
    page: page number placed in the ``?page=`` query parameter.
    subcategory_html: URL of the sub-category listing (with or without a
      trailing slash).

  Returns:
    list of absolute offer URLs, i.e. every href on the page containing OFFERT.
  """
  # Strip a trailing slash first so the result is ".../?page=N", not "...//?page=N".
  url = subcategory_html.rstrip('/') + f"/?page={page}"
  html = get(url=url, headers=HEADERS).text
  soup = Soup(html, 'html.parser')

  url_list = []
  for link in soup.find_all('a'):
    address = link.get('href')
    if address and OFFERT in address:
      # Only site-relative hrefs need the host prepended; an href that is
      # already absolute would otherwise become "https://www.olx.plhttps://...".
      if address.startswith(('http://', 'https://')):
        url_list.append(address)
      else:
        url_list.append(OLX_MAIN_ADDRESS+address)
  return url_list

def find_all_offerts_in_category(subcategory_html, page_range = 10):
  """Gather offer URLs from several listing pages of one sub-category in parallel.

  Args:
    subcategory_html: URL of the sub-category listing.
    page_range: number of pages to fetch; page numbers ``range(page_range)``
      are requested.

  Returns:
    Flat list with the offer URLs of all fetched pages, in page order.
  """
  page_arguments = [(page, subcategory_html) for page in range(page_range)]
  # The context manager releases the worker processes even when a worker
  # raises; starmap has already collected every result by the time it exits.
  with mp.Pool(mp.cpu_count()) as pool:
    results = pool.starmap(_find_all_ulrs_on_page, page_arguments)
  print("------")
  return list(itertools.chain.from_iterable(results))

def _find_info_in_url(current_url):
  """Download one offer page and extract its title, price and description.

  Args:
    current_url: absolute URL of the offer.

  Returns:
    ``(current_url, title, price, description)`` where price is a float and
    description is '' when no description element was found, or None when the
    title/price elements are missing or the price cannot be parsed.
  """
  html = get(url=current_url, headers=HEADERS).text
  soup = Soup(html, 'html.parser')
  found = soup.select('div.css-g5mtbi-Text')
  description = [x.text.split(';')[-1].strip().replace('\n', ' ') for x in found]

  try:
    title = soup.find("h1").text
    # Drop the last two characters (presumably the currency suffix - confirm
    # against the live page), remove spaces and use a dot as decimal separator.
    price = float(soup.find("h3").text[:-2].replace(" ", '').replace(',','.'))
  except (AttributeError, ValueError):
    # Missing <h1>/<h3> or a non-numeric price: skip this offer.
    return None

  # A page without the description element used to raise an uncaught
  # IndexError here and abort the whole pool.map run.
  return current_url, title, price, (description[0] if description else '')

def search_offerts(url_list):
  """Scrape title, price and description of every offer URL in parallel.

  Args:
    url_list: iterable of offer URLs.

  Returns:
    list with one `_find_info_in_url` result per URL, in input order
    (entries are None for offers that could not be parsed).
  """
  # Use the pool as a context manager so worker processes are released even
  # if one of the workers raises.
  with mp.Pool(mp.cpu_count()) as pool:
    return pool.map(_find_info_in_url, list(url_list))

def _find_photos_in_url(current_url: str):
  """Download one offer page and collect the addresses of its photos.

  Photos are the <img> elements whose ``src`` (first one only) or
  ``data-src`` (all of them) matches "https://ireland".

  Args:
    current_url: absolute URL of the offer.

  Returns:
    ``(current_url, photo_addresses)``; the list is empty when the page has
    no matching images.
  """
  html = get(url=current_url, headers=HEADERS).text
  soup = Soup(html, 'html.parser')
  photo_host = re.compile("https://ireland")

  bike_photo_addresses = []
  main_photo = soup.find('img', attrs= {'src': photo_host})
  # find() returns None when nothing matches; calling .get on it used to
  # raise AttributeError and abort the whole pool.map run.
  if main_photo is not None:
    bike_photo_addresses.append(main_photo.get('src'))
  for link in soup.find_all('img', attrs= {'data-src': photo_host}):
    bike_photo_addresses.append(link.get('data-src'))
  return current_url, bike_photo_addresses

def search_offerts2(url_list):
  """Collect the photo addresses of every offer URL in parallel.

  Args:
    url_list: iterable of offer URLs.

  Returns:
    list with one `_find_photos_in_url` result per URL, in input order.
  """
  # Use the pool as a context manager so worker processes are released even
  # if one of the workers raises.
  with mp.Pool(mp.cpu_count()) as pool:
    return pool.map(_find_photos_in_url, list(url_list))

# NOTE(review): the string below is a disabled draft of a seller-location
# lookup, kept as a bare string literal. It is not executed; because it is the
# last expression of the cell, the notebook echoes it as the cell output.
'''
for localization search
    #selector = 'p.css-xl6fe0-Text.eu5v0x0'
    #selector = 'div.css-1q7h1ph'
    #found = soup.select(selector)
    #print(found)
    person = {}
    for div in soup.find_all(class_ = 'css-1dp6pbg'):#.find(class_= 'css-xl6fe0-Text eu5v0x0'):
      print(div)
      for link in div.find(class_= 'css-xl6fe0-Text eu5v0x0'):
        print(link) 
'''

"\nfor localization search\n    #selector = 'p.css-xl6fe0-Text.eu5v0x0'\n    #selector = 'div.css-1q7h1ph'\n    #found = soup.select(selector)\n    #print(found)\n    person = {}\n    for div in soup.find_all(class_ = 'css-1dp6pbg'):#.find(class_= 'css-xl6fe0-Text eu5v0x0'):\n      print(div)\n      for link in div.find(class_= 'css-xl6fe0-Text eu5v0x0'):\n        print(link) \n"

In [5]:
# Fetch the category map and collect offer URLs from two listing pages of
# category number 5 (the numbering comes from the live category page).
bike_categories = find_bikes_categories()
url_list = find_all_offerts_in_category(subcategory_html = bike_categories[5][1], page_range = 2 )
# NOTE(review): the scraped list is replaced right away by a single hard-coded
# offer, so the two requests above do not influence the result - looks like a
# debugging leftover; confirm which list is intended.
url_list = ['https://www.olx.pl/d/oferta/trek-marlin-6-29-hydraulika-CID767-IDQaktC.html']
# One (offer URL, [photo URLs]) tuple per offer.
bikes_photo_list = search_offerts2(url_list)

------


In [6]:
import requests
import shutil

# Save every photo of every scraped offer as "<offer-slug>_<index>.jpg" in the
# current working directory.
for bike_url_and_photos in bikes_photo_list:
  offer_url, photo_urls = bike_url_and_photos
  # Offer slug: the offer URL without the site prefix and the ".html" suffix.
  file_name = str(offer_url).replace(OLX_MAIN_ADDRESS+OFFERT+'/', '').replace('.html', '')
  for index, photo_url in enumerate(photo_urls):
    photo_filename = file_name + '_' +str(index) + '.jpg'
    # The context manager closes the streamed response (and returns its
    # connection to the pool) even when the status is not 200 or writing fails.
    with requests.get(photo_url, stream = True) as res:
      if res.status_code == 200:
        with open(photo_filename, 'wb') as f:
          # iter_content applies any content-encoding decoding, which copying
          # res.raw directly does not.
          for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)

Regex-based extraction of the frame size

In [7]:
def find_frame_size(text_to_analysis: List[str]) -> list:
  """Extract a frame size from each (lower-cased) offer text.

  For every text the keywords "ramy", "rama", "rozmiar[z][e]" and "roz" are
  tried in that order; the first keyword that yields any match wins. A match
  is the keyword, at most one separator character and optional spaces,
  followed by a numeric size (17-23, 48-60, 62, 64) or a letter size
  (xs, s, m, l, xl). When a keyword matches several sizes, the most frequent
  one is used (the earliest one on a tie).

  Args:
    text_to_analysis: texts to scan; entries that are not strings yield ''.

  Returns:
    list of str with one entry per input text: the size, or '' when no size
    was found.
  """
  numeric_sizes = "17|18|19|20|21|22|23|48|49|50|51|52|53|54|55|56|57|58|59|60|62|64"
  letter_sizes = "xs|s|m|xl|l"
  # (?!\d) keeps "20" from matching inside "2019"; (?!\w) keeps letter sizes
  # from matching the first letter of a word ("rama stalowa" is not size "s").
  size_group = rf"((?:{numeric_sizes})(?!\d)|(?:{letter_sizes})(?!\w))"
  keywords = ("ramy", "rama", "rozmiarz?e?", "roz")
  # The separator is lazy (.??) so that it cannot swallow the first letter of
  # the size itself ("rama xl" must give "xl", not "l").
  patterns = [re.compile(rf"{keyword}[ ]?.??[ ]?{size_group}") for keyword in keywords]

  frame_size = []
  for text in text_to_analysis:
    # Start from scratch for every text so a failed or skipped entry can never
    # inherit the matches of the previous one.
    matches = []
    if isinstance(text, str):
      for pattern in patterns:
        matches = pattern.findall(text)
        if matches:
          break

    if not matches:
      frame_size.append('')
      continue

    # max() over the list (not a set) makes the tie-break deterministic.
    most_common = max(matches, key=matches.count)
    digits = re.findall(r"\d+", most_common)
    frame_size.append(digits[0] if digits else most_common[-2:])
  return frame_size

In [8]:
def find_bike_weight(text_to_analysis):
  """Return, for each text, the first BIKE_WEIGHT_REGEX match ('' when there is none)."""
  return _find_attribiute(text_to_analysis, BIKE_WEIGHT_REGEX)

def find_bike_production_year(text_to_analysis):
  """Return, for each text, the first LIST_OF_PRODUCTION_YEAR match ('' when there is none)."""
  production_years = _find_attribiute(text_to_analysis, LIST_OF_PRODUCTION_YEAR)
  return production_years

def find_wheel_size(text_to_analysis):
  """Return, for each text, the first LIST_OF_WHEEL_SIZES match ('' when there is none)."""
  wheel_sizes = _find_attribiute(text_to_analysis, LIST_OF_WHEEL_SIZES)
  return wheel_sizes

def find_producent(text_to_analysis):
  """Return, for each text, the first manufacturer name matched by COMPILED_LIST_OF_MANUFACTURES ('' when there is none)."""
  manufacturers = _find_attribiute(text_to_analysis, COMPILED_LIST_OF_MANUFACTURES)
  return manufacturers

def _find_attribiute(text_to_analysis, regex):
  bikes_atrribiute = list()
  for text in text_to_analysis:
    attribiute = None
    try:
      atrribiute = re.findall(regex, text)
      if atrribiute:
        bikes_atrribiute.append(atrribiute[0])
      else:
        bikes_atrribiute.append('')
    except ValueError:
      bikes_atrribiute.append('')
    except AttributeError:
      bikes_atrribiute.append('')
    except TypeError:
      bikes_atrribiute.append('')
  return bikes_atrribiute

In [9]:
def user_category_and_page_range_choice(bike_categories):
  """Print the category menu and ask the user for a category and a page count.

  Invalid answers (non-numeric text, an unknown category number, a page count
  below 1) no longer raise ValueError/KeyError; the question is asked again.

  Args:
    bike_categories: dict mapping category number -> (name, URL).

  Returns:
    ``(bike_categories[chosen_number], page_count)``.
  """
  def ask_for_int(prompt, is_valid):
    # Keep asking until the answer parses as an int and passes the check.
    while True:
      try:
        number = int(input(prompt))
      except ValueError:
        number = None
      if number is not None and is_valid(number):
        return number
      print('Niepoprawna wartosc, sprobuj ponownie.')

  for key, values in bike_categories.items():
    print(f"{key}: {values[0]}")

  user_category_choice = ask_for_int(
      f'Wybierz kategorię od 1 do {list(bike_categories)[-1]}: ',
      lambda number: number in bike_categories)
  user_page_range_choice = ask_for_int(
      'Wybierz liczbe stron ktora chcesz przeanalizowac: ',
      lambda number: number > 0)
  return bike_categories[user_category_choice], user_page_range_choice

In [10]:
def scrap_data():
    """Run the interactive scrape: pick a category, collect offers, fetch their details.

    Asks the user for a category and a number of listing pages, gathers the
    offer URLs of that category, scrapes every offer and prints the elapsed
    scraping time.

    Returns:
      ``(products, output_name)``: the list returned by `search_offerts` and
      the name of the Excel file to write, derived from the category name.
    """
    categories = find_bikes_categories()
    chosen_category, pages_to_scan = user_category_and_page_range_choice(categories)
    category_name, category_url = chosen_category[0], chosen_category[1]

    offer_urls = find_all_offerts_in_category(subcategory_html=category_url, page_range=pages_to_scan)
    print("Zapytanie jest wyszukiwane, poczekajod 30 sekund do 3 minut")

    started_at = time.time()
    products = search_offerts(offer_urls)
    elapsed = time.time() - started_at
    print(elapsed, "seconds")

    return products, f"result {category_name}.xlsx"

def prepare_text_to_analysis(df):
  """Build one lower-cased search text per offer from its title and description.

  Title ('tytuł') and description ('opis') are joined with a single space so
  that the last word of the title and the first word of the description stay
  separate words for the word-boundary regexes used later.

  Args:
    df: table with the columns 'tytuł' and 'opis'.

  Returns:
    list of str, one entry per row; '' when neither cell holds a non-empty
    string (None, NaN and other non-string values are ignored).
  """
  text_to_analysis = []
  # Iterate over the cell values directly instead of label-indexing with
  # range(len(df)), which only works for a default 0..n-1 index.
  for title, description in zip(df['tytuł'], df['opis']):
    parts = [part for part in (title, description) if isinstance(part, str) and part]
    text_to_analysis.append(' '.join(parts).lower())
  return text_to_analysis

def preprocess_data(products) -> pd.DataFrame:
  """Turn scraped offers into a table and add the attributes parsed from their text.

  Args:
    products: iterable of ``(link, title, price, description)`` tuples; None
      entries (offers that could not be scraped) are skipped.

  Returns:
    DataFrame with the columns link, tytuł, cena, the extracted attributes
    (wheel size, frame size, manufacturer, weight, production year) and
    'opis' moved to the last position.
  """
  # The scraper returns None for offers it failed to parse; those must not
  # become rows of the table.
  valid_products = [product for product in products if product is not None]
  df = pd.DataFrame(valid_products, columns = ['link','tytuł', 'cena', 'opis'])
  text_to_analysis = prepare_text_to_analysis(df)

  df['rozmiar koła'] = find_wheel_size(text_to_analysis)
  df['rozmiar_ramy'] = find_frame_size(text_to_analysis)
  df['producent'] = find_producent(text_to_analysis)
  df['waga'] = find_bike_weight(text_to_analysis)
  df['rok produkcji'] = find_bike_production_year(text_to_analysis)

  # Move the long description to the end so the short columns come first.
  # pop() must run before the insert position is computed.
  opis_column = df.pop('opis')
  df.insert(len(df.columns), 'opis', opis_column)
  return df

def filter_data(df):
  """Placeholder for filtering the offers table; not implemented yet, returns None."""
  return None

def main():
  """Scrape the chosen category, extract attributes and save the result to Excel.

  The table is de-duplicated by offer link, sorted by price ('cena') and
  written to the file name returned by `scrap_data`.
  """
  products, output_name = scrap_data()
  df = preprocess_data(products)
  # drop_duplicates returns a new frame; without the assignment the
  # duplicates stayed in the exported file.
  df = df.drop_duplicates(subset=['link'], keep='first')
  df = df.sort_values(by='cena')
  df.to_excel(output_name)


# Run main to use the program

In [None]:
# Entry point: asks for a category and a page count, scrapes the offers and
# writes the resulting table to an Excel file.
main()

1: rowery crossowe
2: rowery dzieciece
3: rowery elektryczne
4: rowery miejskie
5: rowery gorskie
6: rowery szosowe
7: rowery trekkingowe
8: rowery gravelowe
Wybierz kategorię od 1 do 8: 5
Wybierz liczbe stron ktora chcesz przeanalizowac: 25
------
Zapytanie jest wyszukiwane, poczekajod 30 sekund do 3 minut
260.7213325500488 seconds
