In [1]:
from time import sleep
import requests
import pandas as pd
import html
from bs4 import BeautifulSoup

In [2]:
RESULTS_FILE = 'nissan_info.xlsx'
BASE_URL = 'https://auto.drom.ru/nissan/used/all'

In [3]:
def parse_auto_page(page_url: str) -> dict[str, str]:
    req = requests.get(page_url)
    parsed = BeautifulSoup(req.content, 'html.parser')
    car_info = parsed.findAll('table')[0]
    data = {part_info.find('th').text: part_info.find('td').text.replace('\xa0', ' ') for part_info in car_info.findAll('tr') if part_info.find('th')}
    data['title'] = [s.text for s in parsed.select('h1 > span')][0]
    data['price'] = [s.text.replace('\xa0', ' ') for s in parsed.select('.wb9m8q0')][0]
    return data

In [8]:
def get_cars_from_page(page_url: str) -> list[dict[str, str]]:
    r = requests.get(page_url)
    r.raise_for_status()
    parsed_list = BeautifulSoup(r.content, 'html.parser')
    full_car_data = []
    for car_page in parsed_list.select('a>h3'):
        sleep(0.5)
        name = car_page.text
        link = car_page.parent.attrs['href']
        try:
            car_info = parse_auto_page(link)
        except Exception as e:
            print(f'{link}\n\t{r.status_code=}\n\t{e}')
            continue
        car_info['title'] = name
        car_info['link'] = link
        full_car_data.append(car_info)
    return full_car_data

In [10]:
from tqdm.notebook import tqdm
all_cars = []
for i in tqdm(range(1, 101)):
    parsed = get_cars_from_page(BASE_URL + f'/page{i}/')
    if len(parsed) == 0:
        print('No more cars to parse')
        break
    sleep(1)
    all_cars.extend(parsed)

  0%|          | 0/100 [00:00<?, ?it/s]

https://auto.drom.ru/blagoveshchenka/nissan/primera/579929656.html
	r.status_code=200
	list index out of range
200
https://auto.drom.ru/blagoveshchenka/nissan/primera/579929656.html
list index out of range
https://auto.drom.ru/ust-koksa/nissan/note/837703982.html
	r.status_code=200
	list index out of range
200
https://auto.drom.ru/ust-koksa/nissan/note/837703982.html
list index out of range
https://auto.drom.ru/blagoveshchensk/nissan/sunny/347426280.html
	r.status_code=200
	list index out of range
200
https://auto.drom.ru/blagoveshchensk/nissan/sunny/347426280.html
list index out of range
https://auto.drom.ru/zheleznogorsk-ilimskiy/nissan/x-trail/221051936.html
	r.status_code=200
	list index out of range
200
https://auto.drom.ru/zheleznogorsk-ilimskiy/nissan/x-trail/221051936.html
list index out of range
https://auto.drom.ru/alexeevka/nissan/patrol/878979521.html
	r.status_code=200
	list index out of range
200
https://auto.drom.ru/alexeevka/nissan/patrol/878979521.html
list index out o

In [12]:
pd.DataFrame(all_cars).to_excel(RESULTS_FILE)

In [13]:
print(len(all_cars))

668
