In [1]:
from time import sleep

import requests
from bs4 import BeautifulSoup

from network_config import MANE_PAGE_YFA, WEB_HEADERS


def html_response(url, headers):
    for _ in range(3):
        try:
            response = requests.get(url, headers=headers)
            return response.text
        except ConnectionError or TimeoutError or ConnectionResetError:
            print("\n*****ConnectionError, TimeoutError or ConnectionResetError*"
                  "****\n\nI will retry again after 7 seconds...")
            sleep(7)
            print('Making another request...')

In [4]:
import datetime
import pandas as pd
from tqdm.notebook import tqdm

months_dict = {
    'января': 'Jan',
    'февраля': 'Feb',
    'марта': 'Mar',
    'апреля': 'Apr',
    'мая': 'May',
    'июня': 'June',
    'июля': 'Jul',
    'августа': 'Aug',
    'сентября': 'Sep',
    'октября': 'Oct',
    'ноября': 'Nov',
    'декабря': 'Dec'
}

wd_list = ['4WD', 'передний', 'задний']
transmission_list = ['механика', 'АКПП', 'вариатор', 'робот']
fuel_list = ['бензин', 'дизель', 'гибрид', 'электро']

In [5]:
def get_from_list(split_header_item, list_):

    for item in list_:
        if item in split_header_item:
            return item


def get_date(date, today):

    if 'сегод' in date:
        date = today.date()

    elif 'минут' in date:
        try:
            delta_minutes = int(date.split(' ')[0])
        except:
            delta_minutes = 1
        date = (today - datetime.timedelta(minutes=delta_minutes)).date()

    elif 'час' in date:
        delta_hours = int(date.split(' ')[0])
        date = (today - datetime.timedelta(hours=delta_hours)).date()

    else:
        day, month = date.split(' ')
        month = months_dict.get(month)

        if today.month == 1 and month == 'Dec':
            year = today.year - 1
        else:
            year = today.year

        date = datetime.datetime.strptime(
            f'{day} {month} {year}', '%d %b %Y').date()

    return date

In [12]:
d_class = 'css-ck6dgx ewrty961'


def get_car_info(about):
    description = about.get('description')
    if description:
        description = description.replace(
            '\n', ' ').replace('\r', '').replace('  ', ' ')

    brand_len = len(about.get('brand'))
    name = about.get('name').split(', ')[0][brand_len + 1:]

    new = {
        'brand': about.get('brand'),
        'name': name,
        'bodyType': about.get('bodyType'),
        'color': about.get('color'),
        'fuelType': about.get('fuelType'),
        'year': about.get('modelDate'),
        'mileage': None,
        'transmission': about.get('vehicleTransmission'),
        'power': None,
        'price': None,
        'vehicleConfiguration': about.get('vehicleConfiguration'),
        'engineName': None,
        'engineDisplacement': None,
        'date': None,
        'location': None,
        'link': None,
        'description': description,
    }

    vehicle = about.get('vehicleEngine')
    if vehicle:
        new.update({
            'engineName': vehicle.get('name'),
            'engineDisplacement': vehicle.get('engineDisplacement')
        })
    return new


def parse_page_response(response):
    today = datetime.datetime.today()
    soup = BeautifulSoup(response, 'html.parser')
    js_data = soup.find_all('script', type='application/ld+json')[1:21]
    #         for idx in range(1, 21):
    #             about =json.loads(data[idx].contents[0])
    print(js_data[0])
    # return

    # links = soup.find('div', class_='css-1nvf6xk eaczv700')
    links = soup.find('div', class_='css-1nvf6xk eaczv700')
    # links = soup.find('div', class_='css-1f36sr9 e1m0rp604')
    main_page_data = []

    for js_item, full_header, price, date_loc, link, mileage in zip(
            js_data,
            soup.find_all('a', attrs={"data-ftid": "bulls-list_bull"}),
            soup.find_all('span', class_="css-46itwz e162wx9x0"),
            soup.find_all('div', class_="css-1x4jcds eotelyr0"),
            links.find_all('a', attrs={"data-ftid": "bulls-list_bull"}),
            soup.find_all('span', class_='css-1l9tp44 e162wx9x0')
    ):
        js_info = json.loads(js_item.contents[0])
        js_info = get_car_info(js_info)

        mileage = None
        power = None

        buf = full_header.find_all('span', class_='css-1l9tp44 e162wx9x0')
        for item in buf:
            if 'тыс' in item.text:
                mileage = int(item.text.replace(' ', '').replace(
                    'тыс.км', '000').replace(',', '').replace('<', ''))
                break

        split_header = full_header.text.split(', ')
        for item in split_header:
            if 'л.с.' in item:
                power = item.rsplit('(')[-1][:-6]
                try:
                    power = int(power)
                    if power > 2000:
                        power = None
                except:
                    power = None
                break

        price = price.text[:-2].replace(u'\xa0', u'')
        date = get_date(date_loc.div.text, today)
        location = date_loc.span.text
        link = link.get('href')

        js_info.update({
            'mileage': mileage,
            'power': power,
            'price': price,
            'date': date,
            'location': location,
            'link': link,
        })
        main_page_data.append(js_info)

    return main_page_data

In [13]:
a = html_response(MANE_PAGE_YFA + 'page', WEB_HEADERS)
# parse_page_response(a)

In [14]:
def start(region, pages_count):
    print(f'Start {region} region!')

    region = f'region{region}'
    folder = f'drom/{region}/'
    MANE_PAGE_YFA = f'https://auto.drom.ru/{region}/all/page'

    today_date = datetime.datetime.now().date()
    today_hour = datetime.datetime.now().hour

    responses = []
    result = []
    errors = []
    #     print(MANE_PAGE_YFA)

    for page in tqdm(range(pages_count), desc='Collect Pages'):
        page_response = html_response(MANE_PAGE_YFA + str(page + 1), WEB_HEADERS)
        responses.append(page_response)

    for idx in tqdm(range(pages_count), desc='Parsing Pages'):
        try:
            page_records = parse_page_response(responses[idx])

            for i in page_records:
                result.append(i)
        except:
            errors.append(responses[idx])

    print(f'Errors: {len(errors)}')
    if len(result) == 0:
        print('PARSING ERROR: EMPTY CSV FILE!')
        print('PARSING ERROR: EMPTY CSV FILE!')
        print('PARSING ERROR: EMPTY CSV FILE!')

        return

    result_ = pd.DataFrame(result)

    if len(errors) > 0:
        csv_name = f'{folder}drom_{region}_{today_date}_{today_hour:02d}_{len(errors):02d}errors.csv'
    else:
        csv_name = f'{folder}drom_{region}_{today_date}_{today_hour:02d}.csv'

    result_.to_csv(csv_name)
    print(f'Saved to {csv_name}')

    return errors

In [16]:
def main():
    pages_count = 1
    regions = [25]
    for region in regions:
        start(region, pages_count)

In [11]:
main()

Start 25 region!


Collect Pages:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/1 [00:00<?, ?it/s]

Errors: 1
PARSING ERROR: EMPTY CSV FILE!
PARSING ERROR: EMPTY CSV FILE!
PARSING ERROR: EMPTY CSV FILE!
Start 41 region!


Collect Pages:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/1 [00:00<?, ?it/s]

Errors: 1
PARSING ERROR: EMPTY CSV FILE!
PARSING ERROR: EMPTY CSV FILE!
PARSING ERROR: EMPTY CSV FILE!


In [10]:
import schedule


schedule.every().hour.at(':30').do(main)

while True:
    schedule.run_pending()
    sleep(30)

Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_03.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_03.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_04.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_04.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_05.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_05.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_06.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_06.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_07.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_07.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_08.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_08.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_09.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_09.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_10.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_10.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_11.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_11.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_12.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_12.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_13.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_13.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_14.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 18
Saved to drom/region41/drom_region41_2022-10-07_14_18errors.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_15.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_15.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_16.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_16.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_17.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_17.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_18.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_18.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_19.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_19.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_20.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_20.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_21.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_21.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-07_22.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_22.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 1
Saved to drom/region25/drom_region25_2022-10-07_23_01errors.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-07_23.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-08_00.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-08_00.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-08_01.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-08_01.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-08_02.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-08_02.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-08_03.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-08_03.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region25/drom_region25_2022-10-08_04.csv
Start 41 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Parsing Pages:   0%|          | 0/80 [00:00<?, ?it/s]

Errors: 0
Saved to drom/region41/drom_region41_2022-10-08_04.csv
Start 25 region!


Collect Pages:   0%|          | 0/80 [00:00<?, ?it/s]

ConnectionError: HTTPSConnectionPool(host='auto.drom.ru', port=443): Max retries exceeded with url: /region25/all/page22 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000017CA2C6B208>: Failed to establish a new connection: [WinError 10060] Попытка установить соединение была безуспешной, т.к. от другого компьютера за требуемое время не получен нужный отклик, или было разорвано уже установленное соединение из-за неверного отклика уже подключенного компьютера'))

In [None]:
# bulls-list_bull