In [209]:
import requests
from bs4 import BeautifulSoup as bs

import ssl
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from requests.packages.urllib3.util import ssl_

from dataclasses import dataclass

In [210]:
# Site root; relative listing links are appended to it to build absolute URLs.
base_url = "https://www.avito.ru"
# Path of the listings section that is requested from the site root.
russian_cars = "/rossiya/avtomobili"

In [211]:
@dataclass
class Car:
    """One scraped car listing: price, vehicle characteristics and listing metadata.

    Field order matters: instances may be constructed positionally.
    """

    # --- Asking price ---
    price: float

    # --- Vehicle characteristics ---
    brand_model: str
    year: int
    mileage: float
    engine_capacity: float
    horsepower: float
    body_type: str
    drive_type: str
    transmission: str  # parse_offer currently passes None for this field
    engine_type: str

    # --- Listing metadata ---
    is_offer_vip: bool
    offer_location: str

    # --- Link to the listing page ---
    offer_url: str

In [212]:
# Colon-separated cipher list handed to the TLS context; one suite per line for readability.
CIPHERS = ":".join((
    "ECDHE-RSA-AES256-GCM-SHA384",
    "ECDHE-ECDSA-AES256-GCM-SHA384",
    "ECDHE-RSA-AES256-SHA384",
    "ECDHE-ECDSA-AES256-SHA384",
    "ECDHE-RSA-AES128-GCM-SHA256",
    "ECDHE-RSA-AES128-SHA256",
    "AES256-SHA",
))

class TlsAdapter(HTTPAdapter):
    """HTTP adapter whose connection pool uses a custom SSL context.

    The context is built with the module-level ``CIPHERS`` list, mandatory
    certificate verification and the SSL option flags given at construction.
    """

    def __init__(self, ssl_options=0, **kwargs):
        """Remember the SSL option flags and forward ``kwargs`` to ``HTTPAdapter``.

        Args:
            ssl_options: bitmask of ``ssl.OP_*`` flags for the context (0 = none).
            **kwargs: passed unchanged to ``HTTPAdapter.__init__``.
        """
        # Stored before delegating to the parent initializer.
        # NOTE(review): presumably the parent __init__ triggers init_poolmanager,
        # which reads this attribute - confirm before reordering.
        self.ssl_options = ssl_options
        super().__init__(**kwargs)

    def init_poolmanager(self, *pool_args, **pool_kwargs):
        """Create the pool manager with the customised SSL context attached."""
        tls_context = ssl_.create_urllib3_context(
            ciphers=CIPHERS,
            cert_reqs=ssl.CERT_REQUIRED,
            options=self.ssl_options,
        )
        self.poolmanager = PoolManager(
            *pool_args,
            ssl_context=tls_context,
            **pool_kwargs
        )

# Shared HTTP session: every https:// request goes through TlsAdapter,
# configured with the flags that switch off TLS 1.0 and TLS 1.1.
session = requests.Session()
adapter = TlsAdapter(ssl_options=ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
session.mount("https://", adapter)

In [213]:
# Fetch the first page of listings. Without an explicit timeout the call can
# block forever on a stalled connection and freeze the kernel, so cap the wait (seconds).
response = session.get(base_url + russian_cars, params={'p': '1'}, timeout=30)

In [214]:
# Display the HTTP status code of the listings request before parsing its body.
response.status_code

200

In [215]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(response.content, features='html.parser')

In [216]:
# Select every div whose itemtype attribute is the schema.org Product type;
# each match is treated as one listing card.
offers = soup.select('div[itemtype="http://schema.org/Product"]')
# Show how many listing cards were found on the page.
len(offers)

56

In [234]:
def parse_offer(offer):
    """Convert one listing card into a ``Car`` record.

    Args:
        offer: parsed HTML element of a single listing (one item of ``offers``).
            Its parent element is inspected to detect VIP placement.

    Returns:
        Car: the parsed listing. Numeric fields are converted to the types
        declared on ``Car`` (``year`` -> int, ``horsepower`` -> float);
        ``transmission`` is always ``None``.

    Raises:
        ValueError: if a numeric field cannot be converted, the title has no
            ', ' separator, or fewer than five car parameters are present.
        AttributeError, TypeError, IndexError: if an expected element or part
            of the modification string is missing from the card.
    """
    # '\xa0' (non-breaking space) is stripped from the text; the final
    # character is then dropped before conversion (presumably the currency sign - confirm).
    price_text = offer.select_one('div[class*=price]').get_text().replace('\xa0', '')
    price = float(price_text[:-1])

    # Split on the LAST ', ' so a comma inside the brand/model part cannot
    # break the two-value unpacking.
    title = offer.select_one('div[class*=title]').get_text()
    brand_model, year_text = title.rsplit(', ', 1)
    year = int(year_text)

    params_text = offer.select_one('div[class*=autoParams]').get_text().replace('\xa0', '')
    # Sometimes there are extra params (e.g. a condition such as 'Битый') that
    # go before the main 5 params, so only the last five entries are used.
    car_params = params_text.split(', ')[-5:]
    # Unpacking fails loudly when fewer than five params are present instead of
    # silently reading the wrong positions.
    mileage_text, modification, body_type, drive_type, engine_type = car_params

    # Drop the two-character suffix before conversion (presumably the unit - confirm).
    mileage = float(mileage_text[:-2])

    modification_info = modification.split(' ')
    engine_capacity = float(modification_info[0])
    # Remove the brackets and the postfix 'л.с.' around the number.
    horsepower = float(modification_info[2][1:-5])

    offer_location = offer.select_one('div[class*=geo]').get_text()

    # VIP listings are recognised by the first CSS class of the parent element;
    # a parent without any class attribute is treated as non-VIP.
    parent_classes = offer.parent.get('class') or []
    is_offer_vip = bool(parent_classes) and 'items-vip' in parent_classes[0]

    offer_url = base_url + offer.select_one('a[data-marker=item-title]')['href']

    return Car(
        price=price,
        brand_model=brand_model,
        year=year,
        mileage=mileage,
        engine_capacity=engine_capacity,
        horsepower=horsepower,
        body_type=body_type,
        drive_type=drive_type,
        # Transmission is not extracted from the listing card.
        # NOTE(review): modification_info[1] is currently ignored - check
        # whether it carries the gearbox code.
        transmission=None,
        engine_type=engine_type,
        is_offer_vip=is_offer_vip,
        offer_location=offer_location,
        offer_url=offer_url,
    )

In [235]:
# Turn every listing card found on the page into a Car record.
cars = [parse_offer(offer) for offer in offers]

In [240]:
import pandas

In [241]:
# Build a table from the list of Car records.
df = pandas.DataFrame(cars)

In [242]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,price,brand_model,year,mileage,engine_capacity,horsepower,body_type,drive_type,transmission,engine_type,is_offer_vip,offer_location,offer_url
0,499000.0,Mazda 3,2010,148556.0,1.6,105,хетчбэк,передний,,бензин,False,Санкт-Петербург,https://www.avito.ru/sankt-peterburg/avtomobil...
1,560000.0,Renault Logan,2019,56800.0,1.6,82,седан,передний,,бензин,False,"Нижегородская область, Нижний Новгород",https://www.avito.ru/nizhniy_novgorod/avtomobi...
2,320000.0,Hyundai Starex,2003,288500.0,2.5,103,минивэн,задний,,дизель,False,Москва,https://www.avito.ru/moskva/avtomobili/hyundai...
3,549990.0,Mazda 6,2008,147035.0,2.0,147,седан,передний,,бензин,False,Санкт-Петербург,https://www.avito.ru/sankt-peterburg/avtomobil...
4,510000.0,Skoda Superb,2012,188000.0,1.8,152,лифтбек,передний,,бензин,False,Санкт-Петербург,https://www.avito.ru/sankt-peterburg/avtomobil...
