In [1]:
import pandas as pd
import requests
import time
import re

from tqdm import tqdm
from bs4 import BeautifulSoup

## Функции

In [7]:
def get_features_from_ticket(ticket_bs):
    '''
    Return a pd.Series with the features parsed from one listing card.

    ticket_bs: bs4.BeautifulSoup
        BeautifulSoup object built from the HTML of a listing card.

    The Series is indexed by: year, kmAge, bodytype, color, volume, power,
    fuel_type, transmission, drive, wheel, state, ownersCount, pts, customs,
    complectation, offerprice.

    All values are strings taken from the page (kmAge, power and offerprice
    keep digits only; volume keeps digits and dots), except 'complectation',
    which is a dict {group name: [option, ...]} and is empty when the card has
    no complectation section.

    A feature whose element is absent from the card is returned as None
    instead of raising AttributeError / IndexError.
    '''
    def _info_row(name):
        # <li> of the card info table whose class carries the given suffix
        return ticket_bs.find('li', class_='CardInfoRow_' + name)

    def _link_text(name):
        # text of the <a> inside the row, or None when row/link is missing
        row = _info_row(name)
        link = row.find('a') if row is not None else None
        return link.text if link is not None else None

    def _value_text(name):
        # text of the second <span> of the row (the first one is the label)
        row = _info_row(name)
        spans = row.find_all('span') if row is not None else []
        return spans[1].text if len(spans) > 1 else None

    def _digits(text):
        # keep only decimal digits; pass None through
        return re.sub(r"\D", "", text) if text is not None else None

    # plain features (everything except the complectation)
    year = _link_text('year')
    kmAge = _digits(_value_text('kmAge'))
    bodytype = _link_text('bodytype')
    color = _link_text('color')

    # the engine row is one "volume / power / fuel" string; it is read once
    # and only interpreted when all three parts are present
    volume = power = fuel_type = None
    engine_row = _info_row('engine')
    engine_div = engine_row.find('div') if engine_row is not None else None
    if engine_div is not None:
        engine_parts = engine_div.text.split(' / ')
        if len(engine_parts) >= 3:
            volume = re.sub(r"[^\d.]", "", engine_parts[0])
            power = _digits(engine_parts[1])
            fuel_type = engine_parts[2]

    transmission = _value_text('transmission')
    drive = _value_text('drive')
    wheel = _value_text('wheel')
    state = _value_text('state')
    ownersCount = _value_text('ownersCount')
    pts = _value_text('pts')
    customs = _value_text('customs')

    # complectation: {group name: [option, ...]}
    complectation_dict = {}
    complectation = ticket_bs.find('section', class_='CardComplectation')
    if complectation is not None:
        for group in complectation.find_all('div', class_="ComplectationGroups__group"):
            name_tag = group.find('span', class_="ComplectationGroups__itemName")
            if name_tag is None:
                continue  # a group without a title cannot be keyed
            complectation_dict[name_tag.text] = [
                option.text
                for option in group.find_all('li', class_="ComplectationGroups__itemContentEl")
            ]

    # offer price, digits only
    price_tag = ticket_bs.find('span', class_='OfferPriceCaption__price')
    offerprice = _digits(price_tag.text) if price_tag is not None else None

    # assemble the row
    ind = ['year', 'kmAge', 'bodytype', 'color', 'volume', 'power', 'fuel_type', 'transmission',
           'drive', 'wheel', 'state', 'ownersCount', 'pts', 'customs', 'complectation', 'offerprice']
    vol = [year, kmAge, bodytype, color, volume, power, fuel_type, transmission,
           drive, wheel, state, ownersCount, pts, customs, complectation_dict, offerprice]
    data_row = pd.Series(data=vol, index=ind)

    return data_row

In [8]:
# Sample listing-card URLs kept for manual experiments.
model_list_urls = [
    'https://auto.ru/cars/used/sale/' + listing_path
    for listing_path in (
        'audi/a6/1102314663-084e0ff8/',
        'mazda/3/1101836763-72c1fd32/',
        'ford/mondeo/1101981472-3859561f/',
        'ford/mondeo/1102310519-7daac9c2/',
    )
]

## Создаем и наполняем marks_models_dict
 - словарь, в котором ключи — обозначения марок на сайте auto.ru, а значения — списки обозначений моделей каждой марки на сайте auto.ru

In [9]:
# Constants for the catalog crawl.

# Catalog page that lists every brand together with its models.
url_for_mmdict = 'https://auto.ru/catalog/cars/all/'

# Browser-like request headers sent with the catalog/listing requests.
# NOTE(review): the Cookie value is a hard-coded browser session (session id, CSRF token);
# it should be loaded from an environment variable or a local config file instead of
# being committed with the notebook.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'suid=bf4a59ff8840503c2077bf716a7bbeef.295e234731ada0bc538a541689345348; tmr_lvid=d2115cf4bc0ee3e6bc3ad89db8fdac9d; tmr_lvidTS=1596630011564; _ym_uid=1596630015684249973; _ga=GA1.2.1113974574.1596630016; autoruuid=g5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b; gids=213; gradius=200; mindboxDeviceUUID=a231f610-2f9e-4911-9cb8-9112afa13ab1; directCrm-session=%7B%22deviceGuid%22%3A%22a231f610-2f9e-4911-9cb8-9112afa13ab1%22%7D; tmr_reqNum=15; yuidlt=1; yandexuid=1471619051363959249; my=YysBgNU2AQEA; crookie=uCcId3jGlYfIvH+2UzzszOwTawIMGSzWumxanVb1Ras+Mu6qi+8yzj8EL+czftU9orxvkiKDKe/wgGsJDdJMlqyn6WU=; cmtchd=MTYxMjgyMjU3MTQ4MA==; _csrf_token=04c10009d8f17a08b4d507f70f2a287c28bf3c17f7b3ef42; gdpr=0; _ym_isad=2; index-selector-tab=marks; listing_view_session={}; listing_view=%7B%22output_type%22%3Anull%2C%22version%22%3A1%7D; autoru-visits-count=2; salon_phone_utms=utm_medium%3Dcpm%26utm_source%3Dauto-ru%26utm_campaign%3Dauto-ru_rus-r225_proauto-rk2021%26utm_content%3D113pa-100PRx40-otchety-o-proshlom-mashiny-ot-99-rublei_proauto-promo-page_rus-r225; hide-proauto-pimple=1; from=direct; autoru_sid=a%3Ag5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b%7C1613427368824.604800.BYczsuAhOkO7E_tI9WN3ZQ.vyyvypdKR_sNAGlonPspJxUInUX2GwuD7owqbI5Sw58; X-Vertis-DC=vla; _ym_d=1613250934; from_lifetime=1613250934966; cycada=FXXHRKJxPTj6XyBIJ1I0Or150N9cGJGcL1yh7v8BaXc=',
'Host': 'auto.ru',
'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
# header name fixed: it was misspelled as 'sec-ch-ua-mobil'
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}

In [10]:
# Maps each brand slug to the list of model slugs of that brand.
marks_models_dict = {}

for pages_num in tqdm(range(1,20)):

    # the first catalog page is requested without a page number
    params = {'view_type': 'list'} if pages_num == 1 else {'page_num': pages_num,'view_type': 'list'}

    res = requests.get(url_for_mmdict, params=params, headers=headers)
    time.sleep(1)

    soup = BeautifulSoup(res.text, 'html.parser')
    # one <dd> block per brand on the page
    marks_on_page_list = soup.find_all('dd', class_='catalog-all-text-list__desc')

    # a page without brand blocks ends the crawl
    if len(marks_on_page_list) == 0:
        break

    for mark in marks_on_page_list:
        # <a> elements of the brand's models
        models_of_mark = mark.find_all('a', class_='link_theme_auto')

        # the brand slug is the path segment right after 'cars/' in the first model link
        link_for_mark_name = models_of_mark[0].get('href')
        mark_start = link_for_mark_name.find('cars/') + len('cars/')
        mark_end = link_for_mark_name.find('/', mark_start)
        mark_name = link_for_mark_name[mark_start:mark_end]

        models_list = []
        for model in models_of_mark:
            # the model slug is the path segment that follows the brand slug
            link_theme_auto = model.get('href')
            model_start = mark_end + 1
            model_end = link_theme_auto.find('/', model_start)
            model_name = link_theme_auto[model_start:model_end]
            models_list.append(model_name)

        marks_models_dict[mark_name] = models_list

 74%|█████████████████████████████████████████████████████████████████████████████████▊                             | 14/19 [00:25<00:09,  1.82s/it]

## Цикл по всем моделям всех марок

In [11]:
# Walk every brand/model pair and build its used-car listing URL.
# The URL is only assigned here; nothing is fetched yet.
for mark in tqdm(marks_models_dict):
    mark_url_prefix = 'https://auto.ru/moskva/cars/' + mark + '/'
    for model in marks_models_dict[mark]:
        url = mark_url_prefix + model + '/used/'





100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<?, ?it/s]


In [None]:
# NOTE(review): unfinished placeholder — pd.concat() is called without the objects to
# concatenate, so running this cell as written raises a TypeError. Presumably meant to
# combine the per-listing rows; confirm intent before use.
pd.concat()

## Получение данных по одной модели одной марки

In [13]:
# Take the brand at position 11 of the dictionary and the first model of that brand.
mark = list(marks_models_dict)[11]
models_list = marks_models_dict[mark]
model = models_list[0]

# Used-car listing URL for that brand/model; shown as the cell output.
url = ''.join(('https://auto.ru/moskva/cars/', mark, '/', model, '/used/'))
url

'https://auto.ru/moskva/cars/audi/100/used/'

In [15]:
# Request the listing pages of the current brand/model one after another.
# NOTE(review): `tickets_on_page_list` is overwritten on every page, so after the loop
# stops on an empty page the variable holds an empty list.
for pages_num in tqdm(range(1,100)):

    # the first page is requested without a page parameter
    params = {} if pages_num == 1 else {'page': pages_num}

    res = requests.get(url, params=params, headers=headers)
    res.encoding = 'utf-8'
    time.sleep(1)

    # parse the listing page of the current brand/model
    soup = BeautifulSoup(res.text, 'html.parser')

    # title links of the listing cards on this page
    tickets_on_page_list = soup.find_all('a', class_='ListingItemTitle-module__link')

    # no cards on the page: the pages are exhausted
    if len(tickets_on_page_list) == 0:
        break
        
    


  0%|                                                                                                                        | 0/99 [00:00<?, ?it/s]
  1%|█▏                                                                                                              | 1/99 [00:02<03:23,  2.08s/it]
  2%|██▎                                                                                                             | 2/99 [00:03<03:12,  1.98s/it]
  3%|███▍                                                                                                            | 3/99 [00:05<02:56,  1.83s/it]

In [59]:
# Stand-in for the first listing page (later pages would use {'page': page_num}).
params = dict()

res = requests.get(url, headers=headers, params=params)
res.encoding = 'utf-8'

# parse the listing page of the current brand/model
soup = BeautifulSoup(res.text, 'html.parser')

# title links of the listing cards on the page
tickets_on_page_list = soup.find_all('a', class_='ListingItemTitle-module__link')

In [106]:
# 
# Scrape every listing card of the current page and keep the parsed rows.
# The Series returned by get_features_from_ticket used to be discarded; the rows are now
# collected in a list and turned into a DataFrame once, after the loop.
ticket_rows = []
for ticket in tqdm(tickets_on_page_list):
    # URL of the current listing card
    ticket_url = ticket.get('href')

    # download the card HTML
    # NOTE(review): unlike the listing requests, this one is sent without `headers`;
    # confirm that is intended.
    ticket_res = requests.get(ticket_url)
    ticket_res.encoding = 'utf-8'
    time.sleep(1)

    # parse the card HTML
    ticket_bs = BeautifulSoup(ticket_res.text, 'html.parser')

    # extract the features of this card
    ticket_rows.append(get_features_from_ticket(ticket_bs))

# one row per listing card, one column per feature
tickets_df = pd.DataFrame(ticket_rows)






  0%|                                                                                                                        | 0/38 [00:00<?, ?it/s]




  3%|██▉                                                                                                             | 1/38 [00:01<01:04,  1.75s/it]




  5%|█████▉                                                                                                          | 2/38 [00:04<01:28,  2.46s/it]


KeyboardInterrupt: 

In [17]:
# Stand-in for a single listing card.
# NOTE(review): index 1 is the second link on the page, and the lookup raises IndexError
# when the page list holds fewer than two links — confirm which card is wanted.
sample_ticket = tickets_on_page_list[1]

# URL of the listing card
ticket_url = sample_ticket.get('href')

# download the card HTML
ticket_res = requests.get(ticket_url)
ticket_res.encoding = 'utf-8'

# parse the card HTML
ticket_bs = BeautifulSoup(ticket_res.text, 'html.parser')

IndexError: list index out of range

In [16]:
# Parse the card held in `ticket_bs`; the returned Series is the cell output.
# Requires `ticket_bs` to have been created by an earlier cell.
get_features_from_ticket(ticket_bs)

NameError: name 'ticket_bs' is not defined

# Подвал

for loc in tqdm(range(0,len(unuseful_features_LR),1)):

In [21]:
# Alternative set of browser-like request headers.
# The trailing comma that followed the closing brace has been removed: it turned
# `hdrs_office` into a one-element tuple instead of a dict.
# NOTE(review): the Cookie value is a hard-coded browser session and should not be
# committed; the User-Agent contains a stray ',' before 'Win64' — confirm and clean up.
hdrs_office = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'suid=3e1833cc7469d2dc00ccae04449006c4.9491b97d8d423e5ce05ef1c79af8855d; _ym_uid=1599469037495440295; _ga=GA1.2.1600002130.1601280177; _csrf_token=1d7055285f2e0f66d88a7fe1fc75e2eec8ae6cd772464581; autoru_sid=a%3Ag60264c2d2igjpia4p3it7lul5qeu67s.cda7ff5877363e08fe2b0faf7c62fcdd%7C1613122605806.604800.gg8JxiHvE9AyBZihcInqEA.Ld4FRMZg9Xk8q0iFZFqjpOoMCJDw6N7itfobSBJS6WA; autoruuid=g60264c2d2igjpia4p3it7lul5qeu67s.cda7ff5877363e08fe2b0faf7c62fcdd; from=direct; X-Vertis-DC=vla; yuidlt=1; yandexuid=905825121577456505; my=YyYBAS4BAToBAQA%3D; counter_ga_all7=2; gdpr=0; _ym_isad=2; gids=213; _gid=GA1.2.204714820.1613122653; autoru-visits-count=1; from_lifetime=1613124329322; _ym_d=1613124329; cycada=QtZcIspPQofWD6yVq1tXyo1meqCq6CwJcSH8NY94K9c=',
'Host': 'auto.ru',
'Referer': 'https://auto.ru/moskva/cars/used/',
'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; ,Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
}

In [32]:
# Reduced set of browser-like request headers: only Accept, Accept-Language, Cookie,
# Host and User-Agent are active; the remaining entries are left commented out.
# NOTE(review): the Cookie value is a hard-coded browser session and should not be
# committed with the notebook.
hdrs_home = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
# 'Connection': 'keep-alive',
'Cookie': 'suid=bf4a59ff8840503c2077bf716a7bbeef.295e234731ada0bc538a541689345348; tmr_lvid=d2115cf4bc0ee3e6bc3ad89db8fdac9d; tmr_lvidTS=1596630011564; _ym_uid=1596630015684249973; _ga=GA1.2.1113974574.1596630016; autoruuid=g5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b; gids=213; gradius=200; mindboxDeviceUUID=a231f610-2f9e-4911-9cb8-9112afa13ab1; directCrm-session=%7B%22deviceGuid%22%3A%22a231f610-2f9e-4911-9cb8-9112afa13ab1%22%7D; tmr_reqNum=15; yuidlt=1; yandexuid=1471619051363959249; my=YysBgNU2AQEA; autoru_sid=a%3Ag5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b%7C1612822568824.604800.vXs7yiSNjHUj9vs_NTRT7Q.Ab6eLoCklN5OVQvHKRzq30jYE9puDG83Hht-435SfG4; crookie=uCcId3jGlYfIvH+2UzzszOwTawIMGSzWumxanVb1Ras+Mu6qi+8yzj8EL+czftU9orxvkiKDKe/wgGsJDdJMlqyn6WU=; cmtchd=MTYxMjgyMjU3MTQ4MA==; _csrf_token=04c10009d8f17a08b4d507f70f2a287c28bf3c17f7b3ef42; X-Vertis-DC=vla; gdpr=0; _ym_isad=2; index-selector-tab=marks; listing_view_session={}; listing_view=%7B%22output_type%22%3Anull%2C%22version%22%3A1%7D; autoru-visits-count=2; salon_phone_utms=utm_medium%3Dcpm%26utm_source%3Dauto-ru%26utm_campaign%3Dauto-ru_rus-r225_proauto-rk2021%26utm_content%3D113pa-100PRx40-otchety-o-proshlom-mashiny-ot-99-rublei_proauto-promo-page_rus-r225; hide-proauto-pimple=1; from=direct; _ym_d=1613218878; from_lifetime=1613218878985; cycada=S1YNtBaS32C5Sgb39LwY/7150N9cGJGcL1yh7v8BaXc=',
'Host': 'auto.ru',
# 'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
# 'sec-ch-ua-mobile': '?0',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': 'none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}

```python
url = 'https://auto.ru/moskva/cars/audi/100/used/'   
response = requests.get(url)
page = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text, 'html.parser')  
data = soup.find_all('div', class_ ="LayoutSidebar")
df = pd.read_html(str(data))[0]
df
```

In [None]:
def features_parser(url_list):
    """
    Download every listing card in `url_list` and store its features in `data_train`.

    url_list: iterable of str
        URLs of listing cards; the card at position i fills row i.

    For each card the function writes one cell per feature with
    `data_train.at[row, column]`. `data_train` is not defined in this function and
    must exist in the enclosing scope (presumably a pandas DataFrame — verify).
    A feature whose element is not found on the page is stored as None.
    Nothing is returned.
    """
    # (column, itemprop): value is read from the element's `content` attribute
    itemprop_columns = (
        ('bodyType', 'bodyType'),
        ('brand', 'brand'),
        ('color', 'color'),
        ('fuelType', 'fuelType'),
        ('modelDate', 'modelDate'),
        ('numberOfDoors', 'numberOfDoors'),
        ('productionDate', 'productionDate'),
        ('vehicleTransmission', 'vehicleTransmission'),
        ('engineDisplacement', 'engineDisplacement'),
        ('enginePower', 'enginePower'),
    )
    # (column, class attribute, substrings removed from the element text, in order)
    text_columns = (
        ('mileage', 'CardInfoRow CardInfoRow_kmAge', ('Пробег', '\xa0', 'км')),
        ('Привод', 'CardInfoRow CardInfoRow_drive', ('Привод',)),
        ('Руль', 'CardInfoRow CardInfoRow_wheel', ('Руль',)),
        ('Владельцы', 'CardInfoRow CardInfoRow_ownersCount', ('Владельцы', '\xa0')),
        ('ПТС', 'CardInfoRow CardInfoRow_pts', ('ПТС',)),
        ('price', 'OfferPriceCaption__price', ('\xa0', '₽')),
    )

    for row_idx, card_url in enumerate(url_list):
        response = requests.get(card_url)
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')

        # features exposed through itemprop attributes
        for column, prop in itemprop_columns:
            tag = page.find(itemprop=prop)
            data_train.at[row_idx, column] = None if tag is None else tag['content']

        # features taken from the visible text, with the label/unit pieces stripped
        for column, css_class, junk in text_columns:
            tag = page.find(class_=css_class)
            if tag is None:
                data_train.at[row_idx, column] = None
                continue
            value = tag.text
            for piece in junk:
                value = value.replace(piece, '')
            data_train.at[row_idx, column] = value

In [20]:
# Scratch cell: an unassigned header dict, evaluated only to display it.
# NOTE(review): contains a hard-coded browser session cookie; remove before sharing.
{
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection': 'keep-alive',
'Cookie': 'suid=bf4a59ff8840503c2077bf716a7bbeef.295e234731ada0bc538a541689345348; tmr_lvid=d2115cf4bc0ee3e6bc3ad89db8fdac9d; tmr_lvidTS=1596630011564; _ym_uid=1596630015684249973; _ga=GA1.2.1113974574.1596630016; autoruuid=g5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b; gids=213; gradius=200; mindboxDeviceUUID=a231f610-2f9e-4911-9cb8-9112afa13ab1; directCrm-session=%7B%22deviceGuid%22%3A%22a231f610-2f9e-4911-9cb8-9112afa13ab1%22%7D; tmr_reqNum=15; yuidlt=1; yandexuid=1471619051363959249; my=YysBgNU2AQEA; autoru_sid=a%3Ag5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b%7C1612822568824.604800.vXs7yiSNjHUj9vs_NTRT7Q.Ab6eLoCklN5OVQvHKRzq30jYE9puDG83Hht-435SfG4; crookie=uCcId3jGlYfIvH+2UzzszOwTawIMGSzWumxanVb1Ras+Mu6qi+8yzj8EL+czftU9orxvkiKDKe/wgGsJDdJMlqyn6WU=; cmtchd=MTYxMjgyMjU3MTQ4MA==; _csrf_token=04c10009d8f17a08b4d507f70f2a287c28bf3c17f7b3ef42; X-Vertis-DC=vla; gdpr=0; _ym_isad=2; index-selector-tab=marks; listing_view_session={}; listing_view=%7B%22output_type%22%3Anull%2C%22version%22%3A1%7D; autoru-visits-count=2; salon_phone_utms=utm_medium%3Dcpm%26utm_source%3Dauto-ru%26utm_campaign%3Dauto-ru_rus-r225_proauto-rk2021%26utm_content%3D113pa-100PRx40-otchety-o-proshlom-mashiny-ot-99-rublei_proauto-promo-page_rus-r225; hide-proauto-pimple=1; from=direct; _ym_d=1613218878; from_lifetime=1613218878985; cycada=S1YNtBaS32C5Sgb39LwY/7150N9cGJGcL1yh7v8BaXc=',
'Host': 'auto.ru',
'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}

{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'Accept-Encoding': 'gzip, deflate, br',
 'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
 'Connection': 'keep-alive',
 'Cookie': 'suid=bf4a59ff8840503c2077bf716a7bbeef.295e234731ada0bc538a541689345348; tmr_lvid=d2115cf4bc0ee3e6bc3ad89db8fdac9d; tmr_lvidTS=1596630011564; _ym_uid=1596630015684249973; _ga=GA1.2.1113974574.1596630016; autoruuid=g5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b; gids=213; gradius=200; mindboxDeviceUUID=a231f610-2f9e-4911-9cb8-9112afa13ab1; directCrm-session=%7B%22deviceGuid%22%3A%22a231f610-2f9e-4911-9cb8-9112afa13ab1%22%7D; tmr_reqNum=15; yuidlt=1; yandexuid=1471619051363959249; my=YysBgNU2AQEA; autoru_sid=a%3Ag5f949c622cu74bpb1qps6a12l6p5u7t.07262c19ce2a1f07cac53c4ce06ef69b%7C1612822568824.604800.vXs7yiSNjHUj9vs_NTRT7Q.Ab6eLoCklN5OVQvHKRzq30jYE9puDG83Hht-435SfG4; crookie=uCcId3jGlY

In [3]:
# Scratch note: catalog URLs in list view, first page and page 2.
# These are bare string expressions; only the last one is shown as the cell output.
'https://auto.ru/catalog/cars/used/?view_type=list'
'https://auto.ru/catalog/cars/used/?page_num=2&view_type=list'

'https://auto.ru/catalog/cars/used/?page_num=2&view_type=list'

In [None]:
# Print every brand of the current catalog page with its model count and model slugs.
for mark in marks_on_page_list:
    # <a> elements of the brand's models
    models_of_mark = mark.find_all('a', class_='link_theme_auto')

    # the brand slug is the path segment right after 'cars/' in the first model link
    link_for_mark_name = models_of_mark[0].get('href')
    mark_start = link_for_mark_name.find('cars/') + len('cars/')
    mark_end = link_for_mark_name.find('/', mark_start)
    mark_name = link_for_mark_name[mark_start:mark_end]

    print(mark_name, len(models_of_mark))

    for model in models_of_mark:
        # the model slug is the path segment that follows the brand slug
        link_theme_auto = model.get('href')
        model_start = mark_end + 1
        model_end = link_theme_auto.find('/', model_start)
        model_name = link_theme_auto[model_start:model_end]
        print('\t',model_name)

In [None]:
# Fetch the first two pages in list view and parse both.
# Fix: the requests were issued as `re.get(...)`, but `re` is the regular-expression
# module and has no `get`; the HTTP calls must go through `requests`.
# NOTE(review): `page_num` / `view_type` are the parameters this notebook uses with
# `url_for_mmdict`; confirm that `url` holds the catalog URL when this cell runs.
params_first_page = {
    'view_type': 'list'
}
res = requests.get(url, params=params_first_page, headers=headers)

pages_num = 2

params_next_pages = {
    'page_num': pages_num,
    'view_type': 'list'
}

res_2 = requests.get(url, params=params_next_pages, headers=headers)

res.status_code,res_2.status_code

soup = BeautifulSoup(res.text, 'html.parser')
soup_2 = BeautifulSoup(res_2.text, 'html.parser')

In [None]:
# Scratch: extract the brand and model slugs from the first model link of the
# second brand block on the parsed catalog page.
marks_on_page_list = soup.find_all('dd', class_='catalog-all-text-list__desc')
models_of_mark = marks_on_page_list[1].find_all('a', class_='link_theme_auto')
link_theme_auto = models_of_mark[0].get('href')

# brand slug: the path segment right after 'cars/'
mark_start = link_theme_auto.find('cars/') + len('cars/')
mark_end = link_theme_auto.find('/', mark_start)

# model slug: the path segment that follows the brand slug
model_start = mark_end + 1
model_end = link_theme_auto.find('/', model_start)

# (brand slug, model slug) is the cell output
(link_theme_auto[mark_start:mark_end], link_theme_auto[model_start:model_end])

In [35]:
# Scratch: add the first complectation group to `complectation_dict`
# (group title -> list of option texts).
first_group = complectation_items[0]
item_name = first_group.find('span',class_="ComplectationGroups__itemName").text
item_content = []
for option in first_group.find_all('li',class_="ComplectationGroups__itemContentEl"):
    item_content.append(option.text)

complectation_dict[item_name] = item_content

In [83]:
# Inline extraction of the features of the card held in `ticket_bs`.
# Changes against the earlier draft of this cell:
#   * regex patterns are raw strings ("\D" in a plain string is an invalid escape sequence);
#   * a card without a complectation section no longer raises AttributeError;
#   * the engine row is read once instead of three times.

# plain features (everything except the complectation)
year = ticket_bs.find('li',class_='CardInfoRow_year').find('a').text
kmAge = ticket_bs.find('li',class_='CardInfoRow_kmAge').find_all('span')[1].text
kmAge = re.sub(r"\D", "", kmAge)
bodytype = ticket_bs.find('li',class_='CardInfoRow_bodytype').find('a').text
color = ticket_bs.find('li',class_='CardInfoRow_color').find('a').text

# the engine row is one "volume / power / fuel" string
engine_parts = ticket_bs.find('li',class_='CardInfoRow_engine').find('div').text.split(' / ')
volume = re.sub(r"[^\d.]", "", engine_parts[0])
power = re.sub(r"\D", "", engine_parts[1])
fuel_type = engine_parts[2]

transmission = ticket_bs.find('li',class_='CardInfoRow_transmission').find_all('span')[1].text
drive = ticket_bs.find('li',class_='CardInfoRow_drive').find_all('span')[1].text
wheel = ticket_bs.find('li',class_='CardInfoRow_wheel').find_all('span')[1].text
state = ticket_bs.find('li',class_='CardInfoRow_state').find_all('span')[1].text
ownersCount = ticket_bs.find('li',class_='CardInfoRow_ownersCount').find_all('span')[1].text
pts = ticket_bs.find('li',class_='CardInfoRow_pts').find_all('span')[1].text
customs = ticket_bs.find('li',class_='CardInfoRow_customs').find_all('span')[1].text
offerprice = ticket_bs.find('span',class_='OfferPriceCaption__price').text

# complectation: {group title: [option, ...]}; stays empty when the card has no such section
complectation_dict = {}
complectation = ticket_bs.find('section',class_='CardComplectation')
complectation_items = []
if complectation is not None:
    complectation_items = complectation.find_all('div',class_="ComplectationGroups__group")

for item in complectation_items:
    item_name = item.find('span',class_="ComplectationGroups__itemName").text
    item_content = [x.text for x in item.find_all('li',class_="ComplectationGroups__itemContentEl")]
    complectation_dict[item_name] = item_content

# offer price, digits only
offerprice = re.sub(r"\D", "", offerprice)

# assemble the row; it is the cell output
ind = ['year','kmAge' ,'bodytype' ,'color' ,'volume' ,'power' ,'fuel_type','transmission',
 'drive','wheel','state','ownersCount','pts','customs','complectation','offerprice']
vol = [year,kmAge ,bodytype ,color ,volume ,power ,fuel_type,transmission,
       drive,wheel,state,ownersCount,pts,customs,complectation_dict,offerprice]
data_row = pd.Series(data=vol,index=ind)
data_row

year                                                          1993
kmAge                                                       786115
bodytype                                                     седан
color                                                        синий
volume                                                         2.3
power                                                          133
fuel_type                                                   Бензин
transmission                                          механическая
drive                                                     передний
wheel                                                        Левый
state                                           Не требует ремонта
ownersCount                                            3 или более
pts                                                       Оригинал
customs                                                 Растаможен
complectation    {'Элементы экстерьера': ['Легкосплавные диски