### Парсинг телефонов с Авито

In [8]:
import sys
# Put '../..' (resolved against the current working directory) on the module
# search path so the `src.parser` imports in the next cell can be resolved.
# NOTE(review): presumably this is the repository root — confirm.
sys.path.append('../..')


In [9]:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from kafka import KafkaProducer
import psycopg2
import json
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager
import random

from src.parser.models import Product, Seller
from src.parser.parser import parse_avito_page
from src.parser.tools import get_ad_urls

### Константы

In [10]:
# Listing-page URL without the page number; the crawl loop appends the number.
URL_TEMPLATE = "https://www.avito.ru/sankt-peterburg/telefony/mobilnye_telefony/apple-ASgBAgICAkS0wA3OqzmwwQ2I_Dc?p="
# Number of listing pages covered by one run.
NUM_PAGES = 10


def _to_json_bytes(payload):
    """Serialize a message payload to UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode("utf-8")


# Kafka producer shared by the send_* helpers; every value is sent as JSON.
producer = KafkaProducer(
    value_serializer=_to_json_bytes,
    bootstrap_servers="localhost:9092",
)

def send_phone(data):
    """Publish one phone listing to the ``phone_listings`` Kafka topic.

    Args:
        data: Payload handed to the module-level ``producer``, which
            serializes it with its configured value serializer.

    The producer is flushed right after the send, so the call does not
    return until buffered messages have been handed off.
    """
    topic = "phone_listings"
    producer.send(topic, value=data)
    producer.flush()

def send_seller(data):
    """Publish one seller record to the ``seller_listings`` Kafka topic.

    Args:
        data: Payload handed to the module-level ``producer``, which
            serializes it with its configured value serializer.

    The producer is flushed right after the send, so the call does not
    return until buffered messages have been handed off.
    """
    topic = "seller_listings"
    producer.send(topic, value=data)
    producer.flush()


# Keys that already exist in the database, loaded once up front so the crawl
# does not publish a listing or a seller that is already in the tables.
# Both collections stay plain lists because the crawl loop appends to them.
# NOTE(review): the credentials are hard-coded in the DSN; consider reading
# them from the environment or a config file.
conn = psycopg2.connect("dbname=mydatabase user=myuser password=mypassword host=localhost port=5432")
try:
    # The cursor context manager closes the cursor even if a query raises.
    with conn.cursor() as cursor:
        cursor.execute("SELECT link FROM product")
        product_ids = [row[0] for row in cursor.fetchall()]

        cursor.execute("SELECT seller_id FROM seller")
        seller_ids = [row[0] for row in cursor.fetchall()]
finally:
    # Always release the connection; `with conn:` would only end the
    # transaction, not close the connection, so close it explicitly.
    conn.close()

### Запуск сервиса

In [11]:
# Resolve a chromedriver binary through webdriver_manager and wrap it in a
# Selenium Service.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

options = webdriver.ChromeOptions()
# Headless mode is currently switched off; re-enable it with:
# options.add_argument('--headless')
driver = webdriver.Chrome(options=options, service=service)

# Browser fingerprint settings applied to the driver by selenium-stealth.
stealth_settings = {
    "languages": ["en-US", "en"],
    "vendor": "Google Inc.",
    "platform": "Win32",
    "webgl_vendor": "Intel Inc.",
    "renderer": "Intel Iris OpenGL Engine",
    "fix_hairline": True,
}
stealth(driver, **stealth_settings)

### Диапазон страниц для парсинга

In [12]:
# Half-open range of listing pages to crawl: the start page is included,
# the end page is not.
# NOTE(review): in the captured run page 0 yielded no ad links — confirm
# whether the site's page numbering starts at 1.
start_page_number = 0
end_page_number = NUM_PAGES + start_page_number

for message in (
    f"Начнем парсинг со страницы номер: {start_page_number} (вкл)",
    f"Закончим парсинг на странице номер: {end_page_number} (не вкл)",
):
    print(message)

Начнем парсинг со страницы номер: 0 (вкл)
Закончим парсинг на странице номер: 10 (не вкл)


### Парсинг

In [13]:
# Crawl every listing page in [start_page_number, end_page_number): collect the
# ad links, open each ad that has not been seen yet, validate the parsed data
# and publish it to Kafka. A failure on one ad or one page is printed and the
# crawl moves on to the next item.
for page_num in range(start_page_number, end_page_number):
    print(f"Начали парсинг страницы # {page_num}")
    try:
        # Load the listing page, then pause for a random 2-5 seconds.
        driver.get(URL_TEMPLATE + str(page_num))
        time.sleep(random.randint(2, 5))

        # Parse the rendered HTML and pull out the ad links on this page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links = get_ad_urls(soup)

        for link in tqdm(links):
            # The link without its query string is the deduplication key.
            prepared_link = link.split('?')[0]
            if prepared_link in product_ids:
                continue
            try:
                # Open the ad page, then pause for a random 2-5 seconds.
                driver.get(link)
                time.sleep(random.randint(2, 5))

                # parse_avito_page returns the ad, its seller and the seller's
                # completed deals.
                ad_data, seller_data, done_deals_data = parse_avito_page(driver=driver)
                ad_data['link'] = prepared_link

                # Validate both records with Pydantic before publishing, so an
                # ad with an invalid seller is not sent on its own.
                product = Product.model_validate(ad_data)
                seller = Seller.model_validate(seller_data)

                # Publish first and remember the key afterwards: a failed send
                # must not mark the listing as already handled.
                send_phone(product.model_dump())
                product_ids.append(prepared_link)

                # The seller and their completed deals are published only the
                # first time the seller is encountered.
                seller_id = seller_data['seller_id']
                if seller_id in seller_ids:
                    continue
                send_seller(seller.model_dump())
                seller_ids.append(seller_id)

                for deal_data in done_deals_data:
                    try:
                        deal = Product.model_validate(deal_data)
                    except ValueError as e:
                        # Pydantic's ValidationError derives from ValueError.
                        # One malformed deal must not discard the other deals.
                        print(f"Пропускаем некорректную завершенную сделку продавца {seller_id}: {e}")
                        continue
                    send_phone(deal.model_dump())
            except TimeoutException:
                print(f"Ошибка: объявление {link} не загрузилось, пропускаем...")
            except Exception as e:
                # Per-ad boundary: report the problem and go on to the next ad.
                print(f"Произошла ошибка при обработке объявления {link}: {e}")
    except TimeoutException:
        print(f"Ошибка: страница {page_num} не загрузилась, пропускаем...")
    except Exception as e:
        # Per-page boundary: report the problem and go on to the next page.
        print(f"Произошла ошибка на странице {page_num}: {e}")


Начали парсинг страницы # 0


0it [00:00, ?it/s]


Начали парсинг страницы # 1


  0%|          | 0/82 [00:00<?, ?it/s]

Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'


  7%|▋         | 6/82 [00:13<02:47,  2.21s/it]

Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_

 12%|█▏        | 10/82 [00:23<02:49,  2.35s/it]

Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'
Error parsing product: 'NoneType' object has no attribute 'find_next_sibling'


 21%|██        | 17/82 [00:28<01:48,  1.67s/it]


KeyboardInterrupt: 

In [None]:
# Shut down the browser and end the WebDriver session started above.
driver.quit()