In [1]:
import math
import json
import traceback

import requests
from bs4 import BeautifulSoup

from stem import Signal
from stem.control import Controller

from fake_useragent import UserAgent

import pandas as pd

from tqdm.notebook import tqdm

from selenium import webdriver

In [2]:
def create_driver():
    """Build a headless Chrome WebDriver routed through the local SOCKS5 proxy.

    Returns:
        The ``webdriver.Chrome`` instance created with the proxy and headless
        arguments applied. The caller is responsible for quitting it.
    """
    proxy_address = 'localhost:9050'
    chrome_options = webdriver.ChromeOptions()
    for argument in ('--proxy-server=socks5://' + proxy_address, '--headless'):
        chrome_options.add_argument(argument)
    return webdriver.Chrome(options=chrome_options)

def close_driver(driver):
    """Shut down ``driver`` by calling its ``quit()`` method.

    Args:
        driver: Object exposing ``quit()`` (a Selenium driver in this notebook).
    """
    shutdown = driver.quit
    shutdown()

def get_request(url, with_driver=False, driver=None):
    """Fetch ``url`` through the local SOCKS proxy and return the page HTML.

    ``renew_tor_ip()`` is called before every download.

    Args:
        url: Address of the page to download.
        with_driver: If true, load the page in a Selenium driver and return its
            ``page_source``; otherwise download it with a ``requests`` session.
        driver: Existing driver to reuse when ``with_driver`` is true; it is
            left open for the caller. When omitted, a temporary driver is
            created and always closed again, even if loading the page fails.

    Returns:
        The page source as text, or ``None`` if the download raised an
        exception (the error message is printed).
    """
    renew_tor_ip()

    try:
        if with_driver:
            owns_driver = driver is None
            if owns_driver:
                driver = create_driver()
            try:
                driver.get(url)
                return driver.page_source
            finally:
                # Only close drivers created here; a caller-supplied driver
                # stays open so it can be reused.
                if owns_driver:
                    close_driver(driver)

        # The random User-Agent header is only needed on the requests path.
        headers = {'User-Agent': UserAgent().chrome}
        proxy_url = 'socks5h://localhost:9050'
        with requests.Session() as session:
            session.proxies = {'http': proxy_url, 'https': proxy_url}
            return session.get(url, headers=headers).text
    except Exception as e:
        print(str(e))
        return None

def get_current_ip():
    """Return the public IP address httpbin.org sees for proxied requests.

    Returns:
        The ``origin`` field of the ``http://httpbin.org/ip`` JSON response, or
        ``None`` if the request failed or the response could not be decoded
        (the error message is printed).
    """
    proxy_url = 'socks5h://localhost:9050'

    try:
        # Send the request through the local SOCKS proxy (Tor).
        with requests.Session() as session:
            session.proxies = {'http': proxy_url, 'https': proxy_url}
            response = session.get('http://httpbin.org/ip')
            # Read the field from the parsed JSON instead of relying on the
            # whitespace layout of the raw response text.
            return response.json()['origin']
    except Exception as e:
        print(str(e))
        return None


def renew_tor_ip(port=9051, password=None):
    """Send the ``NEWNYM`` signal to the local Tor controller.

    Args:
        port: Control port to connect to.
        password: Control-port password. When omitted, the
            ``TOR_CONTROL_PASSWORD`` environment variable is used, falling back
            to the value that was previously hard-coded here.
    """
    import os  # local import so the function does not depend on another cell

    if password is None:
        # NOTE(review): the fallback keeps existing setups working, but the
        # secret is better kept in the environment than in the notebook.
        password = os.environ.get('TOR_CONTROL_PASSWORD', 'tor_for_parsing')

    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)

In [3]:
# Read the previously collected listing URLs from disk.
with open('volk_polo_pages.json', 'r') as pages_file:
    cars_pages = json.loads(pages_file.read())

In [4]:
# Total vs. distinct URL count: a mismatch means the list contains duplicates.
tuple(len(collection) for collection in (cars_pages, set(cars_pages)))

(887, 886)

In [5]:
# Drop duplicate URLs while keeping their original order, so repeated runs
# visit the pages in the same sequence (a plain set() has no stable order).
cars_pages = list(dict.fromkeys(cars_pages))

In [23]:
def _get_car_info(page, url):


    features_dict = dict()
    
#     car_brand = url.split('/')[-4]
#     car_model = url.split('/')[-3]
    
#     features_dict.update({'brand': car_brand, 'model': car_model})

    new_or_used = url.split('/')[4]
    features_dict.update({'new_or_used': new_or_used})
    
    if page.select_one('div[class*=CardSold__title]'):
        print('CAR SOLD')
        return None
    
    info = page.select_one('ul[class*=CardInfo]:not(ul[class*=CardInfoGrouped__list])')
    if info:
        features = info.find_all('li')
        for feature in features:
            spans = feature.find_all('span')
            feature_name = spans[0].text.strip().lower().replace('\xa0', '')
            feature_value = spans[1].text.strip().lower().replace('\xa0', '')

            features_dict.update({feature_name: feature_value})
            
    else:
        info = page.select_one('ul[class*=CardInfoGrouped__list]')
        features = info.find_all('li')
        for feature in features:
            feature_div = feature.find_all('div')
            
            name = feature_div[1]
            value = feature_div[0]
                
            feature_name = name.text.strip().lower().replace('\xa0', '')
            feature_value = value.text.strip().lower().replace('\xa0', '').replace(feature_name, '')

            features_dict.update({feature_name: feature_value})
        
    description = page.find('div', {'class': 'CardDescription__textInner'})
    if description:
        description_text = description.get_text(separator=" ").strip().strip().lower().replace('\xa0', '')
    else:
        description_text = ''
        
    features_dict.update({'описание': description_text})
    
    price = page.find('span', {'class': 'OfferPriceCaption__price'}).text.strip().lower().replace('\xa0', '')
    features_dict.update({'цена': price})
    
    
    try:
        benefits = page.find_all('div', {'class': 'CardBenefits__item-title'})
        for benefit in benefits:
            benefit_name = benefit.text.strip().lower().replace('\xa0', '')
            features_dict.update({benefit_name: True})
    except Exception as err:
        # print(repr(err))
        pass
    
    return features_dict

In [24]:
def get_car_info(car_page, with_driver=False, driver=None):
    """Download one listing page and parse it into a feature dict.

    Args:
        car_page: URL of the listing page.
        with_driver: Passed through to ``get_request``.
        driver: Passed through to ``get_request``.

    Returns:
        Whatever ``_get_car_info`` returns for the parsed page.

    Raises:
        ConnectionError: If ``get_request`` returned ``None``, i.e. the page
            could not be downloaded.
    """
    resp = get_request(car_page, with_driver=with_driver, driver=driver)
    if resp is None:
        # Fail with a clear message instead of handing None to the HTML parser.
        raise ConnectionError(f'failed to download {car_page}')

    soup = BeautifulSoup(resp, 'html.parser')
    return _get_car_info(page=soup, url=car_page)

def try_to_get_info_5_times(car_page, i, with_driver=True, max_retries=1):
    """Parse one listing page, retrying with a fresh driver after a failure.

    Despite the name, the default allows a single retry (two attempts in
    total); raise ``max_retries`` for more.

    Args:
        car_page: URL of the listing page.
        i: Number of retries already used; callers pass ``0``.
        with_driver: If true, every attempt runs in its own newly created
            driver; otherwise the page is fetched without a browser.
        max_retries: Highest retry count allowed before giving up.

    Returns:
        The dict produced by ``get_car_info``, or ``None`` when that function
        returned ``None`` or every attempt raised.
    """
    retries_used = i
    while True:
        try:
            web_driver = create_driver() if with_driver else None
            try:
                return get_car_info(car_page, with_driver=with_driver, driver=web_driver)
            finally:
                # Always release the browser, including when parsing raised;
                # otherwise each failure leaves a headless Chrome running.
                if web_driver is not None:
                    web_driver.quit()
        except Exception as err:
            print(repr(err))
            print(''.join(traceback.format_tb(err.__traceback__)))
            if retries_used >= max_retries:
                return None
            retries_used += 1

In [22]:
# Smoke test on a single listing. try_to_get_info_5_times creates and closes
# its own driver, so no extra driver is opened here.
# Alternative sample (new-car listing):
# page = 'https://auto.ru/cars/new/group/volkswagen/polo/21802425/22153536/1102148992-db8a69e3/'
page = 'https://auto.ru/cars/used/sale/volkswagen/polo/1101696001-9c15b8b3/'
car_info = try_to_get_info_5_times(page, 0, with_driver=True)
print(car_info)

['https:', '', 'auto.ru', 'cars', 'used', 'sale', 'volkswagen', 'polo', '1101696001-9c15b8b3', '']
{'год выпуска': '1998', 'пробег': '310000км', 'кузов': 'седан', 'цвет': 'зелёный', 'двигатель': '1.6 л / 100л.с. / бензин', 'налог': '1150₽ / год', 'коробка': 'механическая', 'привод': 'передний', 'руль': 'левый', 'состояние': 'не требует ремонта', 'владельцы': '3 или более', 'птс': 'дубликат', 'таможня': 'растаможен', 'vin': 'wvw**************', 'госномер': '******|71', 'описание': 'продам фольца. сразу о недостатках. надо поменять внутреннюю правую гранату ( на третьей передачи начинается небольшая вибрация при разгоне), расколота передняя правая фара (стекло), есть небольшая дырка в полу не кретичная ( зачищена и обработана пушсалом). в целом кузов на твёрдую 4. оцинкован. датчик топлива не правильно показывает. стоит сигналка с авто запуском, ( но недоходит до стартера, где-то обрыв). по недостаткам всё.     движок не дымит, масло не ест, работает ровно, очень шустрый, 100л.с. передач

In [25]:
# Scrape every listing page. Rows are collected in a list and converted to a
# DataFrame in one go: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
records = []
err_pages = 0

try:
    for page in tqdm(cars_pages):
        try:
            car_info = try_to_get_info_5_times(page, 0, with_driver=True)
            if car_info:
                records.append(car_info)
                # Checkpoint every 10 parsed pages so a crash does not lose the run.
                if len(records) % 10 == 0:
                    pd.DataFrame(records).to_csv('volk_polo_dataset.csv')
            else:
                err_pages += 1
                print(f'pages not parsed: {err_pages}')
                print(page)
        except Exception as err:
            print(repr(err))
            print(page)
finally:
    # Build the frame even if the run is interrupted, so partial results stay
    # available to the cells below.
    df = pd.DataFrame(records)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=886.0), HTML(value='')))

AttributeError("'NoneType' object has no attribute 'find_all'")
  File "<ipython-input-24-3e3bd8e6baf6>", line 13, in try_to_get_info_5_times
    car_info = get_car_info(car_page, with_driver=with_driver, driver=web_driver)
  File "<ipython-input-24-3e3bd8e6baf6>", line 4, in get_car_info
    car_info = _get_car_info(page=soup, url=car_page)
  File "<ipython-input-23-13ee0c38d40f>", line 30, in _get_car_info
    features = info.find_all('li')

AttributeError("'NoneType' object has no attribute 'find_all'")
  File "<ipython-input-24-3e3bd8e6baf6>", line 13, in try_to_get_info_5_times
    car_info = get_car_info(car_page, with_driver=with_driver, driver=web_driver)
  File "<ipython-input-24-3e3bd8e6baf6>", line 4, in get_car_info
    car_info = _get_car_info(page=soup, url=car_page)
  File "<ipython-input-23-13ee0c38d40f>", line 30, in _get_car_info
    features = info.find_all('li')

pages not parsed: 1
https://auto.ru/cars/new/group/volkswagen/polo/21802427/22153111/1102576151-e9bb004d

In [12]:
# Show the collected dataset as this cell's output.
df

In [None]:
# Write the dataset to the same CSV file the scraping loop uses for its checkpoints.
df.to_csv('volk_polo_dataset.csv')