In [1]:
from bs4 import BeautifulSoup as bs
import requests
from IPython.core.debugger import set_trace

import ssl

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from requests.packages.urllib3.util import ssl_

In [2]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably the project root that holds the local ``marketplaces`` package).
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)
    
from marketplaces.avito import Marketplace

In [33]:
class Avito(Marketplace):
    """Field extractors for car offers on an Avito search-results page.

    Every ``get_*`` method receives one offer element (one item of the list
    returned by ``get_offers_on_page``) and extracts a single field from it.
    They are presumably driven by ``Marketplace.parse_offer``, which is
    defined outside this notebook.

    Several fields share one HTML block (name/year in the title; mileage,
    modification, body, drive and engine type in the parameter line).  Each
    getter parses that block from the offer it is given, so no getter depends
    on another one having been called first.  The parsed pieces are also
    stored on ``self.title``, ``self.car_params`` and
    ``self.modification_info`` for inspection.
    """

    def __init__(self, marketplace_name, url_to_parse):
        """Forward ``marketplace_name`` and ``url_to_parse`` to ``Marketplace``."""
        super().__init__(marketplace_name, url_to_parse)

    def get_offers_on_page(self, page):
        """Return every element of ``page`` marked up as a schema.org Product."""
        return page.select('*[itemtype="http://schema.org/Product"]')

    def get_price(self, offer):
        """Return the offer's price without discount as a float.

        Two layouts are handled:
        1. ``price-text`` (minimum price) together with ``price-noaccent``
           (price without discount) - the ``price-noaccent`` value is used;
        2. ``price-text`` only - it already is the price without discount.

        Raises ``AttributeError`` if neither element exists and ``ValueError``
        if the remaining text is not a number.
        """
        full_price = offer.select_one('span[class*=price-noaccent]')
        if full_price is not None:
            text = full_price.get_text().replace('\xa0', '')
            return float(text.replace(' ₽ без скидки', ''))
        text = offer.select_one('[class*=price-text]').get_text().replace('\xa0', '')
        return float(text[:-1])  # drop the trailing currency sign

    def _parse_title(self, offer):
        """Split the offer title on ', ' and cache the parts on ``self.title``."""
        self.title = offer.select_one('div[class*=title]').get_text().split(', ')
        return self.title

    def get_name(self, offer):
        """Return the car name: the title text before the first ', '."""
        return self._parse_title(offer)[0]

    def get_year(self, offer):
        """Return the production year: the second ', '-separated title part."""
        return int(self._parse_title(offer)[1])

    def _parse_car_params(self, offer):
        """Return the last five ', '-separated entries of the parameter line.

        Expected order: mileage, modification, body type, drive type, engine
        type.  The list is also stored on ``self.car_params``.

        Side effect: every ``<br>`` inside ``offer`` is replaced by ', '.
        Repeating the call is harmless because no ``<br>`` is left afterwards.
        """
        params_block = offer.select_one('div[class*=autoParams]')

        # VIP offers are shown three in a row, so their extra info sits on an
        # upper line separated from the main params by <br>.  Regular offers
        # have a line of their own with the extra info separated by ', '.
        # Turning <br> into ', ' makes both layouts split the same way.
        for line_break in offer.find_all('br'):
            line_break.replace_with(', ')

        fields = params_block.get_text().replace('\xa0', '').split(', ')
        self.car_params = fields[-5:]
        return self.car_params

    def get_mileage(self, offer):
        """Return the mileage as a float, or 0 for a car flagged as new.

        An offer counts as new when its ``SnippetBar`` element contains the
        text 'Новый' (Russian for "new").
        """
        car_params = self._parse_car_params(offer)

        badges = offer.select_one('div[class*=SnippetBar]')
        if badges is not None and 'Новый' in badges.get_text():
            return 0
        # [:-2] strips the two-character unit suffix (presumably 'км').
        return float(car_params[-5][:-2])

    def _parse_modification(self, offer):
        """Split the modification entry on spaces; cache on ``self.modification_info``."""
        self.modification_info = self._parse_car_params(offer)[-4].split(' ')
        return self.modification_info

    def get_engine_capacity(self, offer):
        """Return the engine capacity: the first token of the modification entry."""
        return float(self._parse_modification(offer)[0])

    def get_horsepower(self, offer):
        """Return the horsepower taken from the third modification token."""
        # [1:-5] removes the opening bracket and the trailing 'л.с.)'.
        return float(self._parse_modification(offer)[2][1:-5])

    def get_body_type(self, offer):
        """Return the body type (third-from-last parameter entry)."""
        return self._parse_car_params(offer)[-3]

    def get_drive_type(self, offer):
        """Return the drive type (second-from-last parameter entry)."""
        return self._parse_car_params(offer)[-2]

    def get_engine_type(self, offer):
        """Return the engine type (last parameter entry)."""
        return self._parse_car_params(offer)[-1]

    def get_transmission(self, offer):
        """Transmission is not extracted for this marketplace; always ``None``."""
        return None

    def get_offer_location(self, offer):
        """Return the text of the offer's ``geo`` block."""
        return offer.select_one('div[class*=geo]').get_text()

    def is_offer_vip(self, offer):
        """Return True when the first class of the offer's parent contains 'items-vip'.

        A parent without a ``class`` attribute yields False instead of raising
        ``KeyError``.
        """
        parent_classes = offer.parent.get('class') or []
        return bool(parent_classes) and 'items-vip' in parent_classes[0]

    def get_offer_url(self, offer):
        """Return the absolute URL of the offer.

        The link's ``href`` may be root-relative ('/city/...'); its leading
        slashes are stripped so the result never contains 'avito.ru//'.
        """
        href = offer.select_one('a[data-marker=item-title]')['href']
        return 'https://avito.ru/' + href.lstrip('/')

In [34]:
# No URL is supplied: the listing page is downloaded by hand further down and
# handed to this parser directly.
avito = Avito(marketplace_name='Avito', url_to_parse=None)

In [5]:
import ssl
import requests

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from requests.packages.urllib3.util import ssl_

# Cipher suites offered by TlsAdapter, joined into the colon-separated string
# that is passed as ``ciphers`` when the SSL context is created.
CIPHERS = ":".join((
    "ECDHE-RSA-AES256-GCM-SHA384",
    "ECDHE-ECDSA-AES256-GCM-SHA384",
    "ECDHE-RSA-AES256-SHA384",
    "ECDHE-ECDSA-AES256-SHA384",
    "ECDHE-RSA-AES128-GCM-SHA256",
    "ECDHE-RSA-AES128-SHA256",
    "AES256-SHA",
))

class TlsAdapter(HTTPAdapter):
    """HTTP adapter whose connection pool uses a custom SSL context.

    The context is limited to the module-level ``CIPHERS`` string, requires a
    verified certificate and carries the ``ssl`` option flags supplied at
    construction time.
    """

    def __init__(self, ssl_options=0, **kwargs):
        """Store the SSL option flags and initialise the base adapter.

        ssl_options: bitmask passed as ``options`` when the context is built.
        kwargs: forwarded unchanged to ``HTTPAdapter.__init__``.
        """
        # Assigned before delegating; init_poolmanager reads it, and the base
        # constructor presumably triggers init_poolmanager - keep this order.
        self.ssl_options = ssl_options
        super().__init__(**kwargs)

    def init_poolmanager(self, *pool_args, **pool_kwargs):
        """Create ``self.poolmanager`` with the restricted SSL context attached."""
        tls_context = ssl_.create_urllib3_context(
            ciphers=CIPHERS,
            cert_reqs=ssl.CERT_REQUIRED,
            options=self.ssl_options,
        )
        self.poolmanager = PoolManager(
            *pool_args,
            ssl_context=tls_context,
            **pool_kwargs
        )

# One shared session whose HTTPS traffic goes through TlsAdapter with
# TLS 1.0 and TLS 1.1 disabled.  ``requests.Session()`` replaces the legacy
# ``requests.session()`` factory, which is kept only for backwards compatibility.
session = requests.Session()
adapter = TlsAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
session.mount("https://", adapter)

In [27]:
# Download the car listings page.  requests has no default timeout, so one is
# set explicitly (seconds) to keep a stalled connection from hanging the kernel.
response = session.get('https://www.avito.ru/novosibirsk/avtomobili?radius=200', timeout=30)

In [28]:
# Display the HTTP status of the download so a failed or blocked request is
# noticed before any parsing is attempted.
response.status_code

200

In [29]:
# Build the HTML tree from the raw response bytes with the stdlib-backed parser.
page = bs(markup=response.content, features='html.parser')

In [30]:
# Collect the offer elements found on the downloaded page, then display how
# many were found as a quick sanity check.
offers = avito.get_offers_on_page(page)
len(offers)

56

In [37]:
# Run the parser over every offer; for each one that yields a falsy result,
# print its index so it can be looked up in the log files.
for i, offer in enumerate(offers):
    car_info = avito.parse_offer(offer)
    if car_info:
        continue
    print(f'check log files {i}')