In [23]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

from tqdm import tqdm

import re
import time
import json
import random
import os
import glob

In [2]:
# Let rendered DataFrames show up to 100 rows and 100 columns before truncating.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [24]:
# Date stamp (YYYYMMDD) embedded in every output file name.
today = time.strftime("%Y%m%d")

# Empty accumulators: one for listing-page rows, one for catalog-page rows.
valid_data, catalog_data = pd.DataFrame(), pd.DataFrame()

In [4]:
# Start a Chrome session; webdriver_manager supplies the path of a matching
# chromedriver binary (see the log output below this cell).
options = webdriver.ChromeOptions()
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
There is no [win32] chromedriver for browser 99.0.4844 in cache
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\docke\.wdm\drivers\chromedriver\win32\99.0.4844.51]


In [75]:
# Regions to scrape; commented-out entries are skipped.
regions = [
    # 'leningradskaya_oblast',
    'moskovskaya_oblast',
]

# Brands to scrape; commented-out entries are skipped.
models = [
    # 'skoda',
    # 'audi',
    # 'honda',
    # 'volvo',
    # 'bmw',
    # 'nissan',
    # 'infiniti',
    # 'mercedes',
    # 'toyota',
    # 'lexus',
    # 'volkswagen',
    # 'kia',
    'hyundai',
    # 'suzuki',
    'mazda',
    'renault',
    # 'ford',
    # 'haval',
]

## Function block

In [7]:
def get_urls_for_model_in_region(model: str, region: str, max_pages: int = 99) -> list:
    '''
    Collect the listing links found on the paginated used-car search pages.

    Pages https://auto.ru/{region}/cars/{model}/used/?page=N are opened one by
    one, starting at N = 1, until a page contains no listing links or
    `max_pages` pages have been read.

    :param model: value substituted into the `{model}` part of the URL
    :param region: value substituted into the `{region}` part of the URL
    :param max_pages: upper bound on the number of pages to open (default 99)
    :return: list of `href` values of all listing title links, in page order
    '''
    result_list = []
    pages_parsed = 0

    for page_num in range(1, max_pages + 1):
        driver.get(f'https://auto.ru/{region}/cars/{model}/used/?page={page_num}')

        # find_elements returns an empty list (instead of raising) when nothing
        # matches, so a single lookup both fetches the links and detects the end.
        page_links = driver.find_elements(By.CLASS_NAME, 'ListingItemTitle__link')
        if not page_links:
            break

        result_list.extend(link.get_attribute('href') for link in page_links)
        pages_parsed = page_num
        print(f'Parsing {page_num} page for {model} in {region}', end='\r')

    # Always report the total, including when the page limit stopped the loop.
    print()
    print(f'There are {pages_parsed} pages by {model}')

    return result_list

In [57]:
def _card_row_value(class_name: str) -> str:
    '''
    Return the second text line of the card row with the given CSS class
    '''
    return driver.find_element(By.CLASS_NAME, class_name).text.split('\n')[1]


def get_car_info(url: str) -> pd.DataFrame:
    '''
    Open a car listing page and scrape its fields into a one-row DataFrame.

    :param url: address of the listing page
    :return: DataFrame with a single row; the columns are the keys of
        `data_dict` below
    :raises NoSuchElementException: when a mandatory element is missing
    :raises IndexError: when a row or the breadcrumbs have fewer parts than expected
    :raises ValueError: when a numeric field cannot be converted
    '''
    driver.get(url)

    year = int(_card_row_value('CardInfoRow_year'))

    breadcrumbs = driver.find_elements(By.CLASS_NAME, 'CardBreadcrumbs__itemText')
    brand = breadcrumbs[3].text
    model = breadcrumbs[4].text

    bodytype = _card_row_value('CardInfoRow_bodytype')
    kmage = int(re.sub(r'\D', '', driver.find_element(By.CLASS_NAME, 'CardInfoRow_kmAge').text))
    color = _card_row_value('CardInfoRow_color')

    # The engine row is split on '/' into displacement, power and fuel type.
    engine = driver.find_element(By.CLASS_NAME, 'CardInfoRow_engine').text.split('/')

    try:
        # The dot is escaped so that only a decimal number counts as a
        # displacement; unescaped, it matched any integer of 3+ digits.
        engineDisplacement = float(re.findall(r'\d+\.\d+', engine[0])[0])
        enginePower = int(re.findall(r'\d+', engine[1])[0])
        fuelType = engine[2]
    except IndexError:
        # Row does not have the three-part shape: keep zeros for the numbers
        # and take the last text fragment as fuel type, so the column holds a
        # string rather than the whole list.
        engineDisplacement = 0
        enginePower = 0
        fuelType = engine[-1].split('\n')[-1]

    super_gen = json.loads(
        driver.find_element(By.ID, 'sale-data-attributes').get_attribute('data-bem')
    )['sale-data-attributes']

    vehicleTransmission = _card_row_value('CardInfoRow_transmission')
    drive = _card_row_value('CardInfoRow_drive')
    wheel = _card_row_value('CardInfoRow_wheel')
    state = _card_row_value('CardInfoRow_state')
    owner = _card_row_value('CardInfoRow_ownersCount')
    pts = _card_row_value('CardInfoRow_pts')
    customs = _card_row_value('CardInfoRow_customs')

    # Optional blocks: absent on some pages, stored as None.
    try:
        owningTime = _card_row_value('CardInfoRow_owningTime')
    except NoSuchElementException:
        owningTime = None

    try:
        description = driver.find_element(By.CLASS_NAME, 'CardDescriptionHTML').text
    except NoSuchElementException:
        description = None

    sell_id = int(re.findall(r'\d+', driver.find_element(By.CLASS_NAME, 'CardHead__id').text)[0])
    price = int(re.sub(r'\D', '', driver.find_element(By.CLASS_NAME, 'OfferPriceCaption__price').text))

    # Each equipment group is a bullet-separated list; the text before the
    # first bullet is dropped.
    complect_list = []
    for group in driver.find_elements(By.CLASS_NAME, 'ComplectationGroupsDesktop__itemList'):
        complect_list.extend(group.text.replace('\n', '').split('•')[1:])

    try:
        catalog_url = driver.find_element(By.CLASS_NAME, 'CardCatalogLink').get_attribute('href')
    except NoSuchElementException:
        catalog_url = 'No catalog link'

    data_dict = {
        'brand': brand, 'model': model, 'year': year, 'bodytype': bodytype, 'kmage': kmage,
        'color': color, 'engineDisplacement': engineDisplacement, 'enginePower': enginePower,
        'fuelType': fuelType, 'super_gen': super_gen, 'vehicleTransmission': vehicleTransmission,
        'drive': drive, 'wheel': wheel, 'state': state, 'owner': owner, 'pts': pts, 'customs': customs,
        'owningTime': owningTime, 'description': description, 'sell_id': sell_id, 'price': price,
        'car_url': url, 'catalog_url': catalog_url, 'equipment_dict': complect_list
    }

    return pd.DataFrame([data_dict])

In [77]:
def _parse_fuel_rate(specs: dict):
    '''
    Pick the fuel figure out of the specification rows.

    For engine type 'электро' the electric range row is returned (0 when the
    row is absent). Otherwise the consumption rows are tried in a fixed order
    and the first one present is used; None is returned when none is present.
    '''
    if specs['Тип двигателя'] == 'электро':
        return float(specs.get('Запас хода на электричестве, км', 0))

    city_highway_mixed = specs.get('Расход топлива, л город/трасса/смешанный')
    if city_highway_mixed is not None:
        return float(city_highway_mixed.split('/')[2])

    mixed = specs.get('Расход топлива, л смешанный')
    if mixed is not None:
        return float(mixed)

    city_mixed = specs.get('Расход топлива, л город/смешанный')
    if city_mixed is not None:
        return float(city_mixed.split('/')[1])

    city_highway = specs.get('Расход топлива, л город/трасса')
    if city_highway is not None:
        # No mixed figure available: use the mean of the two values.
        return sum(map(float, city_highway.split('/'))) / 2

    return None


def get_dicts_from_catalog(url: str) -> pd.DataFrame:
    '''
    Scrape a catalog specifications page and its matching equipment page.

    :param url: address of the specifications page; the equipment page address
        is derived by replacing 'specifications' with 'equipment'
    :return: one-row DataFrame with columns `catalog_url`, `super_gen_2`
        (dict of translated specifications), `super_gen_3` (parsed `data-bem`
        attribute) and `complectation_dict` (list of equipment item texts)
    :raises KeyError: when a mandatory specification row is missing or holds
        a value that has no entry in the translation tables
    :raises IndexError: when a power row has fewer numbers than expected
    :raises NoSuchElementException: when the `sale-data-attributes` element is missing
    '''
    driver.get(url)

    # Each 'list-values' block renders as alternating name / value lines.
    car_info_dict_ru = {}
    for block in driver.find_elements(By.CLASS_NAME, 'list-values'):
        lines = block.text.split('\n')
        car_info_dict_ru.update(zip(lines[::2], lines[1::2]))

    engine_type_dict = {'бензин': 'GASOLINE', 'дизель': 'DIESEL',
                        'гибрид': 'HYBRID', 'электро': 'ELECTRO', 'газ': 'LPG',
                        'СУГ': 'LPG'}
    gear_type_dict = {'передний': 'FORWARD_CONTROL', 'полный': 'ALL_WHEEL_DRIVE', 'задний': 'REAR_DRIVE'}
    transmission_dict = {'автомат': 'AUTOMATIC', 'робот': 'ROBOT',
                         'механика': 'MECHANICAL', 'вариатор': 'VARIATOR'}

    fuel_rate = _parse_fuel_rate(car_info_dict_ru)

    try:
        clearance_min = min(map(int, re.findall(r'\d+', car_info_dict_ru['Клиренс'])))
    except (KeyError, ValueError):
        # Row missing, or present without any number in it.
        clearance_min = None

    car_info_dict_en = {
        'engine_type': engine_type_dict[car_info_dict_ru['Тип двигателя']],
        'gear_type': gear_type_dict[car_info_dict_ru['Привод']],
        'transmission': transmission_dict[car_info_dict_ru['Коробка']],
        'power': int(re.findall(r'\d+', car_info_dict_ru['Мощность'])[0]),
        # Second number of the row is stored as the kW figure.
        'power_kvt': int(re.findall(r'\d+', car_info_dict_ru['Максимальная мощность, л.с./кВт при об/мин'])[1]),
        'acceleration': car_info_dict_ru.get('Разгон до 100 км/ч, с'),
        'clearance_min': clearance_min,
        'fuel_rate': fuel_rate
    }

    driver.get(url.replace('specifications', 'equipment'))

    equipment_list = [
        item.text for item in driver.find_elements(By.CLASS_NAME, 'catalog__package-list-i')
    ]

    # Only this attribute payload is returned. The former lookups of
    # 'search-form-v2-mmm' and 'catalog__section' were never used and could
    # only discard an otherwise complete record by raising.
    attrib_data = json.loads(
        driver.find_element(By.CLASS_NAME, 'sale-data-attributes').get_attribute('data-bem')
    )

    catalog_data_dict = {
        'catalog_url': url, 'super_gen_2': car_info_dict_en,
        'super_gen_3': attrib_data, 'complectation_dict': equipment_list
    }

    return pd.DataFrame([catalog_data_dict])

## Gathering listing URLs for each model and region, then scraping the main info from each car page

In [44]:
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs('data/car_data', exist_ok=True)

for region in regions:
    for model in models:
        model_urls = get_urls_for_model_in_region(model, region)

        # Collect the one-row frames and concatenate once per model instead of
        # re-copying a growing DataFrame on every iteration.
        car_frames = []
        for url in tqdm(model_urls):
            # Random sub-second pause between page loads.
            time.sleep(random.random())
            try:
                car_frames.append(get_car_info(url))
            except Exception as exc:
                # Skip pages that fail to parse, but say which one and why.
                # `Exception` (not a bare except) keeps Ctrl+C / kernel
                # interrupt working during a long scrape.
                tqdm.write(f'Skipped {url}: {type(exc).__name__}')

        model_data = pd.concat(car_frames) if car_frames else pd.DataFrame()
        model_data.to_csv(f'data/car_data/{today}_{model}_{region}_car_data.csv')

Parsing 4 page for haval in moskovskaya_oblast
There are 4 pages by haval


 62%|██████████████████████████████████████████████████                               | 73/118 [05:05<02:22,  3.18s/it]

Error 404

 70%|████████████████████████████████████████████████████████▉                        | 83/118 [05:43<01:51,  3.18s/it]

Error 404

100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [07:59<00:00,  4.06s/it]


## Getting additional information from catalog (links from main DF)

In [79]:
car_csvs = sorted(c for c in os.listdir("data/car_data/") if c.endswith(".csv"))

csv_frames = []
for c in car_csvs:
    print("Reading", c, end=" ")
    csv_frames.append(pd.read_csv(f"data/car_data/{c}"))
    print("done")

# 'Unnamed: 0' is the index column written by to_csv; drop it and renumber rows.
df = pd.concat(csv_frames).drop(columns="Unnamed: 0").reset_index(drop=True)

os.makedirs('data/catalog_car_data', exist_ok=True)

# Lower-case the brand once and use the same key both for iterating and for
# filtering. Comparing lower-cased values against the raw unique() values
# selected no rows for capitalised brand names.
brand_key = df['brand'].str.lower()

for single_brand in brand_key.dropna().unique():
    print(f'Gathering catalog data for {single_brand}')
    brand_urls = df.loc[brand_key == single_brand, 'catalog_url'].value_counts().index.tolist()

    catalog_frames = []
    for car_url in tqdm(brand_urls):
        if car_url == 'No catalog link':
            continue
        try:
            catalog_frames.append(get_dicts_from_catalog(car_url))
        except Exception as exc:
            # Skip pages that fail to parse, but say which one and why.
            tqdm.write(f'Skipped {car_url}: {type(exc).__name__}')

    brand_catalog = pd.concat(catalog_frames) if catalog_frames else pd.DataFrame()
    brand_catalog.to_csv(f'data/catalog_car_data/{today}_{single_brand}_catalog_car_data.csv')

Gathering catalog data for hyundai


100%|████████████████████████████████████████████████████████████████████████████████| 252/252 [15:11<00:00,  3.62s/it]


Gathering catalog data for mazda


100%|████████████████████████████████████████████████████████████████████████████████| 197/197 [11:33<00:00,  3.52s/it]


Gathering catalog data for renault


100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [12:08<00:00,  3.61s/it]


## Reading car info files and car catalog files for merging

In [80]:
# Paths are relative to the working directory, i.e. the same data/ folders the
# scraping cells write to, so the notebook is not tied to one machine.
# sorted() makes the row order independent of the file-system listing order.
cars_list = sorted(glob.glob('data/car_data/*.csv'))
cars_df = pd.concat(map(pd.read_csv, cars_list))

catalog_cars_list = sorted(glob.glob('data/catalog_car_data/*.csv'))
catalog_df = pd.concat(map(pd.read_csv, catalog_cars_list))

## Drop unused columns, reset indexes and de-duplicate catalog rows

In [81]:
# Remove the saved index column and renumber the rows of the listings table.
cars_df = cars_df.drop(columns=['Unnamed: 0']).reset_index(drop=True)

# Same for the catalog table, then keep the first row for every catalog_url.
catalog_df = (
    catalog_df
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['catalog_url'], ignore_index=True)
)

## Merging and writing to file

In [82]:
# Left join: every listing row is kept; catalog columns are added where catalog_url matches.
merged_df = cars_df.merge(catalog_df, how='left', on='catalog_url')

In [83]:
# Write the merged table as a zip-compressed pickle stamped with the run date.
pickle_path = f'data/{today}_msk_parsed_data.pkl'
merged_df.to_pickle(pickle_path, compression='zip')

In [84]:
# Three random rows, transposed so each field is on its own line.
merged_df.sample(n=3).transpose()

Unnamed: 0,126,1259,9376
brand,Hyundai,Hyundai,Renault
model,Accent,Sonata,Fluence
year,2006,2019,2014
bodytype,седан,седан,седан
kmage,104000,35000,82500
color,серый,чёрный,коричневый
engineDisplacement,1.5,2.5,1.6
enginePower,92,180,114
fuelType,Бензин,Бензин,Бензин
super_gen,"{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi..."
