In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Download the autonet.ru '/auto/ttx' index page; the brand links are read from it below.
page = requests.get('http://www.autonet.ru/auto/ttx')

In [3]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(page.text, 'lxml')

In [4]:
# Locate the 'brands-block' div and take the href of the first link in each of its <li> items.
brands = soup.find('div', class_='brands-block')
brands_links = [item.find('a').get('href') for item in brands.find_all('li')]

In [5]:
# Visit every brand page and gather absolute URLs of the models listed
# in its 'filter-models' div (one link per <li>).
models_links = []

for brand_href in brands_links:
    brand_response = requests.get(f'http://www.autonet.ru{brand_href}')
    models_block = BeautifulSoup(brand_response.text, 'lxml').find('div', 'filter-models')

    models_links.extend(
        f'http://www.autonet.ru{item.find("a").get("href")}'
        for item in models_block.find_all('li')
    )

In [6]:
# One row per scraped model URL, ordered by URL and re-indexed from 0.
models = (
    pd.DataFrame(models_links, columns=['url'])
    .sort_values(by='url')
    .reset_index(drop=True)
)

In [7]:
# Save the URL table. index=False is not passed, so the DataFrame index is
# written to the file as an extra leading column.
models.to_csv('../data/car_models_urls.csv')

In [8]:
# The last two path segments of each URL are the brand and the model slug.
url_parts = models['url'].apply(lambda link: link.split('/'))
models['brand'] = url_parts.apply(lambda parts: parts[-2])
models['model'] = url_parts.apply(lambda parts: parts[-1])

In [9]:
# Show the URL table together with the derived brand/model columns.
models

Unnamed: 0,url,brand,model
0,http://www.autonet.ru/auto/ttx/ac_cars/ace,ac_cars,ace
1,http://www.autonet.ru/auto/ttx/ac_cars/aceca,ac_cars,aceca
2,http://www.autonet.ru/auto/ttx/ac_cars/cobra,ac_cars,cobra
3,http://www.autonet.ru/auto/ttx/ac_cars/mamba,ac_cars,mamba
4,http://www.autonet.ru/auto/ttx/acura/cl,acura,cl
...,...,...,...
1815,http://www.autonet.ru/auto/ttx/zotye/z300,zotye,z300
1816,http://www.autonet.ru/auto/ttx/zx/admiral,zx,admiral
1817,http://www.autonet.ru/auto/ttx/zx/changling,zx,changling
1818,http://www.autonet.ru/auto/ttx/zx/grand_tiger,zx,grand_tiger


In [11]:
# Load the training table from Stepik; the cells below read its
# 'model', 'car_id' and 'year_to_start' columns.
df = pd.read_csv('https://stepik.org/media/attachments/lesson/866758/quickstart_train.csv')

In [13]:
# NOTE(review): not referenced by any code visible in this notebook; kept in
# case another cell relies on it.
special_models = ['bmw 320i', 'mercedes-benz e200', 'mini cooper', 'smart coupe', 'vw polo', 'vw polo vi', 'vw tiguan']

# Common prefix of every model page: <prefix>/<brand>/<model>.
AUTONET_TTX_URL = 'http://www.autonet.ru/auto/ttx'

# Dataset models for which no URL is generated (NaN is stored instead).
# Names are compared after stripping surrounding whitespace, so both
# 'Volkswagen ID.4' and 'Volkswagen ID.4 ' are matched.
MODELS_WITHOUT_PAGE = {'Tesla Model 3', 'Volkswagen ID.4', 'MINI CooperSE'}

# Second word of the dataset model name -> model slug used in the URL,
# for the cases where the two differ.
MODEL_SLUG_OVERRIDES = {
    'e200': 'e',
    '320i': '3_series',
    'cooper': 'hatch',
    'coupe': 'fortwo',
}


def build_model_url(model):
    """Build the autonet.ru URL for one dataset model name.

    Parameters
    ----------
    model : str
        Model name as stored in ``df['model']``, e.g. ``'Kia Rio X-line'``.
        Only the first two whitespace-separated words (lower-cased) are used.

    Returns
    -------
    str or float
        The URL, or ``np.nan`` when the model is listed in
        ``MODELS_WITHOUT_PAGE``, is not a string (e.g. a missing value), or
        has fewer than two words (the URL needs both a brand and a model).
    """
    if not isinstance(model, str) or model.strip() in MODELS_WITHOUT_PAGE:
        return np.nan

    words = model.lower().split()
    if len(words) < 2:
        # Indexing words[1] here would raise IndexError.
        return np.nan

    brand, model_word = words[0], words[1]

    # The 'vw' abbreviation is expanded first; slug overrides are not applied
    # to it, matching the order of the original checks.
    if brand == 'vw':
        return f'{AUTONET_TTX_URL}/volkswagen/{model_word}'

    return f'{AUTONET_TTX_URL}/{brand}/{MODEL_SLUG_OVERRIDES.get(model_word, model_word)}'


# One URL (or NaN) per row of df, in row order.
url = [build_model_url(model) for model in df['model']]

In [14]:
# Attach the generated URLs; `url` was built by iterating df['model'], so the order matches the rows.
df['url'] = url

In [15]:
# Keep only the car identifier, the model name and the generated URL.
urls_car_df = df[['car_id', 'model', 'url']]

In [18]:
# Save the car -> URL mapping. index=False is not passed, so the DataFrame
# index is written to the file as an extra leading column.
urls_car_df.to_csv('../data/cars_urls.csv')

In [19]:
# Show the car -> URL mapping.
urls_car_df

Unnamed: 0,car_id,model,url
0,y13744087j,Kia Rio X-line,http://www.autonet.ru/auto/ttx/kia/rio
1,O41613818T,VW Polo VI,http://www.autonet.ru/auto/ttx/volkswagen/polo
2,d-2109686j,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero
3,u29695600e,Mercedes-Benz GLC,http://www.autonet.ru/auto/ttx/mercedes-benz/glc
4,N-8915870N,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero
...,...,...,...
2332,j21246192N,Smart ForFour,http://www.autonet.ru/auto/ttx/smart/forfour
2333,h-1554287F,Audi A4,http://www.autonet.ru/auto/ttx/audi/a4
2334,A15262612g,Kia Rio,http://www.autonet.ru/auto/ttx/kia/rio
2335,W-2514493U,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero


In [77]:
# car_id -> list of (name, release_year, carcass, doors, volume, power) tuples.
models_data = {}

# URL -> parsed modification tuples. Many cars share one model page, so each
# page is downloaded and parsed only once.
_model_rows_cache = {}

# Placeholder stored for cars with no URL or no suitable modification.
_NO_CAR_INFO = (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _parse_modification_row(mod):
    """Parse one <tr> of a 'mod-list-table'; return a 6-tuple or None if the row is unusable."""
    try:
        name = mod.find('td', class_='mod').text.strip('\n')
        carcass_inf = mod.find('td', class_='carcass').text.split()
        carcass = carcass_inf[0]
        # Door count is the second character of the second token of the carcass cell.
        doors = int(carcass_inf[1][1])
        volume = int(mod.find('td', class_='volume').text.split()[0])
        power = int(mod.find('td', class_='power').text.split()[0])
        # The text before the first '-' in the 'edition' cell is split on '.'
        # and its last piece is taken as the release year.
        release_year = int(mod.find('td', class_='edition').text.split('-')[0].split('.')[-1])
    except (AttributeError, IndexError, ValueError):
        # Missing cell (find() returned None), too few tokens, or a
        # non-numeric value: skip the row. Other exceptions propagate.
        return None

    # Rows with an empty or zero field are discarded.
    if carcass and doors and volume and power and release_year:
        return (name, release_year, carcass, doors, volume, power)
    return None


def _fetch_model_rows(model_url):
    """Return all parseable modification tuples from a model page, using the per-URL cache."""
    if model_url not in _model_rows_cache:
        model_page = requests.get(model_url, timeout=_REQUEST_TIMEOUT)
        model_soup = BeautifulSoup(model_page.text, 'lxml')

        rows = []
        for mod_list in model_soup.find_all('table', class_='mod-list-table'):
            # The first <tr> of every table is skipped.
            for mod in mod_list.find_all('tr')[1:]:
                parsed = _parse_modification_row(mod)
                if parsed is not None:
                    rows.append(parsed)

        _model_rows_cache[model_url] = rows

    return _model_rows_cache[model_url]


def get_car_info(car):
    """Collect the modifications of a car's model released no later than the car's start year.

    The result is stored in the module-level ``models_data`` dict under
    ``car['car_id']``; nothing is returned.

    Parameters
    ----------
    car : mapping (e.g. a DataFrame row)
        Must provide ``'car_id'``, ``'url'`` and ``'year_to_start'``.
        When ``'url'`` is null no request is made.

    Notes
    -----
    If the URL is null, or no modification has
    ``release_year <= car['year_to_start']``, a single all-NaN tuple is
    stored so that every car has at least one entry.
    """
    suitable = []

    if pd.notnull(car['url']):
        suitable = [
            row for row in _fetch_model_rows(car['url'])
            if row[1] <= car['year_to_start']  # row[1] is release_year
        ]

    models_data[car['car_id']] = suitable or [_NO_CAR_INFO]

In [78]:
def pass_car_info(car):
    """Return the first modification after sorting a car's parsed rows.

    Parameters
    ----------
    car : list of tuple
        Tuples of (name, release_year, carcass, doors, volume, power).

    Returns
    -------
    pandas.Series
        The row that comes first when sorting by all six fields in that order.
    """
    fields = ['name', 'release_year', 'carcass', 'doors', 'volume', 'power']

    modifications = pd.DataFrame(car, columns=fields)
    ordered = modifications.sort_values(by=fields).reset_index(drop=True)

    return ordered.iloc[0]

In [79]:
# Fill models_data for every row of df; the trailing ';' hides the returned Series.
df.apply(get_car_info, axis=1);

In [68]:
# Spot check: name of the modification selected for one hard-coded car_id.
pass_car_info(models_data['R22777636m'])['name']

'KIA Rio 1.1 CDRi'

In [80]:
# Copy the selected modification's fields onto the df rows with the matching car_id.
feature_names = ('name', 'release_year', 'carcass', 'doors', 'volume', 'power')

for car_id, modifications in models_data.items():
    selected = pass_car_info(modifications)
    same_car = df['car_id'] == car_id

    for feature in feature_names:
        df.loc[same_car, feature] = selected[feature]

In [92]:
# Trim surrounding whitespace from modification names. Only real strings are
# touched: passing a missing value through str() would replace NaN with the
# literal text 'nan', so non-string values are kept as they are.
df['name'] = df['name'].apply(lambda x: x.strip() if isinstance(x, str) else x)

In [93]:
# Save the identifying columns together with the scraped features, without the index.
result_columns = [
    'car_id', 'model', 'url', 'year_to_start',
    'name', 'release_year', 'carcass', 'doors', 'volume', 'power',
]
df[result_columns].to_csv('../data/cars_models_parsing_results.csv', index=False)

In [82]:
# Save the full table (all original columns plus the scraped features), without the index.
# NOTE(review): this cell's execution count (82) is lower than that of the
# name-stripping cell (92) — confirm the intended run order.
df.to_csv('../data/car_parsing_mean.csv', index=False)

In [94]:
# Show the identifying columns next to the scraped features.
df[['car_id', 'model', 'url', 'year_to_start', 'name', 'release_year', 'carcass', 'doors', 'volume', 'power']]

Unnamed: 0,car_id,model,url,year_to_start,name,release_year,carcass,doors,volume,power
0,y13744087j,Kia Rio X-line,http://www.autonet.ru/auto/ttx/kia/rio,2015,KIA Rio 1.1 CDRi,2012.0,хэтчбек,3.0,1120.0,75.0
1,O41613818T,VW Polo VI,http://www.autonet.ru/auto/ttx/volkswagen/polo,2015,Volkswagen Cross Polo 1.2,2005.0,хэтчбек,5.0,1198.0,64.0
2,d-2109686j,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero,2012,Renault Sandero 1.4,2009.0,хэтчбек,5.0,1390.0,75.0
3,u29695600e,Mercedes-Benz GLC,http://www.autonet.ru/auto/ttx/mercedes-benz/glc,2011,,,,,,
4,N-8915870N,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero,2012,Renault Sandero 1.4,2009.0,хэтчбек,5.0,1390.0,75.0
...,...,...,...,...,...,...,...,...,...,...
2332,j21246192N,Smart ForFour,http://www.autonet.ru/auto/ttx/smart/forfour,2017,Smart Brabus Forfour 0.9,2015.0,хэтчбек,4.0,898.0,109.0
2333,h-1554287F,Audi A4,http://www.autonet.ru/auto/ttx/audi/a4,2016,Audi A4 1.4 Avant TFSI,2015.0,универсал,5.0,1395.0,150.0
2334,A15262612g,Kia Rio,http://www.autonet.ru/auto/ttx/kia/rio,2015,KIA Rio 1.1 CDRi,2012.0,хэтчбек,3.0,1120.0,75.0
2335,W-2514493U,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero,2014,Renault Sandero 1.2 75 л.с.,2014.0,хэтчбек,5.0,1179.0,75.0
