In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Download the autonet.ru "/auto/ttx" page; the following cells parse it with
# BeautifulSoup and read the brand links from it.
# NOTE(review): no timeout and no HTTP status check are applied here.
page = requests.get('http://www.autonet.ru/auto/ttx')

In [3]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(page.text, 'lxml')

In [4]:
# Locate the <div class="brands-block"> element and collect the href of the
# first <a> inside each of its <li> entries.
brands = soup.find('div', class_='brands-block')
brands_links = [entry.find('a').get('href') for entry in brands.find_all('li')]

In [5]:
# Visit every brand page and gather the absolute URL of each model it lists.
models_links = []

for brand_link in brands_links:
    brand_page = requests.get(f'http://www.autonet.ru{brand_link}')
    brand_soup = BeautifulSoup(brand_page.text, 'lxml')
    # The models of a brand are the <li> entries of <div class="filter-models">.
    car_models = brand_soup.find('div', class_='filter-models')

    models_links.extend(
        f'http://www.autonet.ru{entry.find("a").get("href")}'
        for entry in car_models.find_all('li')
    )

In [6]:
# One-column table of model URLs, sorted by URL and re-indexed from 0.
models = (
    pd.DataFrame(models_links, columns=['url'])
    .sort_values(by='url')
    .reset_index(drop=True)
)

In [7]:
# Save the sorted model URLs to disk. At this point the frame only has the
# `url` column; pandas also writes the index as the first CSV column by default.
models.to_csv('../data/car_models_urls.csv')

In [8]:
# Split every URL on '/' once; the last two path segments are the brand and
# the model respectively.
url_parts = models['url'].map(lambda link: link.split('/'))
models['brand'] = url_parts.map(lambda parts: parts[-2])
models['model'] = url_parts.map(lambda parts: parts[-1])

In [9]:
# Preview the model table together with the derived brand/model columns.
models

Unnamed: 0,url,brand,model
0,http://www.autonet.ru/auto/ttx/ac_cars/ace,ac_cars,ace
1,http://www.autonet.ru/auto/ttx/ac_cars/aceca,ac_cars,aceca
2,http://www.autonet.ru/auto/ttx/ac_cars/cobra,ac_cars,cobra
3,http://www.autonet.ru/auto/ttx/ac_cars/mamba,ac_cars,mamba
4,http://www.autonet.ru/auto/ttx/acura/cl,acura,cl
...,...,...,...
1815,http://www.autonet.ru/auto/ttx/zotye/z300,zotye,z300
1816,http://www.autonet.ru/auto/ttx/zx/admiral,zx,admiral
1817,http://www.autonet.ru/auto/ttx/zx/changling,zx,changling
1818,http://www.autonet.ru/auto/ttx/zx/grand_tiger,zx,grand_tiger


In [10]:
# Load quickstart_train.csv straight from stepik.org; later cells read its
# `car_id`, `model` and `year_to_start` columns.
df = pd.read_csv('https://stepik.org/media/attachments/lesson/866758/quickstart_train.csv')

In [11]:
# Lower-cased names of the models that need special URL handling below.
# Not referenced in this cell; kept in case other cells rely on it.
special_models = ['bmw 320i', 'mercedes-benz e200', 'mini cooper', 'smart coupe', 'vw polo', 'vw polo vi', 'vw tiguan']

# Models that get no URL at all (NaN) — presumably because autonet.ru has no
# page for them; TODO confirm. Names are compared after stripping surrounding
# whitespace, so both 'Volkswagen ID.4 ' and 'Volkswagen ID.4' match.
models_without_url = {'Tesla Model 3', 'Volkswagen ID.4', 'MINI CooperSE'}

# Second word of the model name -> path segment to use in the URL instead.
model_slug_overrides = {
    'e200': 'e',
    '320i': '3_series',
    'cooper': 'hatch',
    'coupe': 'fortwo',
}


def build_model_url(model):
    """Build the autonet.ru URL for a model name from the dataset.

    The lower-cased name is split on whitespace: the first word is used as the
    brand segment and the second word as the model segment; any further words
    are ignored.

    Parameters
    ----------
    model : str
        Model name, e.g. 'Kia Rio X-line'.

    Returns
    -------
    str or float
        The URL, or NaN when the model is listed in ``models_without_url``,
        is missing, or has fewer than two words (no model segment available).
    """
    if pd.isnull(model) or model.strip() in models_without_url:
        return np.nan

    words = model.lower().split()
    if len(words) < 2:
        # Nothing to use as the model segment; the unguarded index would raise.
        return np.nan

    brand, slug = words[0], words[1]

    # 'vw' is an abbreviation in the dataset; the URL uses the full brand name.
    # The slug overrides are intentionally not applied to this brand.
    if brand == 'vw':
        return f'http://www.autonet.ru/auto/ttx/volkswagen/{slug}'

    slug = model_slug_overrides.get(slug, slug)
    return f'http://www.autonet.ru/auto/ttx/{brand}/{slug}'


# One entry per row of df['model'], in the same order.
url = [build_model_url(model) for model in df['model']]

In [12]:
# Attach the URLs built above as a new column; `url` holds one entry per
# element of df['model'], in the same order.
df['url'] = url

In [13]:
# Keep only the car identifier, the model name and the URL built for it.
urls_car_df = df[['car_id', 'model', 'url']]

In [14]:
# Preview the car_id / model / url table.
urls_car_df

Unnamed: 0,car_id,model,url
0,y13744087j,Kia Rio X-line,http://www.autonet.ru/auto/ttx/kia/rio
1,O41613818T,VW Polo VI,http://www.autonet.ru/auto/ttx/volkswagen/polo
2,d-2109686j,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero
3,u29695600e,Mercedes-Benz GLC,http://www.autonet.ru/auto/ttx/mercedes-benz/glc
4,N-8915870N,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero
...,...,...,...
2332,j21246192N,Smart ForFour,http://www.autonet.ru/auto/ttx/smart/forfour
2333,h-1554287F,Audi A4,http://www.autonet.ru/auto/ttx/audi/a4
2334,A15262612g,Kia Rio,http://www.autonet.ru/auto/ttx/kia/rio
2335,W-2514493U,Renault Sandero,http://www.autonet.ru/auto/ttx/renault/sandero


In [15]:
# Save the car_id / model / url table to disk; pandas also writes the index
# as the first CSV column by default.
urls_car_df.to_csv('../data/cars_urls.csv')

In [16]:
# Scraped modifications per car: car_id -> list of
# (name, release_year, carcass, doors, volume, power) tuples.
models_data = {}

# Parsed modification rows per model URL, so that cars sharing the same model
# page trigger a single HTTP request instead of one request per car.
_modifications_by_url = {}

# Seconds to wait for the site before giving up on a request.
REQUEST_TIMEOUT = 30

# Placeholder stored when no modification could be attached to a car.
_EMPTY_MODIFICATION = (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)


def _parse_modification_row(row):
    """Turn one <tr> of a modification table into a tuple, or None if unusable.

    Returns (name, release_year, carcass, doors, volume, power). A row is
    skipped (None) when a cell is missing, a number cannot be parsed, or any
    of carcass/doors/volume/power/release_year is falsy (empty or 0).
    """
    try:
        name = row.find('td', class_='mod').text.strip('\n')
        carcass_inf = row.find('td', class_='carcass').text.split()
        carcass = carcass_inf[0]
        # Second character of the second token — presumably the door count
        # written right after an opening bracket; TODO confirm against the site.
        doors = int(carcass_inf[1][1])
        volume = int(row.find('td', class_='volume').text.split()[0])
        power = int(row.find('td', class_='power').text.split()[0])
        # Text before the first '-', then the last '.'-separated piece —
        # presumably the start year of a production range; TODO confirm.
        release_year = int(row.find('td', class_='edition').text.split('-')[0].split('.')[-1])
    except (AttributeError, IndexError, ValueError):
        # Missing cell (find() returned None), too few tokens, or non-numeric text.
        return None

    if not (carcass and doors and volume and power and release_year):
        return None

    return (name, release_year, carcass, doors, volume, power)


def _fetch_modifications(url):
    """Download and parse all modification rows of a model page (cached per URL)."""
    if url not in _modifications_by_url:
        model_page = requests.get(url, timeout=REQUEST_TIMEOUT)
        model_soup = BeautifulSoup(model_page.text, 'lxml')

        rows = []
        for mod_list in model_soup.find_all('table', class_='mod-list-table'):
            # The first <tr> of each table is skipped (treated as a header).
            for mod in mod_list.find_all('tr')[1:]:
                parsed = _parse_modification_row(mod)
                if parsed is not None:
                    rows.append(parsed)

        _modifications_by_url[url] = rows

    return _modifications_by_url[url]


def get_car_info(car):
    """Collect the modifications available for one car into ``models_data``.

    Parameters
    ----------
    car : mapping (e.g. a DataFrame row)
        Must provide 'car_id', 'url' and 'year_to_start'.

    Side effects
    ------------
    Sets ``models_data[car['car_id']]`` to the list of modifications from the
    car's model page whose release year is not later than ``year_to_start``.
    If the URL is missing or nothing matches, the list holds a single all-NaN
    tuple instead.

    Returns
    -------
    None

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded (including a timeout after
        ``REQUEST_TIMEOUT`` seconds).
    """
    modifications = []

    if pd.notnull(car['url']):
        modifications = [
            mod for mod in _fetch_modifications(car['url'])
            if mod[1] <= car['year_to_start']  # mod[1] is release_year
        ]

    models_data[car['car_id']] = modifications or [_EMPTY_MODIFICATION]

In [17]:
def pass_car_info(car):
    """Pick a single modification for a car from the scraped ``models_data``.

    The modifications stored under ``models_data[car['car_id']]`` are sorted
    by name, release_year, carcass, doors, volume and power (in that order)
    and the first row of the sorted table is returned.

    Parameters
    ----------
    car : mapping (e.g. a DataFrame row)
        Must provide 'car_id'; ``get_car_info`` must already have been run for
        that id.

    Returns
    -------
    pandas.Series
        The first modification after sorting, indexed by the six column names.

    Raises
    ------
    KeyError
        If no entry exists in ``models_data`` for the car.
    """
    columns = ['name', 'release_year', 'carcass', 'doors', 'volume', 'power']

    # The original passed the car_id value itself to the DataFrame constructor
    # and discarded the result; the modification tuples live in models_data.
    car_model = (
        pd.DataFrame(models_data[car['car_id']], columns=columns)
        .sort_values(by=columns)
        .reset_index(drop=True)
    )
    return car_model.iloc[0]

In [18]:
# Run the scraper for every row; the results are stored in `models_data`,
# so the Series of None values returned here carries no information.
df.apply(get_car_info, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2332    None
2333    None
2334    None
2335    None
2336    None
Length: 2337, dtype: object

In [39]:
# Inspect the scraped modifications, keyed by car_id (this prints the whole dict).
models_data

{'y13744087j': [('KIA Rio 1.4 MPI', 2015, 'седан', 4, 1396, 107),
  ('KIA Rio 1.4 MPI', 2015, 'хэтчбек', 5, 1396, 107),
  ('KIA Rio 1.6 MPI', 2015, 'седан', 4, 1591, 123),
  ('KIA Rio 1.6 MPI', 2015, 'хэтчбек', 5, 1591, 123),
  ('KIA Rio 1.4 MPI', 2014, 'седан', 4, 1396, 107),
  ('KIA Rio 1.4 MPI', 2014, 'хэтчбек', 5, 1396, 107),
  ('KIA Rio 1.4 MPI', 2011, 'седан', 4, 1396, 107),
  ('KIA Rio 1.4 MPI', 2012, 'хэтчбек', 5, 1396, 107),
  ('KIA Rio 1.6 MPI', 2014, 'седан', 4, 1591, 123),
  ('KIA Rio 1.6 MPI', 2014, 'хэтчбек', 5, 1591, 123),
  ('KIA Rio 1.6 MPI', 2011, 'седан', 4, 1591, 123),
  ('KIA Rio 1.6 MPI', 2012, 'хэтчбек', 5, 1591, 123),
  ('KIA Rio 1.1 CDRi', 2012, 'хэтчбек', 3, 1120, 75),
  ('KIA Rio 1.1 CRDi', 2011, 'хэтчбек', 5, 1120, 75),
  ('KIA Rio 1.2 MPI', 2011, 'хэтчбек', 5, 1248, 87),
  ('KIA Rio 1.2 MPI', 2012, 'хэтчбек', 3, 1248, 87),
  ('KIA Rio 1.2 MPI', 2012, 'седан', 4, 1248, 87),
  ('KIA Rio 1.4 CDRi', 2012, 'хэтчбек', 3, 1396, 90),
  ('KIA Rio 1.4 CRDi', 2011, 'х

In [51]:
# Show the rows for 'Tesla Model 3', one of the models deliberately given a
# NaN url when the URLs were built.
df.loc[df['model'] == 'Tesla Model 3']

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq,url
7,O-2124190y,Tesla Model 3,premium,electro,3.9,2017,116872,2019,50.4,gear_stick,4.712356,9793288.0,0.1,95.890736,-8.939366,174,139,
99,n19871201N,Tesla Model 3,premium,electro,3.58,2016,89811,2018,99.17,another_bug,4.648448,10237560.0,0.1,195.243961,6.716249,174,147,
187,I13914989Z,Tesla Model 3,premium,electro,4.64,2016,98304,2019,103.75,engine_ignition,4.152241,12913000.0,0.1,195.321301,7.404675,174,150,
218,g-2283000q,Tesla Model 3,premium,electro,5.44,2015,75113,2020,49.28,gear_stick,4.413736,14364220.0,0.1,101.976813,-0.650539,174,141,
505,i-1106718P,Tesla Model 3,premium,electro,4.72,2015,75786,2017,120.33,engine_ignition,3.839828,16317380.0,0.1,197.938901,-12.254816,174,146,
685,I-2073229s,Tesla Model 3,premium,electro,5.1,2017,120231,2017,58.11,engine_fuel,4.083851,16251740.0,0.1,191.425146,-6.04526,174,144,
1120,N44849090k,Tesla Model 3,premium,electro,3.6,2015,86115,2022,58.5,electro_bug,4.186149,15782760.0,0.0,199.370103,3.385336,174,148,
1192,T20813001U,Tesla Model 3,premium,electro,3.98,2017,132784,2017,54.48,engine_fuel,3.951782,12560570.0,0.1,174.458376,-10.913769,174,138,
1245,O-2688441F,Tesla Model 3,premium,electro,4.82,2015,80038,2021,58.85,engine_fuel,4.556149,10274930.0,0.1,169.368375,4.348376,174,139,
1447,p22133368b,Tesla Model 3,premium,electro,3.06,2014,66621,2019,64.58,electro_bug,4.279713,9146520.0,0.0,180.831626,-5.15,174,151,
