In [55]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm

## Partie I

In [56]:
def extract_data(str_class):
    if str_class.startswith("auto_pv_detTophead1Txt3 flol"):
        return soup.find("h2", {"class": str_class}).text.strip()
    else:
        return soup.find("li", {"class": str_class}).find("span").text.strip()

In [57]:
url = "https://www.paruvendu.fr/a/voiture-occasion/renault/scenic-ii/1260682580A1KVVORESC2"
response = requests.get(url)
soup = BeautifulSoup(response.text)

### Partie manuelle

In [58]:
# Automatisable
version = extract_data("vers")
price = extract_data("px").replace(" €", "").replace(" ", "")
km = extract_data("kil").replace(" ", "").replace("\nkm", "")
energy = extract_data("en")
transmission = extract_data("vit")
power_cv = extract_data("puiss").replace("\nCV", "")
places = extract_data("por").replace("\nplaces", "")

# Manuel
doors = extract_data("carro").split(" ")[0]
year = list(list(soup.find("li", {"class":"ann"}))[3])[0].strip().split(" ")[1]
postal_code = extract_data("auto_pv_detTophead1Txt3 flol").split(" ")[1]

In [59]:
print(version, price, year, postal_code, km, energy, transmission, doors, power_cv, places, sep="\n")

Scenic 1.9 dCi 130 Exception
4000
2008
(63200)
173300
Diesel
Manuelle
4
8
5


### Partie automatisable

In [60]:
list_of_keys = ['version', 'price', 'km', 'energy', 'transmission', 'power_cv', 'places', 'doors', 'year', 'postal_code', 'emission_g_per_km', 'consumption_l_per_100_km']
list_of_str_class = ['vers', 'px', 'kil', 'en', 'vit', 'puiss', 'por', 'carro', 'ann', "auto_pv_detTophead1Txt3 flol", 'emiss', 'cons']

list_of_values = [extract_data(str_class).split("\n")[0].replace(" €", "") for str_class in list_of_str_class]

In [61]:
dict(zip(list_of_keys, list_of_values))

{'version': 'Scenic 1.9 dCi 130 Exception',
 'price': '4 000',
 'km': '173 300',
 'energy': 'Diesel',
 'transmission': 'Manuelle',
 'power_cv': '8',
 'places': '5',
 'doors': '4 portes avec hayon',
 'year': 'Avril 2008',
 'postal_code': 'Riom (63200)',
 'emission_g_per_km': '159',
 'consumption_l_per_100_km': '6.0'}

## Partie II

In [62]:
def extract_data(str_class):
    if str_class.startswith("auto_pv_detTophead1Txt3 flol"):
        return soup.find("h2", {"class": str_class}).text.strip()
    else:
        return soup.find("li", {"class": str_class}).find("span").text.strip()

In [63]:
url = "https://www.paruvendu.fr/a/voiture-occasion/ford/puma/1258498821A1KVVOFOPUM"
response = requests.get(url)
soup = BeautifulSoup(response.text)

In [64]:
list_of_keys = ['version', 'price', 'km', 'energy', 'transmission', 'power_cv', 'places', 'doors', 'year', 'postal_code', 'emission_g_per_km', 'consumption_l_per_100_km']
list_of_str_class = ['vers', 'px', 'kil', 'en', 'vit', 'puiss', 'por', 'carro', 'ann', "auto_pv_detTophead1Txt3 flol", 'emiss', 'cons']

In [65]:
list_of_values = []
for str_class in list_of_str_class:
    try:
        list_of_values.append(extract_data(str_class).split("\n")[0].replace(" €", ""))
    except:
        list_of_values.append(None)

In [66]:
dict(zip(list_of_keys, list_of_values))

{'version': None,
 'price': None,
 'km': None,
 'energy': None,
 'transmission': None,
 'power_cv': None,
 'places': None,
 'doors': None,
 'year': None,
 'postal_code': None,
 'emission_g_per_km': None,
 'consumption_l_per_100_km': None}

## Partie III

### Zone visible

In [67]:
def extract_data(str_class):
    if str_class.startswith("auto_pv_detTophead1Txt3 flol"):
        return soup.find("h2", {"class": str_class}).text.strip()
    else:
        return soup.find("li", {"class": str_class}).find("span").text.strip()

In [68]:
list_of_keys = ['version', 'price', 'km', 'energy', 'transmission', 'power_cv', 'places', 'doors', 'year', 'postal_code', 'emission_g_per_km', 'consumption_l_per_100_km']
list_of_str_class = ['vers', 'px', 'kil', 'en', 'vit', 'puiss', 'por', 'carro', 'ann', "auto_pv_detTophead1Txt3 flol", 'emiss', 'cons']

In [69]:
list_of_values = []
for str_class in list_of_str_class:
    try:
        list_of_values.append(extract_data(str_class).split("\n")[0].replace(" €", ""))
    except:
        list_of_values.append(None)

In [70]:
dict(zip(list_of_keys, list_of_values))

{'version': None,
 'price': None,
 'km': None,
 'energy': None,
 'transmission': None,
 'power_cv': None,
 'places': None,
 'doors': None,
 'year': None,
 'postal_code': None,
 'emission_g_per_km': None,
 'consumption_l_per_100_km': None}

### Zone non-visible

In [71]:
specifications = soup.find_all('div', {'class':'cotea16-mes', 'id':'mes-ht'})

In [72]:
list_of_keys = ['color', 'real_power', 'bodywork', 'air_quality_sticker', 'paint', 'technical_control', 'others']

In [73]:
list_of_values = []
for i in specifications[1].find_all('li', {"class":"nologo"}):
    try:
        list_of_values.append(i.text.strip().split("\n\n")[1])
    except:
        list_of_values.append(None)

IndexError: list index out of range

In [None]:
dict(zip(list_of_keys, list_of_values))

- Le zip entre les clés et les valeurs peuvent s'inverser selon le contenu de la page.
- On gardera uniquement les valeurs et on verra comment les associer avec les clés. À voir si repartir sur du manuel ne serait pas plus simple

## Partie IV

In [None]:
import json

In [None]:
start_urls = []
with open('C:/Users/namor/OneDrive/Documents/simplon/Data IA/Projets/Projet 6/SIMPLON_PARUVENDU_API/paruvendu_api/paruvendu_api/outputs/harvest1.json','r') as sample:
    for line in sample:
        start_urls.append(json.loads(line.strip()))

In [None]:
for i in specifications[1].find_all('li'):
    keys = [i for i in i.text.strip().split('\n') if i][0]
    values = [i for i in i.text.strip().split('\n') if i][1]

    dict_of_specifications[keys] = values

data = pd.concat([data, pd.DataFrame([dict_of_specifications])])

In [None]:
for url in start_urls:
    response = requests.get(url['url'])
    soup = BeautifulSoup(response.text)
    
    if response.status_code != 200:
        response = requests.get(soup.find('a')['href'])
        soup = BeautifulSoup(response.text)
        
    specifications = soup.find_all('div', {'class':'cotea16-mes', 'id':'mes-ht'})

    dict_of_specifications = {}
    try:
        for i in specifications[1].find_all('li'):
            keys = [i for i in i.text.strip().split('\n') if i][0]
            values = [i for i in i.text.strip().split('\n') if i][1]

            dict_of_specifications[keys] = values

        data = pd.concat([data, pd.DataFrame([dict_of_specifications])])
        print(len(data))
    except:
        pass
    time.sleep(5)

In [None]:
data

## Partie V

Ce qui était une idée va probablement devenir réalité. Nous allons utiliser Selenium pour simuler un robot et accéder aux pages.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
from msedge.selenium_tools import EdgeOptions

options = EdgeOptions()
options.use_chromium = True
options.add_argument("--headless")

In [None]:
pip install msedge-selenium-tools

In [None]:
start_urls = [{'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/595/1260555888A1KVVOAB595'},
 {'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/grande-punto/1259800124A1KVVOABPUG'},
 {'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/595/1260385815A1KVVOAB595'},
 {'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/abarth-124-spider/1260779829A1KVVOAB124'},
 {'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/500/1260778646A1KVVOAB500'},
 {'url': 'https://www.paruvendu.fr/a/voiture-occasion/abarth/595/1258641506A1KVVOAB595'}]

In [None]:
def simulate_page(url):
    driver = webdriver.Edge('C:\Program Files (x86)\EdgeDriver\msedgedriver.exe')
    driver.get(url)
#     driver.maximize_window()
    driver.implicitly_wait(20)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    return driver, soup.find_all('div', {'class':'cotea16-mes', 'id':'mes-ht'})

In [None]:
# response = requests.get("https://www.paruvendu.fr/communfo/antiaspiration/default/getCaptcha?idSession=aac_37.58.163.229&url=%2Fa%2Fvoiture-occasion%2Ffiat%2F500%2F1260747636A1KVVOFI500")
# soup = BeautifulSoup(response.text, 'html.parser')

driver, specifications = simulate_page("https://www.paruvendu.fr/communfo/antiaspiration/default/getCaptcha?idSession=aac_37.58.163.229&url=%2Fa%2Fvoiture-occasion%2Ffiat%2F500%2F1260747636A1KVVOFI500")
if "Nos systèmes ont détecté un trafic inhabituel depuis votre accès internet." in soup.text:
    print('Simulation')
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'recaptcha-anchor'))).click()
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "formbp12_btnclassic"))).submit()

print('Début du scrapping')
dict_of_specifications = {}
for index in range(2):
    for i in specifications[index].find_all('li'):
        keys = [i for i in i.text.strip().split('\n') if i][0]
        values = [i for i in i.text.strip().split('\n') if i][1]

        dict_of_specifications[keys] = values

    data = pd.concat([data, pd.DataFrame([dict_of_specifications])])
data

In [None]:
data = pd.DataFrame()

In [None]:
dict_of_specifications = {}
for i in specifications[1].find_all('li'):
    keys = [i for i in i.text.strip().split('\n') if i][0]
    values = [i for i in i.text.strip().split('\n') if i][1]

    dict_of_specifications[keys] = values

data = pd.concat([data, pd.DataFrame([dict_of_specifications])])
data

In [None]:
for i in start_urls:
    print(i['url'])

## Partie VI

In [None]:
import random
import time

In [None]:
brand_list = ['Abarth', 'Aiways', 'Aleko', 'Alfa Romeo', 'Alpina', 'Aro', 'Aston Martin', 'Audi', 'Austin', 'Autres', 'Auverland', 'BMW', 'Bentley', 'Bertone', 'Buggy', 'Buick', 'Cadillac', 'Caterham', 'Chevrolet', 'Chrysler', 'Citroën', 'Corvette', 'Cupra', 'DS', 'Dacia', 'Daewoo', 'Daihatsu', 'Daimler', 'Dangel', 'De la Chapelle', 'Dodge', 'Donkervoort', 'Ferrari', 'Fiat', 'Ford', 'GMC', 'Gac Gonow', 'Honda', 'Hummer', 'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia', 'Lada', 'Lamborghini', 'Lancia', 'Land-Rover', 'Landwin', 'Lexus', 'Lotus', 'MG', 'MPM Motos', 'Mahindra', 'Maruti', 'Maserati', 'Maybach', 'Mazda', 'Mega', 'Mercedes', 'Mini', 'Mitsubishi', 'Morgan', 'Nissan', 'Opel', 'PGO', 'Peugeot', 'Polski/FSO', 'Pontiac', 'Porsche', 'Proton', 'Renault', 'Rolls-Royce', 'Rover', 'Saab', 'Santana', 'Seat', 'Seres', 'Shuanghuan', 'Skoda', 'Smart', 'Ssangyong', 'Subaru', 'Suzuki', 'TVR', 'Talbot', 'Tata', 'Tesla', 'Toyota', 'Venturi', 'Volkswagen', 'Volvo', 'Wallys']

url_list = [f"https://www.paruvendu.fr/a/voiture-occasion/{brand}/" for brand in brand_list]

In [None]:
list_of_user_agents = [ 
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1", 
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"
] 

In [96]:
dict_of_values = {}
for url, brand in zip(url_list, brand_list):
    header={"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding":"gzip, deflate, br",
            "Accept-Language":"fr,fr-FR;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection":"keep-alive",
            "Cookie":"__aaxsc=1; didomi_token=eyJ1c2VyX2lkIjoiMTdlNGVkNjgtY2I3Zi02ODhmLTllMWItODg1MmI2NDE4ODMwIiwiY3JlYXRlZCI6IjIwMjItMDEtMjBUMTQ6MTE6MTIuMDYxWiIsInVwZGF0ZWQiOiIyMDIyLTAxLTIwVDE0OjExOjEyLjA2MVoiLCJ2ZXJzaW9uIjoyLCJwdXJwb3NlcyI6eyJkaXNhYmxlZCI6WyJkZXZpY2VfY2hhcmFjdGVyaXN0aWNzIiwiZ2VvbG9jYXRpb25fZGF0YSJdfSwidmVuZG9ycyI6eyJkaXNhYmxlZCI6WyJnb29nbGUiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSJdfSwiYWMiOiJBQUFBLkRMVUFBQUFBQUJJQkJDSUFFQVFBQ0FRb0VZR1VBU0FRQUlDSUlCQWdBZ0ZBQkFBZ2dBSVFBQUVBTXdBZ0FBQUJFQUJFSkFFRkFJSUloSVFBQUFBaUV3UkFBREFBQ0FJQmdKS0FVQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFCRUFBQUFBQUFCQUFNQUJNQkFFQUJBQUFJQUFBRUlBQUlCQWdBQUFBQUJSQUFCQUFBSkZnZ2hRQUVBQUVDVkFRQUFFQkFRU0FJZ0FBS0lCTUVRQXlBQUFJQUFvSUNpQUVFQVFBQUFCQUNBd0JBQUFBRUJHQUFBQ2dRWUJBQUFBQkFBQ0lCQUFJQ0JJSUlCR0JBQkFBZ0FBQ0FCUUFVUVlnQUZFQWdHUUlBRUlBQkVBQ0lBQ0FvQUJRaHJRQUVDSUNnRUF3QUFRQVFBS0FBRG5BSUFBQ0VZQUFFQkVEQ0lDS29BQ2dBUlFRQ0NSSVVnZ0NBRkVnQUFRRW1rQWdBQVFBZ0FFZ0NBQkVpQVNUd1FBbkFpREFFS0NWRXFCUUFJRUFGZ2lFQ294MGd3UURDd0tBZ0JuSlFZUVlET29FREdnM09Pd0dSQXN4SUJCZ0FFQVFBQUFCdUFDUmdrUU14Z01yaW5xTFZIdl8xMndNNzdsMERCWUlmcWNkY1hTMDZQREFKaHk5TVdyNkFHZFlyWV9Mb1pCSE1URU1DXzRvQUhOVGU3N01OQUFBIn0=; euconsent-v2=CPTHkcBPTHkcBAHABBENB-CgAAAAAH_AAAAAAAARDAJMNS8gC7EscGTaNKoUQIwrCQ6AUAFFAMLRFYQOrgp2VwE-oIWACE1ARgRAgxBRgwCAAACAJCIgJADwQCIAiAQAAgBUgIQAEbAILACwMAgAFANCxAigCECQgyOCo5TAgIkWignsrEEoO9jTCEMssAKBR_RUICJQggWBkJCwcxwBICXCyQLMUb5AAAAA.YAAAD_gAAAAA; dbps=NC",
            "Host":"www.paruvendu.fr",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":random.choice(list_of_user_agents)}
    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.text)
    
    try:
        dict_of_values[brand] = soup.find_all('select', {"name":"md", "class":'aff_act_miseAJourLibelle'})[0].text.split('\n')
    except:
        dict_of_values[brand] = []
        
    time.sleep(3)

In [95]:
dict_of_values

{'Abarth': ['',
  'Tous les modèles',
  '500',
  '595',
  '695',
  'Abarth 124 spider',
  'F595',
  'Grande Punto',
  '']}

In [97]:
a = ['',
  'Tous les modèles',
  '500',
  '595',
  '695',
  'Abarth 124 spider',
  'F595',
  'Grande Punto',
  '']

list(filter(None, a))

['Tous les modèles',
 '500',
 '595',
 '695',
 'Abarth 124 spider',
 'F595',
 'Grande Punto']

In [None]:
.remove('Tous les modèles')

In [100]:
dict_of_values

{'Abarth': ['',
  'Tous les modèles',
  '500',
  '595',
  '695',
  'Abarth 124 spider',
  'F595',
  'Grande Punto',
  ''],
 'Aiways': [],
 'Aleko': [],
 'Alfa Romeo': [],
 'Alpina': [],
 'Aro': ['',
  'Tous les modèles',
  'Aro 10',
  'Aro 24',
  'Cross Lander',
  'Forester',
  'Pick-up',
  'Spartana',
  'Trapeurs',
  'Divers',
  ''],
 'Aston Martin': [],
 'Audi': ['',
  'Tous les modèles',
  '100',
  '200',
  '80',
  '90',
  'A1',
  'A2',
  'A3',
  'A4',
  'A5',
  'A6',
  'A7',
  'A8',
  'Allroad',
  'Coupé',
  'E-TRON',
  'Q2',
  'Q3',
  'Q4',
  'Q5',
  'Q7',
  'Q8',
  'R8',
  'RS Q3',
  'RS3',
  'RS4',
  'RS5',
  'RS6',
  'S3',
  'S4',
  'S5',
  'S6',
  'S8',
  'SQ5',
  'SQ7',
  'SQ8',
  'TT',
  'TT S',
  'V8',
  'Divers',
  ''],
 'Austin': ['', 'Tous les modèles', 'Autres', 'Mini', ''],
 'Autres': [],
 'Auverland': ['', 'Tous les modèles', 'A3', 'A4', 'Divers', ''],
 'BMW': ['',
  'Tous les modèles',
  'i3',
  'i4',
  'i8',
  'iX',
  'iX3',
  'M2',
  'M3',
  'M4',
  'M5',
  'M6',
 