In [1]:
import requests
import html5lib
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
from multiprocessing import Pool

In [2]:
regions = ["ile_de_france", "aquitaine", "provence_alpes_cote_d_azur"]
features = ["Link", "Good_offer", "Price", "Argus", "Model", "Year", "Km", "Phone", "Pro"]

In [3]:
def get_soup_from(URl, method='get', form={}, parser='html5lib'):
    
    """Extracts the source code from a specific URl
    ----------
    Parameters
    URl : string (required)
        web location of the data
    method : string (optional)
        'get' or 'post' request
    form : dict (optional)
        form encoded data embedded in the POST request
    -------
    Returns
    soup : bs4.BeautifulSoup
        content of an HTML/XML document
    """
    
    if method == 'get':
        res = requests.get(URl)
    elif method == 'post':
        res = requests.post(URl, data=form)
    else:
        raise ValueError("ERROR: Wrong choice of method -> 'get' or 'post'.")
        
    assert res.ok, "WARNING: Failed to get request from URL: {}".format(URl)
    
    return BeautifulSoup(res.text, parser)

In [4]:
def get_argus_rating(version, year):
    url = "https://www.lacentrale.fr/cote-auto-renault-zoe-" + version + "+charge+rapide-" + str(year) + ".html"
    soup = get_soup_from(url)
    return int(soup.find('span', class_='jsRefinedQuot').text.replace(' ', ''))

In [5]:
def scrap_ad(url):
    
    soup = get_soup_from(url)
    
    try:
        price = int(soup.find("h2", {"class":"item_price clearfix"}).get("content"))
    except:
        price = np.nan
    
    try:
        year = int(soup.find("span", {"itemprop": "releaseDate"}).text.strip())
    except:
        year = np.nan
    
    try:
        kms_regex = re.compile(r'\d+ KM', re.IGNORECASE)
        kms_match = soup.find("span", {"class":"value"}, text=kms_regex).text
        kms_meter = int(kms_match.replace(' ', '').replace('KM', ''))
    except:
        kms_meter = np.nan
     
    _description = soup.find("div", {"class": "line properties_description"}).text.strip()
    
    try:
        model_regex = re.compile(r'ZEN|INTENS|LIFE', re.IGNORECASE)
        model = re.findall(model_regex, _description)[0].upper()
    except:
        model = 'NaN'
        
    try:
        phone_regex = re.compile(r'[0|\+33|0033][1-9][-. 0-9]{8,}')
        phone = re.findall(phone_regex, _description)[0].replace('-', '').replace('.', '').replace(' ', '')
        if len(phone) != 10:
            phone = 'NaN'
    except:
        phone = 'NaN'
        
    try:
        argus = get_argus_rating(model, year)
        below_argus = 'yes' if price < argus else 'no'
    except:
        argus = np.nan
        below_argus = 'NaN'
        
    try:
        ispro = 'yes' if soup.find("span", {"class": "ispro"}) else 'no'
    except:
        ispro = 'NaN'
    
    return url, below_argus, price, argus, model, year, kms_meter, ispro, phone

In [6]:
def get_ad_links(region):
    links = []
    page_number = 1
    while True:
        url = "https://www.leboncoin.fr/annonces/offres/" + region + "/?o=" + str(page_number) + "&q=renault%20zoe"
        soup = get_soup_from(url)
        links_page = ["https:" + item.find('a')['href']
                      for item in soup.find_all("li", itemtype="http://schema.org/Offer")
                      if re.compile(r'zo[eéÉ]', flags=re.IGNORECASE).search(item.find('a')['title'])
                      and item.find('p')['content'] == 'Voitures']
        if links_page:
            links += links_page
            page_number += 1
        else:
            break
    return links

In [7]:
links = list(itertools.chain.from_iterable(map(lambda reg: get_ad_links(reg), regions)))

In [8]:
pool = Pool(processes=8)
data = pd.DataFrame(pool.map(scrap_ad, links), columns=features)

In [9]:
data

Unnamed: 0,Link,Good_offer,Price,Argus,Model,Year,Km,Phone,Pro
0,https://www.leboncoin.fr/voitures/1317781865.h...,yes,10990,12008.0,INTENS,2015,15254,yes,0164597186
1,https://www.leboncoin.fr/voitures/1337806946.h...,yes,8500,10393.0,ZEN,2014,20774,yes,0134301285
2,https://www.leboncoin.fr/voitures/1337678025.h...,yes,10100,10393.0,ZEN,2014,35500,yes,0175448159
3,https://www.leboncoin.fr/voitures/1337677954.h...,no,10400,10393.0,ZEN,2014,18350,yes,0175448159
4,https://www.leboncoin.fr/voitures/1337481952.h...,yes,10990,12008.0,INTENS,2015,26800,yes,0768351919
5,https://www.leboncoin.fr/voitures/1337251455.h...,yes,7900,10393.0,ZEN,2014,29162,yes,
6,https://www.leboncoin.fr/voitures/1337677869.h...,yes,8890,9074.0,LIFE,2014,24059,yes,0178455073
7,https://www.leboncoin.fr/voitures/1336880612.h...,yes,7500,8752.0,LIFE,2013,28500,no,
8,https://www.leboncoin.fr/voitures/1336791801.h...,yes,9000,12008.0,INTENS,2015,10000,no,
9,https://www.leboncoin.fr/voitures/1170785025.h...,yes,9480,12624.0,LIFE,2016,9087,yes,0805805802


In [10]:
# data.to_csv(...)