In [20]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from itertools import chain
from multiprocessing import Pool, TimeoutError
import numpy as np

In [2]:
urlArgus = "https://www.lacentrale.fr/cote-voitures-renault-zoe--2012-.html"

regions = ["ile_de_france", "provence_alpes_cote_d_azur", "aquitaine"]

urlLeboncoin = "https://www.leboncoin.fr/voitures/offres/"

In [3]:
def getUrlArgus():
    urlArgusList = []
    for annee in range(2012,2018):
        urlArgusList.append("https://www.lacentrale.fr/cote-voitures-renault-zoe--" + str(annee) + "-.html")
    return urlArgusList

In [4]:
def getArgusLinks(urlArgus):
    soupArgus = getSoupFromURL(urlArgus)
    a = soupArgus.find_all("div", class_ = "listingResultLine auto")
    linksArgus = ["https://www.lacentrale.fr/" + a[i].a['href'] for i in range(len(a))]
    return linksArgus

In [5]:
def getSoupFromURL(url, method='get', data={}):
    
    if method == 'get':
        res = requests.get(url)
    elif method == 'post':
        res = requests.post(url, data=data)
    else:
        return None
    
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup
    else:
        return None

In [6]:
def getUrlList():
    urlList = []
    for region in regions:
        siteSoup = getSoupFromURL("https://www.leboncoin.fr/voitures/offres/" + region + "/?th=1&q=renault%20zo%E9")
        numberOfPages = int(siteSoup.find_all("a", class_ = "element page")[-1].text.strip())
        for page in range(1,numberOfPages+1):
            urlList.append(urlLeboncoin + region + "/?o=" + str(page) + "&q=renault%20zo%E9")
    return urlList

In [7]:
def getVersion(soup):
    regexLife = "[Ll]{1}[iI]{1}[fF]{1}[eE]{1}"
    regexZen = "[zZ][eE][nN]"
    regexIntens = "[Ii][Nn][Tt][Ee][Nn][Ss]"
    
    description = soup.find("p", itemprop = "description").text.strip()
    
    if re.search(regexLife, description):
        version = "Life"
    elif re.search(regexZen, description):
        version = "Zen"
    elif re.search(regexIntens, description):
        version = "Intens"
    else:
        version = "Inconnu"
    return version

In [8]:
def sellerType(soup):
    
    if soup.find("span", class_ = "ispro"):
        seller = "Professionel"
    else :
        seller = "Particulier"
        
    return seller

In [9]:
# récuperer la liste de tous les renault zoe
def getLinks(url):
    soup = getSoupFromURL(url)
    subsoup = soup.find_all("a", class_= "list_item clearfix trackable")
    linkList = ["http:" + car['href'] for car in subsoup]
    return linkList

In [10]:
def getNumber(soup):
    
    regexTelNumber = '(0[1-9](?P<sep>[-. ]?)(?:\d{2}(?P=sep)){3}\d{2})'
    description = soup.find("p", itemprop = "description").text.strip()
    num = re.search(regexTelNumber, description)
    
    if num :
        number = num.group(0).replace(" ", "").replace("-", "").replace(".", "")
        return number 
    else : 
        return "Numéro Inconnu"

In [11]:
def getInfos(carLink):

    carSoup = getSoupFromURL(carLink)  

    price = int(carSoup.find_all("span", class_= "value")[0].text.strip().replace("\xa0€", "").replace(" ", ""))
    
    year = int(carSoup.find_all("span", class_= "value")[4].text.strip())
    
    kilometrage = int(carSoup.find_all("span", class_= "value")[5].text.strip().replace("KM", "").replace(" ", ""))
    
    version = getVersion(carSoup)
    
    telephonNumber = getNumber(carSoup)
    
    seller = sellerType(carSoup)
    
    return version, year,  kilometrage, price, telephonNumber, seller

In [12]:
def getArgusEstimatedPrice(argusLink):
    soupArg = getSoupFromURL(argusLink)
    estimatedPrice = int(soupArg.find("span", class_ = "jsRefinedQuot").text.strip().replace(" ", ""))
    return estimatedPrice

In [78]:
%%time

# Scrapping des estimations du prix des Argus
argusUrls = getUrlArgus()
year = [2012, 2013, 2014, 2015, 2016, 2017]

intensPrice = list(map(lambda x :  getArgusEstimatedPrice(getArgusLinks(x)[0]), argusUrls))
lifePrice = list(map(lambda x :  getArgusEstimatedPrice(getArgusLinks(x)[2]), argusUrls))
zenPrice = list(map(lambda x :  getArgusEstimatedPrice(getArgusLinks(x)[4]), argusUrls))

CPU times: user 2.12 s, sys: 61.7 ms, total: 2.18 s
Wall time: 10.6 s


In [79]:
# Création d'une Dataframe avec les estimations des prix en fonction de l'année et du modèle
d = {"Intens" : intensPrice, 'Life': lifePrice, 'Zen': zenPrice}
dfEstimatedPrice = pd.DataFrame(d, index = year)

dfEstimatedPrice["Inconnu"] = round(dfEstimatedPrice.mean(axis = 1),0)
dfEstimatedPrice

Unnamed: 0,Intens,Life,Zen,Inconnu
2012,8535,7320,8725,8193.0
2013,8503,9958,8352,8938.0
2014,9496,10702,11578,10592.0
2015,13417,13760,9650,12276.0
2016,14382,14866,15690,14979.0
2017,13760,14385,16233,14793.0


In [47]:
%%time
# Multithreading
pool = Pool(processes=10)

urlList = getUrlList()

finalList = []
carsLinks = []

carsLinks = pool.map(getLinks, urlList)
carsLinks = list(chain.from_iterable(carsLinks))

# Création de liste avec chaque ligne comprenant les infos sur chaque voiture
completeList = pool.map(getInfos, carsLinks)

CPU times: user 732 ms, sys: 111 ms, total: 843 ms
Wall time: 52.3 s


In [97]:
# Création d'un DataFrame comprenant les infos du boncoin sur les voitures 
df = pd.DataFrame(completeList, columns = ["version", "year",  "kilometrage", "price", "telephon_number", "seller"])
df.head()

Unnamed: 0,version,year,kilometrage,price,telephon_number,seller
0,Inconnu,2017,20000,15900,Numéro Inconnu,Particulier
1,Intens,2015,15254,10990,0164597186,Professionel
2,Life,2014,28537,7900,0134301285,Professionel
3,Life,2013,8751,8990,0164597186,Professionel
4,Life,2014,29000,9700,Numéro Inconnu,Particulier


In [98]:
df["EstimatedPrice"] = np.nan
dftest = df[["year", "version"]].values

In [104]:
%%time
for i in range(len(dftest)):
    if dftest[i][0] in year :
        df["EstimatedPrice"][i] = dfEstimatedPrice.loc[dftest[i][0], dftest[i][1]]    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 5.32 s, sys: 47.7 ms, total: 5.37 s
Wall time: 5.37 s


In [105]:
df.head()

Unnamed: 0,version,year,kilometrage,price,telephon_number,seller,EstimatedPrice
0,Inconnu,2017,20000,15900,Numéro Inconnu,Particulier,14793.0
1,Intens,2015,15254,10990,0164597186,Professionel,13417.0
2,Life,2014,28537,7900,0134301285,Professionel,10702.0
3,Life,2013,8751,8990,0164597186,Professionel,9958.0
4,Life,2014,29000,9700,Numéro Inconnu,Particulier,10702.0


In [106]:
df.to_csv(path_or_buf = "/Users/robinchampseix/Desktop/Telecom/MS_Big_Data/Kit_Data_Science/renaultZoe.csv", sep='\t')