In [60]:
from urllib.request import urlopen
from multiprocessing import Pool
from bs4 import BeautifulSoup
import urllib.request, json
import pandas as pd
import numpy as np
import lxml.html
import requests
import getpass
import math
import re

def get_model(text):
    """Return the trim-level keyword contained in ``text``.

    The keywords are checked in priority order ('zen', then 'intens',
    then 'life'); the first one that occurs as a substring of ``text``
    is returned. The check is case-sensitive.

    Returns None when none of the keywords occurs in ``text``.
    """
    for trim in ('zen', 'intens', 'life'):
        if trim in text:
            return trim
    return None

def get_spec_car_data(text):
    """Fetch one leboncoin ad page and return its key fields as a string.

    Parameters
    ----------
    text : str
        Ad identifier, interpolated into the ad URL.

    Returns
    -------
    str
        ``"price|year|km|phone"``. The first three values are taken from
        the 1st, 5th and 6th ``<h2 class="...clearfix...">`` elements of
        the page after stripping labels, currency and spaces. A field
        whose element is absent is returned as an empty string. ``phone``
        is the last phone-like number (five pairs of digits) found in the
        first description paragraph, reduced to its digits, or
        ``"NoNumber"`` when there is no description or no match.
    """
    URL = "https://www.leboncoin.fr/voitures/" + text + ".htm?ca=12_s"
    # A timeout prevents one stalled connection from blocking a worker forever.
    r = requests.get(URL, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    data = soup.find_all('h2', class_=re.compile('clearfix'))
    data_cleaned = [d.text.replace("\n", "")
                    .replace("\xa0€", "")
                    .replace("Prix", "")
                    .replace(" ", "")
                    .replace("Kilométrage", "")
                    .replace("KM", "")
                    .replace("Année-modèle", "") for d in data]

    def _field(index):
        # Pages with fewer fields than expected yield "" instead of raising
        # IndexError, which would abort the whole batch of ads.
        return data_cleaned[index] if index < len(data_cleaned) else ""

    price = _field(0)
    year = _field(4)
    km = _field(5)

    phone = "NoNumber"
    description = soup.find_all('p', itemprop=re.compile('description'))
    if description:
        # Only the first description paragraph is searched.
        candidates = re.findall(
            r"(\d{2}[\.\s]??\d{2}[\.\s]??\d{2}[\.\s]??\d{2}[\.\s]??\d{2}[\.\s]??)",
            description[0].text.strip())
        if candidates:
            # Keep the last number found; drop dots and any whitespace
            # (the pattern's \s also matches tabs and newlines).
            phone = re.sub(r"[.\s]", "", candidates[-1])
    return price + "|" + year + "|" + km + "|" + phone

def get_cote(df):
    """Look up the lacentrale.fr quote for one row of the ads DataFrame.

    Parameters
    ----------
    df : row-like object
        Must expose ``Year`` and ``Version_Cleaned`` attributes (called
        with a row via ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or None
        The quote parsed from the first ``jsRefinedQuot`` span of the
        page. None when the row has no version, when the HTTP status is
        not 200, or when the page contains no such span.
    """
    year = df.Year
    version = df.Version_Cleaned

    # `version != version` is only true for NaN, which pandas may use for
    # a missing value.
    if version is None or version != version:
        return None

    URL = ('https://www.lacentrale.fr/cote-auto-renault-zoe-' + version + '+charge+rapide-' + str(year) + '.html')
    r = requests.get(URL, timeout=30)

    # No quote page for this version/year combination.
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.find_all('span', class_=re.compile('jsRefinedQuot'))
    if not data:
        return None
    cote_text = data[0].text.replace("\n", "").replace("\xa0€", "").replace(" ", "")
    return int(cote_text)
    
    
def get_part_pro_dataframe(region):
    """Scrape all Renault ZOE ads of one leboncoin region into a DataFrame.

    The listing is paged through twice, once with ``f=c`` (rows labelled
    ``"pro"``) and once with ``f=p`` (rows labelled ``"part"``). Paging
    stops at the first page that yields no ad link. Each ad is then
    enriched with the output of ``get_model``, ``get_spec_car_data`` and
    ``get_cote``.

    Parameters
    ----------
    region : str
        Region slug inserted into the listing URL.

    Returns
    -------
    pandas.DataFrame
        Columns: ID_Link, Version, Seller, Region, Version_Cleaned, Info,
        Price, Year, Kilometer, Phone, Cote. The index is the one produced
        by ``pd.concat`` of the two per-seller frames (not reset).
    """
    frames = []

    for part_pro_link in ['c', 'p']:
        page = 1
        id_all_links = []
        all_versions = []

        while True:
            URL = ('https://www.leboncoin.fr/voitures/offres/' + region + '/?o=' + str(page)
                   + '&q=renault%20zo%E9&f=' + part_pro_link)
            r = requests.get(URL, timeout=30)
            soup = BeautifulSoup(r.text, 'html.parser')

            id_links = re.findall(r'voitures/(\d+).htm', str(soup))
            versions = soup.find_all('h2', attrs={"class": u"item_title"})
            versions_cleaned = [v.text.replace('\n', '').replace('\t', '').lower()
                                .replace('renault', '').replace('zoe', '').strip()
                                for v in versions]

            id_all_links.extend(id_links)
            all_versions.extend(versions_cleaned)

            # The first page without any ad link marks the end of the listing.
            if not id_links:
                break
            page += 1

        part_pro = "pro" if part_pro_link == "c" else "part"

        data = {"ID_Link": id_all_links, "Version": all_versions, "Seller": part_pro, "Region": region}
        frames.append(pd.DataFrame(data))

    df = pd.concat(frames)

    if df.empty:
        # No ads in this region: return an empty frame with the usual columns
        # rather than failing in the string post-processing below.
        return df.reindex(columns=list(df.columns) + [
            "Version_Cleaned", "Info", "Price", "Year", "Kilometer", "Phone", "Cote"])

    # get_model is a plain substring check; a process pool only adds overhead.
    df["Version_Cleaned"] = [get_model(v) for v in df.Version.tolist()]

    # The context manager shuts the worker processes down once map() returns,
    # so repeated calls do not accumulate idle workers.
    with Pool(3) as p:
        df["Info"] = p.map(get_spec_car_data, df.ID_Link.tolist())

    # Split once; `n` is passed by keyword because newer pandas releases
    # reject it positionally.
    info = df.Info.str.split('|', n=3, expand=True)
    # .tolist() assigns by position: the concatenated index has duplicates.
    df["Price"] = info[0].tolist()
    df["Year"] = info[1].tolist()
    df["Kilometer"] = info[2].tolist()
    df["Phone"] = info[3].tolist()
    df["Cote"] = df.apply(get_cote, axis=1)

    return df
## OFF

In [61]:
%%time

def get_leboncoin_data_frame(region):
    """Scrape several regions and return their ads in a single DataFrame.

    Parameters
    ----------
    region : iterable of str, or str
        Region slugs, each passed to ``get_part_pro_dataframe``. A single
        string is treated as one region.

    Returns
    -------
    pandas.DataFrame
        The per-region frames concatenated in the order given.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``region`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    regions = [region] if isinstance(region, str) else region
    return pd.concat([get_part_pro_dataframe(region=r) for r in regions])

# Scrape the three regions below and combine them into one DataFrame.
# Every listing and ad page is fetched over the network, so this cell is slow.
df_leboncoin = get_leboncoin_data_frame(["ile_de_france", "aquitaine", "provence_alpes_cote_d_azur"])

CPU times: user 14.7 s, sys: 1.49 s, total: 16.2 s
Wall time: 1min 20s
