In [10]:
from urllib.request import urlopen
from multiprocessing import Pool
from bs4 import BeautifulSoup
import urllib.request, json
import pandas as pd
import numpy as np
import lxml.html
import requests
import getpass
import math
import re

def get_model(text):
    if 'zen' in text:
        return 'zen'
    elif 'intens' in text:
        return 'intens'
    elif 'life' in text:
        return 'life'
    return None

def get_spec_car_data(text):
    URL = "https://www.leboncoin.fr/voitures/" + text + ".htm?ca=12_s"
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    data = soup.find_all('h2', class_=re.compile('clearfix'))
    data_cleaned = [d.text.replace("\n", "")
                    .replace("\xa0€", "")
                    .replace("Prix", "")
                    .replace(" ", "")
                    .replace("Kilométrage", "")
                    .replace("KM", "")
                    .replace("Année-modèle", "") for d in data]
    price = data_cleaned[0]
    year = data_cleaned[4]
    km = data_cleaned[5]
    return price + "|" + year + "|" + km

def get_cote(df):
    year = df.Year
    version = df.Version
    URL = ('https://www.lacentrale.fr/cote-auto-renault-zoe-' + version + '+charge+rapide-' + str(year) + '.html')
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.find_all('span', class_=re.compile('jsRefinedQuot'))
    cote_cleaned = int(data[0].text.replace("\n", "").replace("\xa0€", "").replace(" ", ""))
    return cote_cleaned
    
def get_part_pro_dataframe(pro, region):
    page = 1
    if pro == True:
        part_pro_link = 'c'
        part_pro = 'pro'
        
    else:
        part_pro_link = 'p'
        part_pro = 'part'
    
    URL = "https://www.leboncoin.fr/annonces/offres/ile_de_france/?th=1&q=renault%20zo%E9"
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.find_all('span', class_=re.compile('total_page'))
    
    URL = ('https://www.leboncoin.fr/voitures/offres/' + region + '/?o=' + str(1) 
           + '&q=renault%20zo%E9&f=' + part_pro_link)
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
        
    id_links = re.findall('voitures/(\d+).htm', str(soup))
    versions = soup.find_all('h2',attrs={"class": u"item_title"})

    versions_cleaned = [v.text.replace('\n', '').replace('\t', '').lower()
                            .replace('renault', '').replace('zoe', '').strip()
                            for v in versions]
        
    data = {"ID_Link": id_links, "Version": versions_cleaned, "Seller": part_pro}
    df = pd.DataFrame(data)
    df["Version"] = df.Version.apply(lambda x: get_model(x))
    df["Info"] = df["ID_Link"].apply(lambda x: get_spec_car_data(x))
    df["Price"] = df.Info.str.split('|',2, expand=True)[0].tolist()
    df["Year"] = df.Info.str.split('|',2, expand=True)[1].tolist()
    df["Kilometer"] = df.Info.str.split('|',2, expand=True)[2].tolist()
    df_cleaned = df.dropna().copy()
    df_cleaned["Cote"] = df_cleaned.apply(get_cote, axis=1)
        
    return df_cleaned
    
df_part = get_part_pro_dataframe(pro=True, region="ile_de_france")
df_pro = get_part_pro_dataframe(pro=True, region="ile_de_france")
frames = [df_part, df_pro]
df = pd.concat(frames)
df.head()

Unnamed: 0,ID_Link,Seller,Version,Info,Price,Year,Kilometer,Cote
0,1318776746,pro,life,10890|2015|7079,10890,2015,7079,11306
1,1318653452,pro,intens,10400|2014|44361,10400,2014,44361,12091
2,1318653554,pro,life,9300|2014|29623,9300,2014,29623,11212
3,1318653590,pro,zen,11400|2016|5160,11400,2016,5160,12420
4,1318653336,pro,zen,10100|2014|35500,10100,2014,35500,9650
