Intro) Le but de ce projet est de réaliser une analyse géographique du marché des gîtes en France et de construire un modèle de prédiction du prix à la nuit des gîtes en incluant une variable géographique sur l'emplacement de l'établissement.

Dans ce notebook, on lance un crawl via Selenium pour récupérer les informations des gîtes situés en France à partir des URLs récupérées dans le notebook « 1- Crawl Urls Gites ». Ces informations constitueront la base de données pour les travaux suivants.

In [1]:
import time
import re
import concurrent.futures
import warnings
import traceback
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent
from selenium.webdriver.chrome.service import Service

from tqdm.notebook import tqdm

from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Load the gite URLs to crawl and drop the ones already logged as completed,
# so that an interrupted crawl can be resumed without re-fetching finished pages.
with open(r"urls.txt", "r") as liste_urls:
    gites_urls = [url.replace("\n", "") for url in liste_urls.readlines()]
# The first line of urls.txt is skipped (presumably a header row -- TODO confirm).
gites_urls = gites_urls[1:]

try:
    with open("completed.txt") as log:
        # Each log line is written as "%r page is completed" (see main), so the
        # suffix and the quotes added by %r are stripped to recover the bare URL.
        done_urls = [
            url.replace("\n", "").replace(" page is completed", "").replace("'", "")
            for url in log.readlines()
        ]
except FileNotFoundError:
    # No log yet: nothing has been crawled so far. Any other error (permissions,
    # decoding, ...) is allowed to surface instead of silently restarting the crawl.
    done_urls = []

# Set lookup keeps the resume filter linear instead of scanning the whole
# done list once per candidate URL.
_done_lookup = set(done_urls)
gites_urls = [url for url in gites_urls if url not in _done_lookup]
len(gites_urls)

25467

In [3]:
def _squash_whitespace(text):
    """Collapse every run of whitespace (newlines included) into a single space."""
    return " ".join(text.split())


def _capacity_value(chrome, css_class):
    """Return the 'capacity-value' text inside <li class=css_class>, or "" if absent."""
    try:
        item = chrome.find_element(By.CSS_SELECTOR, f"li[class='{css_class}']")
        return item.find_element(By.CSS_SELECTOR, "span[class='capacity-value']").text
    except Exception:
        return ""


def _has_element(chrome, css_selector):
    """Return True when at least one element matches css_selector on the current page."""
    try:
        chrome.find_element(By.CSS_SELECTOR, css_selector)
    except Exception:
        return False
    return True


def _extract_fields(chrome, url_gite):
    """Load url_gite in the given driver and return the 19 output fields, in CSV order.

    Raises whatever Selenium raises when the page title or the header detail
    paragraph cannot be found; every other field falls back to a default value.
    """
    # Get to url
    chrome.get(url_gite)
    time.sleep(1)

    # Cookie popup: accept it when present, ignore it otherwise.
    try:
        btn = chrome.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
        chrome.execute_script("arguments[0].scrollIntoView();", btn)
        chrome.execute_script("arguments[0].click();", btn)
        time.sleep(0.5)
    except Exception:
        pass

    # Gite name. The lookup is deliberately left unguarded: a page without a
    # title makes the whole call fail so the caller can record the URL as failed.
    titre = chrome.find_element(By.CSS_SELECTOR, "h1[class='g2f-accommodationHeader-title']")
    try:
        nom = _squash_whitespace(titre.text)
    except Exception:
        nom = ""

    # Header detail paragraph (also unguarded); it holds both the reference
    # code and the address.
    detail = chrome.find_element(By.CSS_SELECTOR, "p[class='g2f-accommodationHeader-detail']")

    # Gite identification code
    try:
        code = detail.find_element(By.TAG_NAME, "span").text.split("Ref : ")[1]
    except Exception:
        code = ""

    # Gite category: breadcrumb first, header type as a fallback.
    try:
        fil = chrome.find_element(By.CSS_SELECTOR, "a[class='g2f-breadcrumb-link']")
        type_ = fil.text.split("Location ")[1].split(" ")[0]
    except Exception:
        try:
            entete = chrome.find_element(By.CSS_SELECTOR, "h3[class='g2f-accommodationHeader-type']")
            type_ = entete.text.split(" - ")[0]
        except Exception:
            type_ = ""

    # Gite address: both parts are blanked together if either cannot be parsed.
    try:
        morceaux = detail.text.split("| à ")[1].split("- ")
        ville = _squash_whitespace(morceaux[0])
        dep = _squash_whitespace(morceaux[1])
    except Exception:
        ville = ""
        dep = ""

    # Rating level: number of <li> items in the level list.
    try:
        niveau = chrome.find_element(By.CSS_SELECTOR, "ul[class='g2f-levelEpis--big gite g2f-levelEpis']")
        etoiles = len(niveau.find_elements(By.TAG_NAME, "li"))
    except Exception:
        etoiles = ""

    # Number of rooms, guest capacity and surface share the same markup.
    rooms = _capacity_value(chrome, "room")
    capa = _capacity_value(chrome, "people")
    surface = _capacity_value(chrome, "surface")

    # GPS coordinates, read from the data attributes of the map container.
    try:
        carte = chrome.find_element(By.CSS_SELECTOR, "div[id='map-accommodation']")
    except Exception:
        lat = ""
        lng = ""
    else:
        try:
            lat = carte.get_attribute("data-lat")
        except Exception:
            lat = ""
        try:
            lng = carte.get_attribute("data-lng")
        except Exception:
            lng = ""

    # Customer rating
    try:
        bloc_note = chrome.find_element(By.CSS_SELECTOR, "div[class='g2f-rating-container']")
        note = bloc_note.text.split("( ")[1].split("/")[0]
    except Exception:
        note = ""

    # Wifi / pets: only the presence of the element matters.
    wifi = "wifi" if _has_element(chrome, "li[class='wifi']") else "no wifi"
    pets = "pets" if _has_element(chrome, "li[class='pets']") else "no pets"

    # Labelling date. It must be read BEFORE the price: the price fallback
    # below navigates the driver away from the gite page.
    try:
        profil = chrome.find_element(By.CSS_SELECTOR, "div[class='g2f-contactCard-profil u-display--flex']")
        date = re.search(r"(\d+)", profil.text).group()
    except Exception:
        date = ""

    # Stay price: booking funnel first, booking widget iframe as a fallback.
    try:
        lecture_prix = chrome.find_element(By.XPATH, '//*[@id="g2f-funnel-accommodation-booking"]/div/div[2]/div[2]/span[2]/span[3]/strong')
        prix = lecture_prix.text.replace(" €", "").replace(",", ".")
        duree_prix = chrome.find_element(By.XPATH, '//*[@id="g2f-funnel-accommodation-booking"]/div/div[2]/div[2]/span[2]/span[3]/span[2]').text
    except Exception:
        try:
            url_prix = chrome.find_element(By.XPATH, '//*[@id="widget_resafngf"]/iframe').get_attribute('src')
            chrome.get(url_prix)
            lecture_prix = chrome.find_element(By.XPATH, '//*[@id="widgetFNGF_content"]/div[1]/div/div/span[2]')
            prix = lecture_prix.text.replace("€", "")
            duree_prix = chrome.find_element(By.XPATH, '//*[@id="widgetFNGF_content"]/div[1]/div/div/span[3]').text
        except Exception:
            prix = ''
            duree_prix = ""

    # NOTE(review): prix appears twice (13th and 18th field). The duplicate is
    # kept so the file layout stays what any existing consumer expects.
    return [url_gite, code, type_, nom, ville, dep, etoiles, rooms, surface, capa,
            lat, lng, prix, date, note, wifi, pets, prix, duree_prix]


def loop(url_gite):
    """
    Crawl one gite page on GITES DE FRANCE and append its data to the result file.

    Arguments:
        url_gite : String : URL of a gite page on GITES DE FRANCE
        Example : "https://www.gites-de-france.com/fr/normandie/calvados/la-ptite-chaume-14g3269"

    Returns: None
        One tab-separated line of 19 fields is appended to 'Gites_de_France.csv'.
        If building or writing that line fails, a line holding only the URL
        followed by 18 empty fields is appended instead.

    Raises:
        Whatever Selenium raises when the driver cannot be started or when the
        page title / header detail paragraph is missing. The browser is closed
        in every case before the exception propagates.
    """
    # Driver: headless Chrome with a random user agent.
    chrome_options = Options()
    chrome_options.add_argument(f'user-agent={UserAgent().random}')
    chrome_options.add_argument("--headless")
    path = r"DRIVER_PATH"
    # Selenium 4 takes the driver path through a Service object; the
    # executable_path keyword of webdriver.Chrome is no longer accepted by
    # recent releases.
    chrome = webdriver.Chrome(service=Service(executable_path=path), options=chrome_options)
    try:
        chrome.maximize_window()
        fields = _extract_fields(chrome, url_gite)

        # Try to write the collected data; if that is not possible, write the
        # URL alone and leave every other column empty.
        try:
            result = '\t'.join(str(field) for field in fields)
            with open('Gites_de_France.csv', 'a', encoding="utf-8") as fhandle:
                print(result, file=fhandle)
        except Exception:
            with open('Gites_de_France.csv', 'a', encoding="utf-8") as fhandle:
                print(url_gite + '\t' * 18, file=fhandle)
    finally:
        # Always release the browser, including when a lookup above raised;
        # otherwise every failing URL leaves a Chrome process behind.
        chrome.quit()

In [6]:
# Parallel crawl: the URLs are dispatched to a pool of worker processes.
def main():
    """Run loop() on every URL of gites_urls in a pool of 10 worker processes.

    As each task finishes, one line is appended to 'completed.txt' when the
    task succeeded, or to 'exception.txt' (with the exception message) when
    it raised. Progress is shown with a tqdm bar.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        # Remember which URL each future belongs to, for the log lines below.
        pending = {}
        for candidate in gites_urls:
            pending[executor.submit(loop, candidate)] = candidate

        finished_tasks = concurrent.futures.as_completed(pending)
        for task in tqdm(finished_tasks, total=len(pending)):
            url = pending[task]
            try:
                # loop() returns nothing; result() is only called to re-raise
                # any exception that happened in the worker.
                task.result()
            except Exception as exc:
                log_name = 'exception.txt'
                line = '%r generated an exception: %s' % (url, exc)
            else:
                log_name = 'completed.txt'
                line = '%r page is completed' % url
            with open(log_name, "a") as flog:
                print(line, file=flog)


if __name__ == "__main__":
    main()