In [77]:
from bs4 import BeautifulSoup 
from selenium import webdriver
# The following packages will also be used in this tutorial
import pandas as pd # All database operations
import numpy as np  # Numerical operations
import time         # Tracking time
import requests     # HTTP requests
import re           # String manipulation
from sklearn.feature_extraction.text import CountVectorizer # BagOfWords (cleaning)
from joblib import Parallel, delayed # Parallellization of tasks

In [103]:
import itertools

In [78]:
def getPage(url):
	''' returns a soup object that contains all the information 
	of a certain webpage'''
	result = requests.get(url)
	content = result.content
	return BeautifulSoup(content, features = "lxml")

url_page ="https://www.airbnb.fr/s/Paris--France/homes?adults=1&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ&refinement_paths%5B%5D=%2Fhomes&checkin=2020-12-30&checkout=2020-12-31"
page=getPage(url_page)

In [79]:
def getRoomClasses(soupPage):
	''' This function returns all the listings that can 
	be found on the page in a list.'''
	rooms = soupPage.findAll("div", {"class": "_8ssblpx"})
	result = []
	for room in rooms:
		result.append(room)
	return result
listing=getRoomClasses(page)

In [80]:
##commentaires
def getNbCommentaire(listing):
	''' Returns the guest information'''
	try:
		return listing.find("span", {"class":"_a7a5sx"}).text 
	except:
		return "pas de commentaire"

##evaluation
def getEvaluation(listing):
	''' Returns the guest information'''
	try:
		return listing.find("span", {"class":"_10fy1f8"}).text 
	except:
		return "pas d'évaluation"

## lien du logement
def getListingLink(listing):
	''' Returns the guest information'''
	try:
		return "http://airbnb.com" + listing.find("a")["href"]
	except:
		return "pas de lien"

## titre du logement
def getListingTitle(listing):
	''' Returns the guest information'''
	try:
		return listing.find("meta")["content"]
	except:
		return "pas de titre du logement"

##description du logement
def getTopRow(listing):
	''' Returns the guest information'''
	try:
		return listing.find("div", {"class": "_167qordg"}).text
	except:
		return "pas de description du logement"   
    
##information sur le nombre de chambre/douche
def getRoomInfo(listing):
	''' Returns the guest information'''
	try:
		return listing.find("div", {"class":"_kqh46o"}).text
	except:
		return "pas d'infos sur chambre/douche"
    
##prix
def getPrix(listing):
	''' Returns the guest information'''
	try:
		return listing.find("div", {"class":"_1fwiw8gv"}).text
	except:
		return "pas d'infos sur le prix"

In [81]:
## extraction des infos de tous les logements d'une page

def extractinfo(page):
    df = pd.DataFrame(columns = ['title', 'link',"nbComments","prix","topRow","evaluation","roomInfo"]) ##,"evaluation", ,'nbComments'
    new=[]
    listing=getRoomClasses(page)
    for i in range(1, len(listing)):
        new.append(getListingTitle(listing[i]))
        new.append(getListingLink(listing[i]))
        new.append(getNbCommentaire(listing[i]))
        new.append(getPrix(listing[i]))
        new.append(getTopRow(listing[i]))
        new.append(getEvaluation(listing[i]))
        new.append(getRoomInfo(listing[i]))
        df.loc[i]=new
        new=[]
    return df

In [82]:
## il faut ajouter des dates aux liens pour voir les prix
def findNextPage(soupPage):
	''' Finds the next page with listings if it exists '''
	try:
		nextpage = "https://airbnb.com" + soupPage.find("a", {"class": "_za9j7e"})["href"]
	except: # When he can't find the button, I assume he reached the end
		nextpage = "no next page"
	return nextpage
url=findNextPage(page)
url

'https://airbnb.com/s/Paris--France/homes?checkin=2020-12-30&refinement_paths%5B%5D=%2Fhomes&adults=1&checkout=2020-12-31&tab_id=home_tab&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ&federated_search_session_id=839d436c-3141-4710-851c-eb566a29934f&search_type=pagination&items_offset=20&section_offset=3'

In [83]:
## nombre de page du site web
def getPages(url):
	''' This function returns all the links to the pages containing 
	listings for one particular city '''
	result = []
	while url != "no next page": 
		page = getPage(url)
		result = result + [page]
		url = findNextPage(page)
	return result
URL=getPages(url_page)
len(URL)

15

In [84]:
## Scrapper toutes les infos sur les logements
def extractPages(url):
	''' This function outputs a dataframe that contains all information of a particular
	city. It thus contains information of multiple listings coming from multiple pages.'''
	pages = getPages(url)
	# Do for the first element to initialize the dataframe
	df =extractinfo(pages[0])
	# Loop over all other elements of the dataframe
	for pagenumber in range(1, len(pages)):
		df = df.append(extractinfo(pages[pagenumber]))
	return df

In [85]:
##base de tous les logements avec les évaluations
data=pd.DataFrame.from_dict(extractPages(url_page))
data.head(5)

Unnamed: 0,title,link,nbComments,prix,topRow,evaluation,roomInfo
1,La montmartoise - null - Paris,http://airbnb.com/rooms/41399617?adults=1&chec...,(51),Prix :24€ / nuit,Chambre privée à 18e arrondissement,4.80,"1 voyageur · 1 chambre · 1 lit · 1,5 salle de ..."
2,Studette 9ème - null - Paris,http://airbnb.com/rooms/46863460?adults=1&chec...,pas de commentaire,Prix :50€ / nuit,Logement entier à 9e arrondissement,pas d'évaluation,2 voyageurs · Studio · 2 lits · Demi-salle de ...
3,"Espace partagé, Shared space - null - Saint-Denis",http://airbnb.com/rooms/9975524?adults=1&check...,(441),Prix :19€ / nuit,Chambre partagée à La Chapelle,4.58,2 voyageurs · 1 chambre · 1 lit · 1 salle de b...
4,Suite balcon avec vue Tour eiffel ou Sacré-Coe...,http://airbnb.com/rooms/37108841?adults=1&chec...,(4),Prix initial :372€Prix réduit :265€ / nuit,Chambre d'hôtel à 8e arrondissement,5.0,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain
5,Logement Privé Relaxant avec grand Jaccusi &Ja...,http://airbnb.com/rooms/15443356?adults=1&chec...,(291),Prix :129€ / nuit,Chambre privée à Rosny-sous-Bois,4.68,"2 voyageurs · 1 chambre · 1 lit · 1,5 salle de..."


In [86]:
## exportation de la base des listingd
len(data.index)
import csv
data.to_csv('D:\Python\dataListing.csv', index = True)

In [None]:
## scrapper les commentaires de chaque logement

In [87]:
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains 




In [88]:
help(webdriver.Chrome().get)

Help on method get in module selenium.webdriver.remote.webdriver:

get(url) method of selenium.webdriver.chrome.webdriver.WebDriver instance
    Loads a web page in the current browser session.



In [89]:
## installer chrome driver
#driver = webdriver.Chrome()
driver_path = 'D:/chromedriver.exe'
opt = webdriver.ChromeOptions()
opt.add_experimental_option('w3c', False)
driver = webdriver.Chrome(executable_path=driver_path,options=opt)

##Initialiser le driver de Selenium
def setupDriver(url, waiting_time = 5):
	driver = webdriver.Chrome(options=opt)
	driver.get(url)
	time.sleep(waiting_time) 
	return driver


In [90]:

## charger chaque listing
def getJSpage(url):
	driver = setupDriver(url)
	html = driver.page_source
	driver.close()
	return BeautifulSoup(html, features="lxml")

# lien de la page des commentaires de chaque listing
def getCommentLink(soupPage):
	try:
		return "https://www.airbnb.fr" + soupPage.find("div", {"class": "_19qg1ru"}).find("a")["href"]
	except:
		return "pas de lien"

##Recuperer l'ensemble des commentaires
def getRoomComments(soupage):
    #try
    crooms=soupage.findAll('div', {'class': '_1gjypya'})
    result=[]
    for room in crooms:
        result.append(room)
    return result

def getcomments(result):
    comments=[]
    for i in result:
        try:
            comments.append(i.find('div', {'class': '_1y6fhhr'}).text)
        except: 
            pass
    return comments



In [91]:
##Scrolling
def ScrollPagecomments(url):
    driver=setupDriver(url)
    num_current_comment_found = 0
    pre_scroll_num_of_comment=-1
    comment_class_when_scrolling  = '_1gjypya'
    time.sleep(5)
    while (num_current_comment_found != pre_scroll_num_of_comment) :           
                visible_comments = driver.find_elements_by_class_name(comment_class_when_scrolling)
                pre_scroll_num_of_comment= len(visible_comments)
                try:
                    last_visible_comment = visible_comments[-1]
                    actions = ActionChains(driver)
                    actions.move_to_element(last_visible_comment)
                    actions.perform()
                    time.sleep(2)
                    last_visible_comment.location_once_scrolled_into_view
                    visible_comments = driver.find_elements_by_class_name(comment_class_when_scrolling)
                    num_current_comment_found = len(visible_comments)
                except:
                    pass
    html=driver.page_source
    driver.close()
    return BeautifulSoup(html,features="lxml")

In [154]:
##EXTRACTION DES COMMENTAIRES
def ExtractionCommentaire(data):
    ind=[]
    com=[]
    DataComments=[]
    i=0
    for link in data['link']:
        soupPage=getJSpage(link)
        url_comment=getCommentLink(soupPage)
        if (url_comment!="pas de lien"):
            js=ScrollPagecomments(url_comment)
            DataComments=getcomments(getRoomComments(js))
            #a=getcomments(getRoomComments(js))
        else :
            DataComments=["pas de commentaire"]
        i=i+1
        ind=ind+list(itertools.repeat(i, len(DataComments)))
        com=com + DataComments
        database=pd.DataFrame.from_dict({'index':ind, 'commentaires':com})
        database.to_csv('D:\Python\database.csv', index = False, encoding = 'utf-8')
    return database

In [157]:
##extraction 
DataComments=ExtractionCommentaire(data) # head(1) [1:2]
#basefinale=pd.DataFrame(DataComments)
#basefinale.to_csv('D:\Python\basefinale.csv', index = True)

In [160]:
DataComments.tail(30)

Unnamed: 0,index,commentaires
10483,1,Louise is an ideal host and you are in good ha...
10484,1,Louise's apartment is a perfect spot to enjoy ...
10485,1,"Louise’s apartment is a clean, tidy and spacio..."
10486,1,Wonderful apartment in an ideal location. Very...
10487,1,"Nice location, clean and fresh apartment, easy..."
10488,1,We had a short but wonderful stay at Louise's ...
10489,1,Wonderful location with lots of shops and rest...
10490,1,Louise is a wonderful hostess and her place is...
10491,1,Very nice and comfy :) great and easy communic...
10492,1,Nice location and perfect interior.\nThe whole...


In [126]:
data.head()

Unnamed: 0,title,link,nbComments,prix,topRow,evaluation,roomInfo
1,La montmartoise - null - Paris,http://airbnb.com/rooms/41399617?adults=1&chec...,(51),Prix :24€ / nuit,Chambre privée à 18e arrondissement,4.80,"1 voyageur · 1 chambre · 1 lit · 1,5 salle de ..."
2,Studette 9ème - null - Paris,http://airbnb.com/rooms/46863460?adults=1&chec...,pas de commentaire,Prix :50€ / nuit,Logement entier à 9e arrondissement,pas d'évaluation,2 voyageurs · Studio · 2 lits · Demi-salle de ...
3,"Espace partagé, Shared space - null - Saint-Denis",http://airbnb.com/rooms/9975524?adults=1&check...,(441),Prix :19€ / nuit,Chambre partagée à La Chapelle,4.58,2 voyageurs · 1 chambre · 1 lit · 1 salle de b...
4,Suite balcon avec vue Tour eiffel ou Sacré-Coe...,http://airbnb.com/rooms/37108841?adults=1&chec...,(4),Prix initial :372€Prix réduit :265€ / nuit,Chambre d'hôtel à 8e arrondissement,5.0,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain
5,Logement Privé Relaxant avec grand Jaccusi &Ja...,http://airbnb.com/rooms/15443356?adults=1&chec...,(291),Prix :129€ / nuit,Chambre privée à Rosny-sous-Bois,4.68,"2 voyageurs · 1 chambre · 1 lit · 1,5 salle de..."


In [47]:
len(DataComments[1])

441

In [174]:
len(DataComments[0])
data['link']

1     http://airbnb.com/rooms/9975524?adults=1&check...
2     http://airbnb.com/rooms/37108841?adults=1&chec...
3     http://airbnb.com/rooms/28977228?adults=1&chec...
4     http://airbnb.com/rooms/33703579?adults=1&chec...
5     http://airbnb.com/rooms/41399617?adults=1&chec...
                            ...                        
15    http://airbnb.com/rooms/13164529?adults=1&chec...
16    http://airbnb.com/rooms/34623498?adults=1&chec...
17    http://airbnb.com/rooms/26151927?adults=1&chec...
18    http://airbnb.com/rooms/46701017?adults=1&chec...
19    http://airbnb.com/rooms/17458548?adults=1&chec...
Name: link, Length: 285, dtype: object

In [157]:
data.head(3)

Unnamed: 0,title,link,nbComments,prix,topRow,evaluation,roomInfo
1,"Espace partagé, Shared space - null - Saint-Denis",http://airbnb.com/rooms/9975524?adults=1&check...,(441),Prix :19€ / nuit,Chambre partagée à La Chapelle,4.58,2 voyageurs · 1 chambre · 1 lit · 1 salle de b...
2,Suite balcon avec vue Tour eiffel ou Sacré-Coe...,http://airbnb.com/rooms/37108841?adults=1&chec...,(4),Prix initial :372€Prix réduit :265€ / nuit,Chambre d'hôtel à 8e arrondissement,5.0,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain
3,Artsy loft by the canal St Martin+République 7...,http://airbnb.com/rooms/28977228?adults=1&chec...,(75),Prix :97€ / nuit,Loft entier à 11e arrondissement,4.93,5 voyageurs · 1 chambre · 3 lits · 1 salle de ...


In [152]:
data.head(2)
data[1:2]
data.tail(2)

Unnamed: 0,title,link,nbComments,prix,topRow,evaluation,roomInfo
18,Appartement de charme dans les hauts de Seine ...,http://airbnb.com/rooms/46701017?adults=1&chec...,pas de commentaire,Price:$99 / night,Entire apartment in La Garenne-Colombes,pas d'évaluation,2 guests · 1 bedroom · 1 bath
19,1 chambre privée confortable - null - Villejuif,http://airbnb.com/rooms/17458548?adults=1&chec...,(113),Price:$42 / night,Private room in Villejuif,4.46,2 guests · 1 bedroom · 1 bed · 1 bath


In [70]:
help(write)

NameError: name 'write' is not defined

In [86]:
data1=data.copy()
data1.drop_duplicates()

Unnamed: 0,title,link,nbComments,prix,topRow,evaluation,roomInfo
1,"Espace partagé, Shared space - null - Saint-Denis",http://airbnb.com/rooms/9975524?adults=1&check...,(441),Prix :19€ / nuit,Chambre partagée à La Chapelle,4.58,2 voyageurs · 1 chambre · 1 lit · 1 salle de b...
2,Suite balcon avec vue Tour eiffel ou Sacré-Coe...,http://airbnb.com/rooms/37108841?adults=1&chec...,(4),Prix initial :372€Prix réduit :265€ / nuit,Chambre d'hôtel à 8e arrondissement,5.0,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain
3,Artsy loft by the canal St Martin+République 7...,http://airbnb.com/rooms/28977228?adults=1&chec...,(75),Prix :97€ / nuit,Loft entier à 11e arrondissement,4.93,5 voyageurs · 1 chambre · 3 lits · 1 salle de ...
4,Secret Deluxe Room in the heart of Paris - nul...,http://airbnb.com/rooms/33703579?adults=1&chec...,(18),Prix :97€ / nuit,Chambre d'hôtel à 9e arrondissement,4.61,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain
5,La montmartoise - null - Paris,http://airbnb.com/rooms/41399617?adults=1&chec...,(51),Prix :24€ / nuit,Chambre privée à 18e arrondissement,4.80,"1 voyageur · 1 chambre · 1 lit · 1,5 salle de ..."
...,...,...,...,...,...,...,...
15,Chambre pour 2 personnes vue jardin - null - S...,http://airbnb.com/rooms/13164529?adults=1&chec...,(115),Price:$48 / night,Private room in Sèvres,4.10,2 guests · 1 bedroom · 1 bed · 1 shared bath
16,Paris Nation : lovely flat very bright - null ...,http://airbnb.com/rooms/34623498?adults=1&chec...,(103),Price:$102 / night,Entire apartment in Nation,4.81,2 guests · 1 bedroom · 1 bed · 1 bath
17,Double Room - Smart Place Gare du Nord - null ...,http://airbnb.com/rooms/26151927?adults=1&chec...,(295),Price:$82 / night,Hotel room in X Arrondissement,4.57,2 guests · 1 bedroom · 1 bed · 1 bath
18,Appartement de charme dans les hauts de Seine ...,http://airbnb.com/rooms/46701017?adults=1&chec...,pas de commentaire,Price:$99 / night,Entire apartment in La Garenne-Colombes,pas d'évaluation,2 guests · 1 bedroom · 1 bath


In [91]:
help(driver)

Help on WebDriver in module selenium.webdriver.chrome.webdriver object:

class WebDriver(selenium.webdriver.remote.webdriver.WebDriver)
 |  WebDriver(executable_path='chromedriver', port=0, options=None, service_args=None, desired_capabilities=None, service_log_path=None, chrome_options=None, keep_alive=True)
 |  
 |  Controls the ChromeDriver and allows you to drive the browser.
 |  
 |  You will need to download the ChromeDriver executable from
 |  http://chromedriver.storage.googleapis.com/index.html
 |  
 |  Method resolution order:
 |      WebDriver
 |      selenium.webdriver.remote.webdriver.WebDriver
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, executable_path='chromedriver', port=0, options=None, service_args=None, desired_capabilities=None, service_log_path=None, chrome_options=None, keep_alive=True)
 |      Creates a new instance of the chrome driver.
 |      
 |      Starts the service and then creates new instance of chrome driver.
 |      

In [None]:
import itertools
index=[]
comments=[]
for i in range(len(DataComments)):
    index=index+list(itertools.repeat(i, len(DataComments[i]))
    comments=comments+DataComments[i]

In [68]:
for i in range(5):
    print(i)


0
1
2
3
4


In [122]:
list1=[1]
list2=['d','e','f']
list1=list1+list2
list1

[1, 'd', 'e', 'f']

In [73]:
import csv 
  
# data to be written row-wise in csv fil 
data = [['Geeks','test'], [4], ['geeks !']] 
  
# opening the csv file in 'w+' mode 
file = open('D:/Python/g4g.csv', 'w+', newline ='') 
  
# writing the data into the file 
with file:     
    write = csv.writer(file) 
    write.writerows(data),