# Importing

In [1]:
# Basic libraries
import pandas as pd
import numpy as np

# Importing for scraping
from bs4 import BeautifulSoup 
from selenium import webdriver

import time         # Tracking time
import requests     # HTTP requests
import re           # String manipulation
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### Choosing of interesting nodes
We import a large list of reviews from 'insideairbnb.com' and keep the listings reviewed by popular reviewers (those with more than 7 reviews in the dataset, as set by the `count > 7` filter in the code below).

In [2]:
# Load the raw review table used to select the reviewers of interest.
# NOTE(review): absolute, machine-specific path - this cell only runs on the
# author's PC; consider a configurable data directory.
reviews = pd.read_csv(r"C:\Users\Gianmarco\Dropbox\Il mio PC (DESKTOP-DM1F9TN)\Desktop\SocialNetworkAnalysis\progetto\Data\reviews_amsterdam.csv")

In [3]:
# Count the review rows of every reviewer, then keep only the reviewers whose
# count is strictly greater than 7.
count = (
    reviews.groupby('reviewer_id')[['listing_id']]
    .count()
    .reset_index()
    .rename(columns={'listing_id': 'count'})
)
count = count[count['count'] > 7]

In [4]:
count.shape

(22, 2)

In [5]:
# Inner join on reviewer_id: only the reviews written by the selected
# reviewers survive; their listing ids are the listings to scrape.
df_reviews = reviews.merge(count, on="reviewer_id")
listings = df_reviews.loc[:, "listing_id"]

In [6]:
len(listings.unique())

171

# Data Scraping of listings

In [7]:
# Use selenium driver to obtain JavaScript information of airbnb pages
def setupDriver(url, waiting_time = 20):
    '''Start a Chrome session, open ``url`` and pause before handing it back.

    Parameters
    ----------
    url : str
        Address to load in the new browser window.
    waiting_time : int or float, default 20
        Seconds to sleep after the navigation call returns. This is a fixed
        pause, not a condition-based wait.

    Returns
    -------
    The live selenium Chrome driver. The caller owns it and must shut it down.

    Raises
    ------
    Whatever ``driver.get`` raises; the browser is shut down before the
    exception is re-raised.
    '''
    driver = webdriver.Chrome("chromedriver.exe")
    try:
        driver.get(url)
        time.sleep(waiting_time)
    except BaseException:
        # Navigation failed or the run was interrupted during the pause:
        # do not leave an orphaned browser behind, then propagate.
        driver.quit()
        raise
    return driver


def getJSpage(url):
    '''Load ``url`` in a browser and return the rendered page as a BeautifulSoup tree.

    The returned soup is the input expected by the ``get...2`` extraction
    functions that read specific fields of a listing detail page.

    Elements with class ``_1d079j1e`` are clicked before the HTML is captured,
    starting from the third match (the first two matches are skipped -
    presumably they are not expandable text sections; TODO confirm).
    Clicking is best effort: the first click that fails ends the expansion
    and the page is still captured.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    BeautifulSoup
        Parsed page source, built with the ``lxml`` parser.
    '''
    driver = setupDriver(url)
    try:
        read_more_buttons = driver.find_elements_by_class_name("_1d079j1e")
        try:
            for button in read_more_buttons[2:]:
                button.click()
        except Exception:
            # Best effort only; Exception (not a bare except) so that
            # Ctrl-C still interrupts a long scrape.
            pass
        html = driver.page_source
    finally:
        # quit() ends the whole session (window and driver process); it runs
        # even when something above raises, so no browser is leaked.
        driver.quit()
    return BeautifulSoup(html, features="lxml")

In [8]:
# GetFunctions
def getLinkListings2(listing):
    '''Build the airbnb.it room URL (with a trailing "?") for a listing id.'''
    pieces = ("https://www.airbnb.it/rooms/", str(listing), "?")
    return "".join(pieces)

def getListingTitle2(listing):
    '''Return the text of the first ``<h1 class="_fecoyn4">`` of a parsed page.

    Parameters
    ----------
    listing : parsed page exposing ``find`` (the soup returned by getJSpage)

    Returns
    -------
    str, or an empty list ``[]`` when no such element is present. The list
    placeholder is kept because existing outputs rely on it.
    '''
    try:
        output = listing.find("h1", {"class": "_fecoyn4"}).text
    except AttributeError:
        # find() returned None: the element is missing on this page.
        # Only this case is handled, so interrupts and real bugs surface.
        output = []
    return output

def getTopRow2(listing):
    '''Return the text of the first ``<div class="_xcsyj0">`` of a parsed page.

    In the sample output this is the "room type - Host: name" line.

    Parameters
    ----------
    listing : parsed page exposing ``find`` (the soup returned by getJSpage)

    Returns
    -------
    str, or an empty list ``[]`` when the element is missing.
    '''
    try:
        output = listing.find("div", {"class": "_xcsyj0"}).text
    except AttributeError:
        # find() returned None; anything else (including Ctrl-C) propagates.
        output = []
    return output
    
    
def getRoomInfo2(listing):
    '''Return the room/guest information strings of a parsed listing page.

    The first ``<div class="_tqmy57">`` is located and the text of every
    other ``<span>`` inside it (positions 0, 2, 4, ...) is collected.

    Parameters
    ----------
    listing : parsed page exposing ``find`` (the soup returned by getJSpage)

    Returns
    -------
    list of str
        Empty when the container is missing.
    '''
    try:
        spans = listing.find("div", {"class":"_tqmy57"}).findAll("span")[0::2]
        return [span.text for span in spans]
    except AttributeError:
        # Container not found (find() returned None); other errors propagate.
        return []

def getBasicFacilities2(listing):
    '''Return the basic facilities listed on a parsed listing page.

    Collects the text of every ``<div>`` whose class attribute is
    ``"iikjzje dir dir-ltr"``.

    Parameters
    ----------
    listing : parsed page exposing ``findAll`` (the soup returned by getJSpage)

    Returns
    -------
    list of str
        Empty when nothing matches.
    '''
    try:
        matches = listing.findAll("div", {"class":"iikjzje dir dir-ltr"})
        return [node.text for node in matches]
    except AttributeError:
        # Input (or a match) lacks the expected attribute; keep the
        # best-effort empty result but let interrupts and other bugs surface.
        return []

def getListingRating2(listing):
    '''Return the rating text of a parsed listing page.

    Reads the text of the first element with class ``_12si43g``.

    Parameters
    ----------
    listing : parsed page exposing ``find`` (the soup returned by getJSpage)

    Returns
    -------
    str, or an empty list ``[]`` when the element is missing.
    '''
    try:
        output = listing.find(class_="_12si43g").text
    except AttributeError:
        # find() returned None; anything else (including Ctrl-C) propagates.
        output = []
    return output

def getListingReviewNumber2(listing):
    '''Return the review-count text of a parsed listing page.

    Reads the text of the first element with class ``_1qx9l5ba`` (for example
    ``"(278 recensioni)"`` in the sample output).

    Parameters
    ----------
    listing : parsed page exposing ``find`` (the soup returned by getJSpage)

    Returns
    -------
    str, or an empty list ``[]`` when the element is missing. An empty list
    can mean either "no reviews" or "scraping failed".
    '''
    try:
        output = listing.find(class_ = "_1qx9l5ba").text
    except AttributeError:
        # find() returned None; anything else (including Ctrl-C) propagates.
        output = []
    return output

In [9]:
def createDf(listings):
    '''Scrape every distinct listing id and collect the results in a dataframe.

    For each unique value of ``listings`` the detail page is fetched with
    getJSpage and the ``get...2`` extractors are applied to it. A running
    counter is printed after each listing.

    Parameters
    ----------
    listings : object exposing ``.unique()`` (a pandas Series of listing ids)

    Returns
    -------
    pandas.DataFrame
        One row per unique listing with the columns listing_id, title, toprow,
        roominfo, facilities, rating, link and reviewnumber.
    '''
    table = {
        "listing_id": [],
        "title": [],
        "toprow": [],
        "roominfo": [],
        "facilities": [],
        "rating": [],
        "link": [],
        "reviewnumber": [],
    }
    for progress, listing_id in enumerate(listings.unique(), start=1):
        url = getLinkListings2(listing_id)
        soup = getJSpage(url)
        table["link"].append(url)
        table["title"].append(getListingTitle2(soup))
        table["toprow"].append(getTopRow2(soup))
        table["roominfo"].append(getRoomInfo2(soup))
        table["facilities"].append(getBasicFacilities2(soup))
        table["rating"].append(getListingRating2(soup))
        table["reviewnumber"].append(getListingReviewNumber2(soup))
        table["listing_id"].append(listing_id)
        print(progress)
    return pd.DataFrame(table)

In [10]:
# Full scrape of the selected listings: createDf opens one browser session per
# unique listing id and prints a running counter (the numbers below).
df_listings = createDf(listings)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171


In [11]:
df_listings

Unnamed: 0,listing_id,title,toprow,roominfo,facilities,rating,link,reviewnumber
0,2818,@ Daniel's (camera vista giardino),Stanza privata in appartamento - Host: Daniel,"['2 ospiti', '1 camera da letto', '2 letti', '...","['Wi-fi', 'Lavatrice', 'Cortile', 'Asciugacape...",4.88,https://www.airbnb.it/rooms/2818?,(278 recensioni)
1,8778725,[],[],[],[],[],https://www.airbnb.it/rooms/8778725?,[]
2,15458166,Private room with canal view,Stanza privata in casa a schiera - Host: Malti,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Accesso alla spiaggia (sul lungomare)', 'Cuc...",4.83,https://www.airbnb.it/rooms/15458166?,(78 recensioni)
3,17632137,"Modern 2P apt at historic Dappermarket | Zoo, ...",Intero appartamento - Host: Guido,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Cucina', 'Wi-fi', 'TV', 'Lavatrice', 'Frigor...",4.36,https://www.airbnb.it/rooms/17632137?,(25 recensioni)
4,20203218,"Private room in Eastern Docklands, Amsterdam",Stanza privata in casa - Host: Carolyn,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Lungomare', 'Wi-fi', 'Vasca da bagno', 'Gioc...",4.85,https://www.airbnb.it/rooms/20203218?,(26 recensioni)
...,...,...,...,...,...,...,...,...
166,21354405,Modern & beautiful apt. in OudWest with garden!,Intero appartamento - Host: Lotte,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Cucina', 'Wi-fi', 'TV', 'Lavatrice', 'Cortil...",4.90,https://www.airbnb.it/rooms/21354405?,(10 recensioni)
167,23747999,Rivierenbuurt,Intero appartamento - Host: Paskalle,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Cucina', 'Wi-fi', 'TV', 'Patio o balcone', '...",4.91,https://www.airbnb.it/rooms/23747999?,(11 recensioni)
168,26000561,Cosy apartment in the centre of Amsterdam,Intero appartamento - Host: Demi,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Cucina', 'Wi-fi', 'TV', 'Lavatrice', 'Asciug...",4.88,https://www.airbnb.it/rooms/26000561?,(16 recensioni)
169,33990484,Lovely bright apartment near Vondelpark,Intero appartamento - Host: Fleur,"['2 ospiti', '1 camera da letto', '1 letto', '...","['Cucina', 'Wi-fi', 'Animali domestici ammessi...",4.93,https://www.airbnb.it/rooms/33990484?,(29 recensioni)


In [12]:
# Persist the scraped listings so the slow scrape does not have to be repeated.
# NOTE(review): absolute, machine-specific path.
df_listings.to_csv(r"C:\Users\Gianmarco\Dropbox\Il mio PC (DESKTOP-DM1F9TN)\Desktop\SocialNetworkAnalysis\progetto\Data\listings_amsterdam.csv", index = False)

In [10]:
# Reload the listings saved above instead of scraping again.
# NOTE(review): after the CSV round trip the list-valued columns and the []
# placeholders presumably come back as plain strings - confirm before parsing.
df_listings = pd.read_csv(r"C:\Users\Gianmarco\Dropbox\Il mio PC (DESKTOP-DM1F9TN)\Desktop\SocialNetworkAnalysis\progetto\Data\listings_amsterdam.csv")

# Data Scraping of reviews
(for each of the listings scraped above)

In [12]:
# GetFunctions
def getListingReviewNumber(revnumber_column):
    '''Convert scraped review-count texts into integers.

    Parameters
    ----------
    revnumber_column : iterable
        Values such as ``"(278 recensioni)"``.

    Returns
    -------
    list of int
        The first run of digits of every entry, in input order. ``-1`` marks
        an entry with no digits or that is not a string (no reviews, or a
        failed scrape).
    '''
    output = []
    for n in revnumber_column:
        try:
            ele = int(re.findall(r"[1234567890]+", n)[0])
        except (TypeError, IndexError, ValueError):
            # TypeError: entry is not text (e.g. NaN/None);
            # IndexError: no digits in the text;
            # ValueError: digit run that int() refuses to convert.
            ele = -1
        output.append(ele)
    return output

def extracturlreviews(link):
    '''Derive the reviews URL of a listing: drop the last character of
    ``link`` and append "/reviews?".'''
    without_last_char = link[:-1]
    return without_last_char + "/reviews?"

def getReviews(JSsoup):
    '''Collect the review elements of a parsed reviews page.

    Looks up the first element with class ``_3j8fry`` and returns every
    element with class ``_1gjypya`` found inside it. Raises AttributeError
    when the outer element is absent.
    '''
    reviews_container = JSsoup.find(class_ = "_3j8fry")
    review_nodes = reviews_container.findAll(class_ = "_1gjypya")
    return review_nodes

In [13]:
#function to simulate scroll down on inner page
def extractScrollReviews(url, n, max_idle_rounds=30, poll_interval=0.5):
    '''Open a reviews page, scroll until ``n`` reviews are rendered, return the soup.

    The page loads more reviews as the last rendered one is scrolled into
    view, so the function keeps scrolling to the last review and recounting.

    Parameters
    ----------
    url : str
        Address of the reviews page.
    n : int
        Number of review elements to wait for. With ``n <= 0`` no scrolling
        is done.
    max_idle_rounds : int, default 30
        Number of consecutive scroll attempts that may pass without any new
        review appearing before giving up. Without this limit the loop never
        ends when the page cannot render ``n`` reviews.
    poll_interval : float, default 0.5
        Seconds to sleep after a scroll attempt that loaded nothing new.

    Returns
    -------
    BeautifulSoup
        Parsed page source (``lxml`` parser) with the reviews rendered so far.

    Raises
    ------
    Whatever selenium raises (for example when no review appears within the
    40 second wait); the browser is shut down before the error propagates.
    '''
    review_xpath = "//div[@class='_3j8fry']//div[@class='_1gjypya']"
    last_review_xpath = "(" + review_xpath + ")[last()]"

    driver = webdriver.Chrome("chromedriver.exe")
    try:
        wait = WebDriverWait(driver,40)
        driver.get(url)
        wait.until(EC.presence_of_all_elements_located((By.XPATH, review_xpath)))

        loaded = 0
        idle_rounds = 0
        while loaded < n:
            # Reading this property is what scrolls the element into view.
            driver.find_element_by_xpath(last_review_xpath).location_once_scrolled_into_view
            current = len(driver.find_elements_by_xpath(review_xpath))
            if current > loaded:
                loaded = current
                idle_rounds = 0
                continue
            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                # Nothing new is arriving: return what has been rendered.
                break
            time.sleep(poll_interval)
        html = driver.page_source
    finally:
        # Always end the session, including on timeouts, so that failed
        # listings do not leave a browser running.
        driver.quit()
    return BeautifulSoup(html, features="lxml")

In [14]:
def extractInformation(df):
    '''Scrape the review elements of every listing in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``listing_id``, ``link`` and ``reviewnumber``
        (the table produced by createDf).

    Returns
    -------
    pandas.DataFrame
        One row per distinct link with the columns listing_id, link,
        reviewnumber (integer, -1 when unknown) and revlist. ``revlist`` holds
        the review elements returned by getReviews, or the string ``'null'``
        when scraping that listing failed.

    A running counter is printed after every listing, preceded by ``null``
    for the listings that failed.
    '''
    # Take id, link and review count from the same de-duplicated rows so the
    # three columns always stay aligned, even if df contains repeated listings.
    unique_rows = df.drop_duplicates(subset="link")
    listings_id = list(unique_rows.listing_id)
    links = list(unique_rows.link)
    reviews = getListingReviewNumber(list(unique_rows.reviewnumber))

    revlist = []
    for cont, (link, expected) in enumerate(zip(links, reviews), start=1):
        try:
            revlist.append(getReviews(extractScrollReviews(extracturlreviews(link), expected)))
        except Exception:
            # Best effort per listing; Exception (not a bare except) so that
            # Ctrl-C can still stop the whole run.
            revlist.append('null')
            print('null')
        print(cont)

    dictionary = {"listing_id": listings_id,  "link": links, "reviewnumber": reviews, "revlist":revlist}
    return pd.DataFrame(dictionary)
# The revlist column contains the HTML elements of all reviews of the listing in that row

In [17]:
# Scrape the review pages of all listings; a "null" line in the log below marks
# a listing whose reviews could not be scraped.
reviews_info = extractInformation(df_listings)

1
null
2
3
4
5
6
null
7
8
9
10
null
11
12
13
14
15
16
17
18
19
20
21
22
23
null
24
25
26
null
27
28
null
29
30
31
32
33
34
35
36
37
38
39
null
40
41
42
43
44
45
46
47
48
null
49
50
51
52
53
54
55
56
57
58
59
60
null
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
null
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
null
95
96
97
98
99
100
101
102
null
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
null
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
null
161
162
163
164
165
null
166
167
168
169
170
171


In [18]:
# Drop the listings whose review scrape failed ('null' is the failure marker
# written by extractInformation), then show how many listings remain.
reviews_info = reviews_info[reviews_info.revlist != 'null']
reviews_info.shape

(156, 4)

In [19]:
def getReviewer_id(row):
    '''Extract the reviewer id from a single review element.

    The first ``<a class="_105023be">`` inside ``row`` is located and the
    first run of digits in its ``href`` is returned.

    Parameters
    ----------
    row : parsed review element exposing ``find``

    Returns
    -------
    int, or an empty list ``[]`` when the link, its ``href`` or the digits
    are missing.
    '''
    try:
        profile_link = row.find("a",{"class": "_105023be"})
        idx = int(re.findall(r"[1234567890]+", profile_link["href"])[0])
    except (AttributeError, TypeError, KeyError, IndexError):
        # AttributeError/TypeError: no usable link (find() gave None or row is
        # not an element); KeyError: link without href; IndexError: no digits.
        idx = []
    return idx

In [20]:
# In this way every row correspond to a single review
def createdfreviews(df):
    '''Flatten the per-listing review elements into one row per review.

    For every row of ``df`` the first ``reviewnumber`` entries of ``revlist``
    are read in order. Processing of a listing stops at the first review that
    cannot be read completely (missing entry, missing text or missing date).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``listing_id``, ``reviewnumber`` and
        ``revlist`` (the table produced by extractInformation, without the
        failed listings).

    Returns
    -------
    pandas.DataFrame
        Columns reviewer_id, reviews (review text), date and listing_id.
    '''
    listings_id, dates, reviews, reviews_id = [], [], [], []
    for i in range(df.shape[0]):
        row = df.iloc[i]
        revlist = row["revlist"]
        for cont in range(int(row["reviewnumber"])):
            # Read every field first and only then append, so that a review
            # with a missing field can never leave the four lists with
            # different lengths (which would make pd.DataFrame raise).
            try:
                element = revlist[cont]
                text = element.find("span").text
                reviewer = getReviewer_id(element)
                date = element.find("div", {"class": "_1ixuu7m"}).text
            except (IndexError, KeyError, TypeError, AttributeError):
                break
            reviews.append(text)
            reviews_id.append(reviewer)
            dates.append(date)
            listings_id.append(row["listing_id"])
    dictionary = {"reviewer_id" : reviews_id, "reviews": reviews ,"date": dates,"listing_id": listings_id}
    return pd.DataFrame(dictionary)

In [21]:
# One row per scraped review.
# NOTE(review): this rebinds df_reviews, which earlier in the notebook held the
# merged insideairbnb review table.
df_reviews = createdfreviews(reviews_info)

In [22]:
# Save the flattened reviews.
# NOTE(review): absolute, machine-specific path.
df_reviews.to_csv(r"C:\Users\Gianmarco\Dropbox\Il mio PC (DESKTOP-DM1F9TN)\Desktop\SocialNetworkAnalysis\progetto\Data\reviews_list_amsterdam.csv", index = False)

In [23]:
df_reviews.head()

Unnamed: 0,reviewer_id,reviews,date,listing_id
0,39647118,"Daniel è una persona molto socievole , ci siam...",aprile 2016,2818
1,31905488,"Ottima accoglienza, abbiamo concordato l'orari...",ottobre 2015,2818
2,28738000,L'appartamento è molto pulito. Daniel è molto ...,giugno 2015,2818
3,32161750,Daniel è una persona estremamente gentile. L'a...,luglio 2015,2818
4,33086171,"Daniel is a perfect host: he is really kind, p...",ottobre 2017,2818


In [25]:
df_reviews.shape

(11394, 4)