In [66]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from re import search
from time import sleep
import random
from dotenv import load_dotenv
import os
from pymongo import MongoClient

startingUrl = "https://vancouver.craigslist.org/search/apa?query=ubc&min_price=&max_price=&availabilityMode=0&sale_date=all+dates"
# add 's=START_NUMBER' to the query in order to scrape through paginator
load_dotenv('.env')

# try to avoid scraper defense

HEADER = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Methods": "GET",
  "Access-Control-Allow-Headers": "Content-Type",
  "Access-Control-Max-Age": "3600",
  "User-Agent":
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
};
req = requests.get(startingUrl, HEADER)
soup = BeautifulSoup(req.content, 'html.parser')
# find all rows
resultRows = soup.find_all("li", {"class": "result-row"})

data = []
for result in resultRows:
    row = {}
    time = result.find("time", {"class": "result-date"})['datetime']
    header = result.find("h3", {"class": "result-heading"}).find("a")
    name = header.text
    href = header['href']
    price = result.find("span", {"class": "result-price"}).text
    bdr = result.find("span", {"class": "housing"})
    if bdr:
        bdr = bdr.text
        if search("br", bdr):
            bdr = int(bdr[bdr.find(' ')+len(' '):bdr.rfind('br')])
        else:
            bdr = 1
    else:
        bdr = 1
    # put everything together
    row['name'] = name
    row['href'] = href
    row['time'] = time
    row['price'] = price
    row['bedroom'] = bdr
    data.append(row)
# Put everything together in a dataframe
df = pd.DataFrame(data)



In [2]:
# Pagination Handler
totalCounts = int(soup.find("span", {"class" : "totalcount"}).text)
totalCounts
def getPaginatorParameter(df, totalCounts):
    result = []
    temp = len(df)
    one_iter = len(df)
    while temp < totalCounts:
        result.append(temp)
        temp += one_iter
    return result
parameters = getPaginatorParameter(df, totalCounts)



for param in parameters:
    link = startingUrl + '&s=' + str(param)
    req = requests.get(link, HEADER)
    soup = BeautifulSoup(req.content, 'html.parser')
# find all rows
    resultRows = soup.find_all("li", {"class": "result-row"})
    tempData = []
    for result in resultRows:
        row = {}
        time = result.find("time", {"class": "result-date"})['datetime']
        header = result.find("h3", {"class": "result-heading"}).find("a")
        name = header.text
        href = header['href']
        price = result.find("span", {"class": "result-price"}).text
        bdr = result.find("span", {"class": "housing"})
        if bdr:
            bdr = bdr.text
            if search("br", bdr):
                bdr = int(bdr[bdr.find(' ')+len(' '):bdr.rfind('br')])
            else:
                bdr = 1
        else:
            bdr = 1
        # put everything together
        row['name'] = name
        row['href'] = href
        row['time'] = time
        row['price'] = price
        row['bedroom'] = bdr
        tempData.append(row)
    # Put everything together in a dataframe
    tempDf = pd.DataFrame(tempData)
    df = df.append(tempDf, ignore_index=True)

In [3]:
# Message the data
# Convert string to datetime
df['time'] = pd.to_datetime(df['time'])
# Currency
df['price'] = df['price'].apply(lambda x: x.replace('$','')).apply(lambda x: x.replace(',','')).astype(np.int64)

df.drop(df.loc[df['price']==0].index, inplace=True)
df = df.drop_duplicates(subset=['name','price','bedroom'], keep='first')
df

Unnamed: 0,name,href,time,price,bedroom
0,"2 Bedrooms, Townhouse in Vancouver Cambie, VGH...",https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 20:19:00,3200,2
1,3 bedrooms basement near UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 19:56:00,3300,3
2,UPSTAIRS UNIT FAMILY HOME CLOSE TO UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:41:00,3600,3
3,3 BED UPSTAIRS HOME CLOSE TO UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:40:00,3600,2
4,"One bedroom-Dunbar,Lord Byng, UBC, Point Grey",https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:10:00,2150,1
...,...,...,...,...,...
513,$2800 Dunbar / UBC area Ground Level 3 bedroom...,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 22:56:00,2800,3
514,Huge 2 Bedroom Corner Unit In Sth Granville,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-25 20:51:00,2575,2
515,Vancouver west Dunbar area 6 bedroom for rent,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 23:41:00,7800,6
516,"$3,500 / 3BR 2Bath - Upper Level House New Ren...",https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 11:21:00,3500,3


In [4]:
hrefs = df['href'].tolist()
location = []
for index in range(0,len(hrefs)):
    req = requests.get(hrefs[index], headers={'User-Agent': 'Custom'})
    
    sleep(random.randint(3, 7))
    print("currently on row {}".format(index + 1))
    if req.status_code == 200:
        soup = BeautifulSoup(req.content, 'html.parser')
        result = soup.find("div", {"class": "mapaddress"})
        # If listing does specifies map address
        if result is not None:
            location.append(result.text)
        # If listing does not specify map address
        else:
            location.append('location not found')
    else:
        print("unexpected status code {}".format(req.status_code))
        break



currently on row 1
currently on row 2
currently on row 3
currently on row 4
currently on row 5
currently on row 6
currently on row 7
currently on row 8
currently on row 9
currently on row 10
currently on row 11
currently on row 12
currently on row 13
currently on row 14
currently on row 15
currently on row 16
currently on row 17
currently on row 18
currently on row 19
currently on row 20
currently on row 21
currently on row 22
currently on row 23
currently on row 24
currently on row 25
currently on row 26
currently on row 27
currently on row 28
currently on row 29
currently on row 30
currently on row 31
currently on row 32
currently on row 33
currently on row 34
currently on row 35
currently on row 36
currently on row 37
currently on row 38
currently on row 39
currently on row 40
currently on row 41
currently on row 42
currently on row 43
currently on row 44
currently on row 45
currently on row 46
currently on row 47
currently on row 48
currently on row 49
currently on row 50
currently

['location not found',
 'location not found',
 'location not found',
 'location not found',
 'Carnarvon St near 21st',
 'Broadway near Collingwood',
 'location not found',
 '33rd E near Commercial',
 'Knox Rd near University Blvd',
 '5628 Birney Avenue',
 'location not found',
 'West 10th Avenue near Discovery',
 '5740 Toronto Road near Allison Road',
 '5632 kings rd',
 'Macdonald Street near 5th Avenue',
 'york avenue near balsam',
 'W15 near Wallace st',
 'location not found',
 '12th Avenue West near Balaclava',
 '3462 Ross Drive',
 '6005 Walter Gage Road',
 '5725 Agronomy Road',
 'location not found',
 'location not found',
 'vine near 41st w',
 '5728 Berton Ave',
 'location not found',
 'west 7th avenue near balsam',
 '20th avenue near cambie',
 '3563 Ross drive',
 'location not found',
 'location not found',
 'W Point Pl',
 'location not found',
 'location not found',
 'location not found',
 '5638 Birney Avenue',
 '4754 West 2nd Avenue',
 '5983 Gray Ave',
 'location not found',
 '

In [5]:
df['location'] = location
df.to_csv('data_frame.csv', index = False)


In [16]:
df = pd.read_csv('data_frame.csv')
# this part requires mapquest API key
mapquest_key = os.getenv('MAPQUEST_KEY')
locations = df['location'].tolist()
lattitudes = []
longitudes = []

for index, ele in enumerate(df['location']):
    print(index)
    if ele != 'location not found':
        maprequest_api_url = "http://open.mapquestapi.com/geocoding/v1/address?key={}&location={}".format(mapquest_key, locations[index] + ',BC,Canada')
        response = requests.get(maprequest_api_url)
        data = response.json()
        data = data['results'][0]['locations'][0]['latLng']
        lat = data['lat']
        lng = data['lng']
        lattitudes.append(lat)
        longitudes.append(lng)
    else:
        lattitudes.append(0)
        longitudes.append(0)

# maprequest_api_url = "http://open.mapquestapi.com/geocoding/v1/address?key={}&location={}".format(mapquest_key, ele + ",BC,Canada")
# maprequest_api_url
# response = requests.get(maprequest_api_url)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [19]:
# data = response.json()
# data = data['results'][0]['locations'][0]['latLng']
# data
# lat = data['lat']
# lng = data['lng']


# locations
df['longitudes'] = longitudes
df['lattitudes'] = lattitudes
df.to_csv('data_frame.csv', index = False)
# lat > 48
# lng < -120


In [52]:

df = pd.read_csv('data_frame.csv')
mongo_uri = 'mongodb+srv://JW:{}@cluster0.q4g0hww.mongodb.net/?retryWrites=true&w=majority'.format(os.getenv('MONGODB_USR_PASSWORD'))
client = MongoClient(mongo_uri)
mongo_uri



'mongodb+srv://JW:Stemcell2018@cluster0.q4g0hww.mongodb.net/?retryWrites=true&w=majority'

In [46]:
collection = client.RentPredictorDatabase.RentPredictorCollection
collection.drop()

CONDITION = (df['longitudes'] < - 110) & (df['lattitudes'] > 45)
df['validation_location'] = CONDITION

collection.insert_many(df.to_dict('records'))


<pymongo.results.InsertManyResult at 0x7fe5e4517f70>

In [62]:



mongo_uri = 'mongodb+srv://JW:{}@cluster0.q4g0hww.mongodb.net/?retryWrites=true&w=majority'.format(os.getenv('MONGODB_USR_PASSWORD'))
client = MongoClient(mongo_uri)
collection = client.RentPredictorDatabase.RentPredictorCollection
df = pd.DataFrame(list(collection.find({})))
df.drop('_id',axis = 1)

Unnamed: 0,name,href,time,price,bedroom,location,longitudes,lattitudes,validation_location
0,"2 Bedrooms, Townhouse in Vancouver Cambie, VGH...",https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 20:19:00,3200,2,location not found,0.000000,0.00000,False
1,3 bedrooms basement near UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 19:56:00,3300,3,location not found,0.000000,0.00000,False
2,UPSTAIRS UNIT FAMILY HOME CLOSE TO UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:41:00,3600,3,location not found,0.000000,0.00000,False
3,3 BED UPSTAIRS HOME CLOSE TO UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:40:00,3600,2,location not found,0.000000,0.00000,False
4,"One bedroom-Dunbar,Lord Byng, UBC, Point Grey",https://vancouver.craigslist.org/van/apa/d/van...,2022-07-07 18:10:00,2150,1,Carnarvon St near 21st,-100.445882,39.78373,False
...,...,...,...,...,...,...,...,...,...
456,$2800 Dunbar / UBC area Ground Level 3 bedroom...,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 22:56:00,2800,3,location not found,0.000000,0.00000,False
457,Huge 2 Bedroom Corner Unit In Sth Granville,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-25 20:51:00,2575,2,location not found,0.000000,0.00000,False
458,Vancouver west Dunbar area 6 bedroom for rent,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 23:41:00,7800,6,Dunbar near 41 Ave,-100.445882,39.78373,False
459,"$3,500 / 3BR 2Bath - Upper Level House New Ren...",https://vancouver.craigslist.org/van/apa/d/van...,2022-05-24 11:21:00,3500,3,location not found,0.000000,0.00000,False


In [65]:
load_dotenv('.env')
print(os.getenv('MONGODB_USR_PASSWORD'))

Stemcell2018
