In [31]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from re import search
from time import sleep
import random
from dotenv import load_dotenv
import os
from pymongo import MongoClient



startingUrl = "https://vancouver.craigslist.org/search/apa?query=ubc&min_price=&max_price=&availabilityMode=0&sale_date=all+dates"
# Append '&s=START_NUMBER' to this query to page through the search results.

# Browser-like User-Agent to reduce the chance of tripping scraper defences.
# The Access-Control-* entries that used to be listed here are CORS *response*
# headers; they mean nothing on an outgoing request, so they are no longer sent.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
# The second positional argument of requests.get is `params` (the query string),
# so the headers have to be passed by keyword to be sent as HTTP headers.
req = requests.get(startingUrl, headers=HEADER)
# Fail here with a clear HTTP error instead of parsing an error page further down.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
# find all rows
resultRows = soup.find_all("li", {"class": "result-row"})

data = []
for result in resultRows:
    posted = result.find("time", {"class": "result-date"})['datetime']
    header = result.find("h3", {"class": "result-heading"}).find("a")
    price_tag = result.find("span", {"class": "result-price"})
    housing_tag = result.find("span", {"class": "housing"})
    # The housing text looks like "2br - 900ft2 -"; take the number in front of
    # "br". Listings without a housing tag or a bedroom count default to 1.
    bedroom_match = search(r"(\d+)\s*br", housing_tag.text) if housing_tag else None
    data.append({
        'name': header.text,
        'href': header['href'],
        'time': posted,
        # A listing without a price is recorded as '$0'; the cleaning cell
        # drops zero-priced rows.
        'price': price_tag.text if price_tag else '$0',
        'bedroom': int(bedroom_match.group(1)) if bedroom_match else 1,
    })
# Put everything together in a dataframe
df = pd.DataFrame(data)



In [32]:
# Pagination Handler
# Total number of listings matching the search, read from the page's counter.
total_tag = soup.find("span", {"class" : "totalcount"})
if total_tag is None:
    # Without a counter there is no way to know how many pages exist; fall back
    # to the rows already scraped so that no further pages are requested.
    print("total count not found on page; pagination is skipped")
    totalCounts = len(df)
else:
    totalCounts = int(total_tag.text)
def getPaginatorParameter(df, totalCounts):
    """Return the `s=` offsets of the result pages that follow the first one.

    The number of rows in `df` (the first page) is used as the page size, so the
    offsets are page_size, 2 * page_size, ... for every value below `totalCounts`.

    Parameters:
        df: the rows scraped from the first page; only `len(df)` is used.
        totalCounts: total number of results reported for the search.

    Returns:
        list of offsets; empty when the first page already covers everything
        or when the first page is empty.
    """
    page_size = len(df)
    # An empty first page would never advance the offset (endless loop), and
    # there is nothing to paginate over in that case anyway.
    if page_size <= 0:
        return []
    offsets = []
    offset = page_size
    while offset < totalCounts:
        offsets.append(offset)
        offset += page_size
    return offsets
parameters = getPaginatorParameter(df, totalCounts)

# Rows of all remaining pages are collected in one list and appended to `df` in a
# single concat: DataFrame.append no longer exists in pandas 2.0, and growing the
# frame page by page copies it on every iteration.
pagedData = []
for param in parameters:
    link = startingUrl + '&s=' + str(param)
    # The second positional argument of requests.get is `params` (the query
    # string), so the headers have to be passed by keyword.
    req = requests.get(link, headers=HEADER)
    if req.status_code != 200:
        # Stop paginating rather than parsing an error page as if it were results.
        print("unexpected status code {}".format(req.status_code))
        break
    soup = BeautifulSoup(req.content, 'html.parser')
    # find all rows
    resultRows = soup.find_all("li", {"class": "result-row"})
    for result in resultRows:
        posted = result.find("time", {"class": "result-date"})['datetime']
        header = result.find("h3", {"class": "result-heading"}).find("a")
        price_tag = result.find("span", {"class": "result-price"})
        housing_tag = result.find("span", {"class": "housing"})
        # The housing text looks like "2br - 900ft2 -"; take the number in front
        # of "br". Listings without a bedroom count default to 1.
        bedroom_match = search(r"(\d+)\s*br", housing_tag.text) if housing_tag else None
        pagedData.append({
            'name': header.text,
            'href': header['href'],
            'time': posted,
            # A listing without a price is recorded as '$0'; the cleaning cell
            # drops zero-priced rows.
            'price': price_tag.text if price_tag else '$0',
            'bedroom': int(bedroom_match.group(1)) if bedroom_match else 1,
        })
# Put everything together in a dataframe
if pagedData:
    df = pd.concat([df, pd.DataFrame(pagedData)], ignore_index=True)

In [33]:
# Massage the data
# Convert string to datetime
df['time'] = pd.to_datetime(df['time'])
# Currency: strip the '$' sign and thousands separators, then cast to integer.
# Casting to str first keeps this cell re-runnable after the column has already
# been converted to integers.
df['price'] = df['price'].astype(str).str.replace(r'[$,]', '', regex=True).astype(np.int64)

# Drop listings without a price, then listings repeated with the same
# name / price / bedroom count (the first occurrence is kept).
df = df[df['price'] != 0].drop_duplicates(subset=['name','price','bedroom'], keep='first')
df

Unnamed: 0,name,href,time,price,bedroom
1,Brand New Main Floor of House/10 mins from UBC...,https://vancouver.craigslist.org/van/apa/d/van...,2022-06-24 11:31:00,5195,3
2,The Laureates 112 - UBC Garden Suite,https://vancouver.craigslist.org/van/apa/d/van...,2022-06-24 11:17:00,3600,2
3,The Laureates 703 - UBC,https://vancouver.craigslist.org/van/apa/d/van...,2022-06-24 11:17:00,3600,2
4,"UBC Spacious 2-Bed, 2-Bath Condo for Rent!",https://vancouver.craigslist.org/van/apa/d/van...,2022-06-24 10:51:00,3700,2
5,2 Bedroom and 2 Bath Apartment Unit at UBC!!!,https://vancouver.craigslist.org/van/apa/d/van...,2022-06-24 10:51:00,3600,2
...,...,...,...,...,...
492,1 BR condo for rent,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-12 10:05:00,2250,1
493,Oakridge 2 bedroom 1bathrm spacious non-smokin...,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-11 11:42:00,2585,2
494,Upper Large 2 Bedroom Suite next To Downtown,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-11 11:32:00,2450,2
495,Two-Bedroom Apartment Utility included,https://vancouver.craigslist.org/van/apa/d/van...,2022-05-11 10:43:00,2700,2


In [34]:
# Visit every listing page and collect its map address (one entry per row of df).
hrefs = df['href'].tolist()
location = []
for index, href in enumerate(hrefs):
    req = requests.get(href, headers={'User-Agent': 'Custom'})

    # Random pause between requests.
    sleep(random.randint(3, 7))
    print("currently on row {}".format(index + 1))
    if req.status_code != 200:
        # Record a placeholder and carry on: `location` must end up with exactly
        # one entry per row, otherwise it cannot be assigned as a column of df.
        print("unexpected status code {}".format(req.status_code))
        location.append('location not found')
        continue
    soup = BeautifulSoup(req.content, 'html.parser')
    result = soup.find("div", {"class": "mapaddress"})
    # Use the map address when the listing specifies one, the placeholder otherwise.
    location.append(result.text if result is not None else 'location not found')
location



currently on row 1
currently on row 2
currently on row 3
currently on row 4


KeyboardInterrupt: 

In [None]:
# Attach the scraped addresses as a column and checkpoint the frame to disk.
df = df.assign(location=location)
df.to_csv('data_frame.csv', index=False)


In [None]:
# df = pd.read_csv('data_frame.csv')
# this part requires mapquest API key
# Load .env first so the key is available even when it is not exported in the shell.
load_dotenv('.env')
mapquest_key = os.getenv('MAPQUEST_KEY')
if not mapquest_key:
    raise RuntimeError("MAPQUEST_KEY is not set; add it to .env or the environment")

MAPQUEST_GEOCODE_URL = "http://open.mapquestapi.com/geocoding/v1/address"
locations = df['location'].tolist()
lattitudes = []
longitudes = []

for index, ele in enumerate(locations):
    print(index)
    # (0, 0) marks rows without a usable address or without a geocoding result.
    lat, lng = 0, 0
    if ele != 'location not found':
        # Passing the values through `params` URL-encodes the address (spaces,
        # '#', '&', ...) instead of splicing it raw into the query string.
        response = requests.get(
            MAPQUEST_GEOCODE_URL,
            params={'key': mapquest_key, 'location': ele + ',BC,Canada'},
        )
        try:
            latLng = response.json()['results'][0]['locations'][0]['latLng']
            lat, lng = latLng['lat'], latLng['lng']
        except (ValueError, KeyError, IndexError):
            # Non-JSON body or an empty result list: keep the (0, 0) placeholder
            # so both lists stay aligned with the rows of df.
            print("no geocoding result for row {}".format(index))
    lattitudes.append(lat)
    longitudes.append(lng)

# maprequest_api_url = "http://open.mapquestapi.com/geocoding/v1/address?key={}&location={}".format(mapquest_key, ele + ",BC,Canada")
# maprequest_api_url
# response = requests.get(maprequest_api_url)

In [84]:
# data = response.json()
# data = data['results'][0]['locations'][0]['latLng']
# data
# lat = data['lat']
# lng = data['lng']


# locations
# Attach the geocoded coordinates (longitudes first, then lattitudes) and
# write the frame back to the CSV checkpoint.
df = df.assign(longitudes=longitudes, lattitudes=lattitudes)
df.to_csv('data_frame.csv', index=False)
# lat > 48
# lng < -120


In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from re import search
from time import sleep
import random
from dotenv import load_dotenv
import os
from pymongo import MongoClient
import json
import uuid
import sys

from urllib.parse import quote_plus

load_dotenv('.env')
df = pd.read_csv('data_frame.csv')

mongo_password = os.getenv('MONGODB_USR_PASSWORD')
if not mongo_password:
    # Without this check the literal string 'None' would be sent as the password.
    raise RuntimeError("MONGODB_USR_PASSWORD is not set; add it to .env or the environment")

MONGO_URI_TEMPLATE = 'mongodb+srv://JW:{}@cluster0.q4g0hww.mongodb.net/?retryWrites=true&w=majority'
# Reserved characters in the password (':', '/', '@', ...) must be percent-escaped
# for the connection string to parse correctly.
mongo_uri = MONGO_URI_TEMPLATE.format(quote_plus(mongo_password))
client = MongoClient(mongo_uri)
# Display the URI with the password masked: cell outputs are saved with the
# notebook, so echoing `mongo_uri` itself would leak the credential.
MONGO_URI_TEMPLATE.format('<redacted>')


'mongodb+srv://JW:<redacted>@cluster0.q4g0hww.mongodb.net/?retryWrites=true&w=majority'

In [3]:
# Resolve the target collection, then upload every row of df as one document.
rent_database = client["RentPredictorDatabase"]
collection = rent_database["RentPredictorCollection"]
listing_records = df.to_dict(orient='records')
collection.insert_many(listing_records)


<pymongo.results.InsertManyResult at 0x7f92c22b5e80>

0      2022-06-23 20:08:00
1      2022-06-23 20:06:00
2      2022-06-23 19:00:00
3      2022-06-23 16:56:00
4      2022-06-23 14:53:00
              ...         
439    2022-05-10 21:21:00
440    2022-05-09 01:18:00
441    2022-05-09 07:44:00
442    2022-05-09 10:53:00
443    2022-05-09 08:54:00
Name: time, Length: 444, dtype: object