# Scrape

In [63]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup
import lxml
import pandas as pd
import time
import urllib.request
import re

# Rent Cost Crawler

In [64]:
class CraigslistRENTScraper(object):
    """Scrape one page of Craigslist apartment ("apa") search results.

    Typical use: construct, ``load_craigslist_url()``, then
    ``extract_post_information()`` / ``extract_post_urls()``, then ``quit()``.

    Args:
        location: Craigslist sub-domain, e.g. ``"portland"``.
        bedrooms: value sent as the ``min_bedrooms`` query parameter.
        sqft: value sent as the ``minSqft`` query parameter.
        radius: value sent as the ``search_distance`` query parameter.
        pagenum: optional extra query fragment inserted before
            ``search_distance`` (presumably a paging offset such as
            ``"s=120"`` -- TODO confirm against the site). ``""`` adds nothing.
            Leading/trailing ``&`` are tolerated.
    """

    # Patterns applied to a result row's visible text. Note both keep the
    # leading whitespace (and the bedroom one a trailing whitespace) in the
    # matched value, e.g. " 1br " and " 1026ft2".
    _BEDROOM_PATTERN = r"\s\d*[b][r]\s"
    _SQFT_PATTERN = r"\s\d*[f][t][2]"

    def __init__(self, location, bedrooms, sqft, radius, pagenum):
        # What information I'm scraping
        self.location = location
        self.bedrooms = bedrooms
        self.sqft = sqft
        self.radius = radius
        self.pagenum = pagenum
        self.num_pages_to_scrape = 25  # not read anywhere in this class

        self.url = self._build_url(location, bedrooms, sqft, radius, pagenum)

        # Opens a Firefox window immediately; call quit() when finished.
        self.driver = webdriver.Firefox()
        # Timeout handed to WebDriverWait in load_craigslist_url().
        self.delay = 3

    @staticmethod
    def _build_url(location, bedrooms, sqft, radius, pagenum):
        """Return the search URL, joining every query fragment with ``&``."""
        query_parts = [
            "availabilityMode=0",
            f"minSqft={sqft}",
            f"min_bedrooms={bedrooms}",
        ]
        # Strip stray separators so "s=120", "s=120&" and "&s=120" all yield
        # a well-formed query instead of "...&s=120search_distance=...".
        page_fragment = "" if pagenum is None else str(pagenum).strip("&")
        if page_fragment:
            query_parts.append(page_fragment)
        query_parts.append(f"search_distance={radius}")
        return f"https://{location}.craigslist.org/search/apa?" + "&".join(query_parts)

    @staticmethod
    def _first_match(pattern, text):
        """Return the first regex match of ``pattern`` in ``text``, or ``""``."""
        found = re.search(pattern, text)
        return found.group(0) if found else ""

    @classmethod
    def _parse_post_text(cls, text):
        """Split one result row's text into its fields.

        Returns:
            ``(price, title, date, address, bedroom, sqft)`` as strings.
            ``bedroom`` / ``sqft`` are ``""`` when the row has no matching
            token, so one incomplete listing does not abort the scrape.
        """
        # The row text normally starts with "$<price>\n<Mon> <day> <title> $...";
        # take the segment right after the first "$" when the text starts with one.
        segments = text.split("$")
        if segments[0] == "" and len(segments) > 1:
            headline = segments[1]
        else:
            headline = segments[0]

        # The last " - "-separated chunk is treated as the location,
        # with surrounding parentheses removed.
        address = text.split(" - ")[-1].strip("()")

        bedroom = cls._first_match(cls._BEDROOM_PATTERN, text)
        sqft = cls._first_match(cls._SQFT_PATTERN, text)

        headline_lines = headline.split("\n")
        price = headline_lines[0]

        # First two space-separated words are the posting date ("Jul 24"),
        # the rest is the listing title.
        words = headline_lines[-1].split(" ")
        date = " ".join(words[:2])
        title = " ".join(words[2:])

        return price, title, date, address, bedroom, sqft

    # Launch URL and confirm it fully loaded
    def load_craigslist_url(self):
        """Open ``self.url`` in the browser and wait for the search form.

        Waits up to ``self.delay`` for an element with id ``searchform``;
        prints a status message either way and does not raise on timeout.
        """
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("Page is ready")
        except TimeoutException:
            print("Loading took to much time")

    # Build lists of information
    def extract_post_information(self):
        """Parse every ``result-row`` element on the loaded page.

        Returns:
            Six parallel lists: ``prices, titles, dates, addresss, bedrooms,
            sqfts`` (one entry per result row, in page order).
        """
        # find_elements(By..., ...) replaces the find_elements_by_* helpers,
        # which no longer exist in Selenium 4.
        all_posts = self.driver.find_elements(By.CLASS_NAME, "result-row")
        prices = []
        titles = []
        dates = []
        addresss = []
        bedrooms = []
        sqfts = []

        for post in all_posts:
            # Read .text once per row; each access goes through the driver.
            price, title, date, address, bedroom, sqft = self._parse_post_text(post.text)

            prices.append(price)
            titles.append(title)
            dates.append(date)
            addresss.append(address)
            bedrooms.append(bedroom)
            sqfts.append(sqft)

        return prices, titles, dates, addresss, bedrooms, sqfts

    # Collect post urls and build list
    def extract_post_urls(self):
        """Return the ``href`` of every listing-title link for ``self.url``.

        The page is fetched again with ``urllib`` rather than read from the
        Selenium session.
        NOTE(review): because this is a separate request, the links may not
        line up one-to-one with extract_post_information() -- confirm.
        """
        # Context manager closes the HTTP response once it has been parsed.
        with urllib.request.urlopen(self.url) as html_page:
            soup = BeautifulSoup(html_page, "lxml")
        return [
            link["href"]
            for link in soup.find_all("a", {"class": "result-title hdrlnk"})
        ]

    # Close FireFox Browser
    def quit(self):
        """End the WebDriver session and close the browser.

        Uses ``driver.quit()`` rather than ``driver.close()``: ``close()`` only
        closes the current window and leaves the driver session running.
        """
        self.driver.quit()
    
# Specifications for scrape  
    
# Search parameters passed to the scraper (all sent as query-string values).
location = "portland"
bedrooms = "1"
sqft = "1"
radius = "100"
pagenum = ""

scraper = CraigslistRENTScraper(location, bedrooms, sqft, radius, pagenum)
try:
    scraper.load_craigslist_url()
    # NOTE: this rebinds `bedrooms` from the search parameter above to the
    # list of scraped bedroom strings; later cells read it as that list.
    prices, titles, dates, addresss, bedrooms, sqfts = scraper.extract_post_information()
    url_list = scraper.extract_post_urls()
finally:
    # Always shut the browser down, even if loading or parsing raised,
    # so a failed run does not leave a Firefox process behind.
    scraper.quit()


Page is ready


In [65]:
# Assemble the parallel scrape lists into one table, one row per listing.
# Every list must have the same length or pandas raises ValueError.
rentdf = pd.DataFrame(
    dict(
        Price=prices,
        Date=dates,
        Bedrooms=bedrooms,
        SqFt=sqfts,
        Location=addresss,
        Post=titles,
        Url=url_list,
    )
)

In [70]:
# Keep only the leading run of digits from the scraped tokens
# (e.g. " 10br " -> "10", " 1026ft2" -> "1026"). Fixed-width slicing
# truncated multi-digit bedroom counts and kept the leading whitespace.
# Values stay strings; rows without any digits become NaN.
rentdf["rooms"] = rentdf["Bedrooms"].str.extract(r"(\d+)", expand=False)
rentdf["SQFT"] = rentdf["SqFt"].str.extract(r"(\d+)", expand=False)
# rentdf["Location"]

In [69]:
# Preview the first five scraped listings.
rentdf.head(n=5)

Unnamed: 0,Bedrooms,Date,Location,Post,Price,SqFt,Url,rooms,SQFT
0,1br,Jul 24,Sherwood,Quartz Counter Tops! Amazing Renovated 2BD 2BA...,1609,1026ft2,https://portland.craigslist.org/wsc/apa/d/quar...,1,1026
1,1br,Jul 24,Kerns,"Looking For A Place To Call Home In August, Li...",1399,504ft2,https://portland.craigslist.org/mlt/apa/d/look...,1,504
2,1br,Jul 24,clark/cowlitz,"Lease Today, Swimming Pool, Gated Community, P...",1354,747ft2,https://portland.craigslist.org/clk/apa/d/leas...,1,747
3,1br,Jul 24,East Gresham,"Rare, 1 Bedroom Loft Layout; w/Attached Garage...",1135,670ft2,https://portland.craigslist.org/mlt/apa/d/rare...,1,670
4,2br,Jul 24,Tualatin,This 2bed/2bath is Perfect for You! Come Take ...,1614,944ft2,https://portland.craigslist.org/clc/apa/d/this...,2,944


In [71]:
# rentdf["Location"].value_counts()

# Craigslist Real Estate Crawler

In [None]:
# class CraigslistHOUSEScraper(object):
#     def __init__(self, location, bedrooms, sqft, radius, pagenum):
# # What information I'm scraping
#         self.location = location
#         self.bedrooms = bedrooms
#         self.sqft = sqft
#         self.radius = radius
#         self.pagenum = pagenum
#         self.num_pages_to_scrape = 25
        
#         self.url = f"https://{location}.craigslist.org/search/rea?availabilityMode=0&minSqft={sqft}&min_bedrooms={bedrooms}&{pagenum}search_distance={radius}"

# #         for i in range(0, 24):
# #                 if i == 0:
# #                     self.url = f"https://{location}.craigslist.org/search/rea?availabilityMode=0&minSqft={sqft}&min_bedrooms={bedrooms}&search_distance={radius}"
# #                     print("First iteration")
# #                     time.sleep(30)
# #                 else:
# #                     self.url = f"https://{location}.craigslist.org/search/rea?availabilityMode=0&minSqft={sqft}&min_bedrooms={bedrooms}&s=" + str(i * 120) + "&search_distance={radius}"
# #                     print("s=" + str(i * 120))
# #                     time.sleep(2)
#         self.driver = webdriver.Firefox()
#         self.delay = 3


# # Launch URL and confirm it fully loaded    
#     def load_craigslist_url(self):
#         self.driver.get(self.url)
#         try:
#             wait = WebDriverWait(self.driver, self.delay)
#             wait.until(EC.presence_of_element_located((By.ID, "searchform")))
#             print("Page is ready")


#         except TimeoutException:
#             print("Loading took to much time")


# # Build lists of information   
#     def extract_post_information(self):
#         all_posts = self.driver.find_elements_by_class_name("result-row")
#         prices = []
#         titles = []
#         dates = []
#         addresss = []
#         bedrooms = []
#         sqfts = []

# # Clean and divide the post into important sections    
#         for post in all_posts:
# #             print(post.text)
#             title = post.text.split("$")

#             if title[0] == '':
#                 title = title[1]
#             else:
#                 title = title[0]

#             info = post.text.split(" - ")
#             address = info[-1]
#             address = address.strip("()")

#             bedroom = post.text
#             pattern = r"\s\d*[b][r]\s"
#             bedroom = (re.findall(pattern,bedroom)[0])

#             sqft = post.text
#             sqftpat = r"\s\d*[f][t][2]"
#             sqft = (re.findall(sqftpat,sqft)[0])

#             title = title.split("\n")
#             price = title[0]
#             title = title[-1]

#             title = title.split(" ")

#             month = title[0]
#             day = title[1]
#             title = ' '.join(title[2:])
#             date = month + " " + day
            
# # Add to Lists        
#             prices.append(price)
#             titles.append(title)
#             dates.append(date)
#             addresss.append(address)
#             bedrooms.append(bedroom)
#             sqfts.append(sqft)

#         return prices, titles, dates, addresss, bedrooms, sqfts



# #     ['', '375000\nJul 20 Mid-Century Modern! N. PDX! ', '375000 3br, - 1471ft2, - (N. Portland)']
# # Collect post urls and build list    
#     def extract_post_urls(self):
#         url_list = []
#         html_page = urllib.request.urlopen(self.url)
#         soup = BeautifulSoup(html_page, "lxml")
#         for link in soup.findAll("a", {"class": "result-title hdrlnk"}):
#             print(link["href"])
#             url_list.append(link["href"])
#         return url_list

        
# # Close FireFox Browser        
#     def quit(self):
#         self.driver.close()
    
# # Specifications for scrape  
    
# location = "portland"
# bedrooms = "1"
# sqft = "1"
# radius = "100"


# scraper = CraigslistScraper(location, bedrooms, sqft, radius, pagenum)
# scraper.load_craigslist_url()
# prices, titles, dates, addresss, bedrooms, sqfts = scraper.extract_post_information()
# scraper.extract_post_urls()
# scraper.quit()
