In [None]:
#Tutorial example: scraping Craigslist listings with Selenium,
#using this page: "https://sfbay.craigslist.org/search/sss"

#First, install Selenium with: "pip install selenium".
#Also install BeautifulSoup with: "pip install bs4"
#(extract_post_urls asks BeautifulSoup for the "lxml" parser, so "pip install lxml" is needed too).
#Then download the Selenium driver for Chrome (ChromeDriver);
#you can follow this page: "https://sites.google.com/a/chromium.org/chromedriver/"


In [37]:
#import the required modules
import urllib.request
from urllib.parse import urlencode

#use BeautifulSoup for easier html extraction
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class CraigslistScraper(object):
    """Scrape "for sale" search results from a regional Craigslist site.

    The constructor builds the search URL from the given filters and starts a
    Chrome browser through Selenium. The other methods load the page and pull
    titles, prices, dates and post links out of the results.
    """

    def __init__(self,location, postal, min_price, max_price, radius):
        """Store the search filters, build the search URL and start Chrome.

        Args:
            location: Craigslist regional subdomain (e.g. "sfbay"); kept as a
                parameter so the URL works for any region.
            postal: value sent as the ``postal`` query parameter.
            min_price: value sent as the ``min_price`` query parameter.
            max_price: value sent as the ``max_price`` query parameter.
            radius: value sent as the ``search_distance`` query parameter.
        """
        self.location = location
        self.postal = postal
        self.min_price = min_price
        self.max_price = max_price
        self.radius = radius
        # urlencode escapes characters such as spaces or "&" in the filter
        # values; for plain values the result matches simple interpolation.
        query = urlencode({
            "search_distance": radius,
            "postal": postal,
            "min_price": min_price,
            "max_price": max_price,
        })
        self.url = f"https://{location}.craigslist.org/search/sss?{query}"
        self.driver = webdriver.Chrome()
        # timeout handed to WebDriverWait in load_craigslist_url
        self.delay = 3

    def load_craigslist_url(self):
        """Open the search URL in the browser and wait for the search form.

        Prints "Page is ready" once the element with id "searchform" is
        present, or "Loading took too much time" when the wait times out.
        """
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            # the page counts as loaded once the search form is in the DOM
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("Page is ready")
        except TimeoutException:
            print("Loading took too much time")

    @staticmethod
    def _parse_post_text(text):
        """Split the visible text of one result row into (title, price, date).

        The text is cut at "$"; when it starts with "$" the piece after it is
        used, otherwise the piece before the first "$". Within that piece the
        first line is taken as the price and the last line is read as
        "<month> <day> <title...>". Missing parts come back as empty strings
        instead of raising IndexError.
        """
        segments = text.split("$")
        if segments[0] == '' and len(segments) > 1:
            chunk = segments[1]
        else:
            # NOTE(review): with no leading "$" the "price" is simply the first
            # line of the row text - confirm against the real page layout.
            chunk = segments[0]

        lines = chunk.split("\n")
        price = lines[0]

        words = lines[-1].split(" ")
        # the first two words form the date, the rest is the title
        date = " ".join(words[:2])
        title = " ".join(words[2:])
        return title, price, date

    def extract_post_information(self):
        """Collect title, price and date for every result row on the page.

        Returns:
            Three parallel lists ``(titles, prices, dates)`` with one entry
            per element carrying the "result-row" class.
        """
        # find_elements_by_class_name was removed in Selenium 4.3; the
        # By-based locator is the supported replacement.
        all_posts = self.driver.find_elements(By.CLASS_NAME, "result-row")

        titles = []
        prices = []
        dates = []
        for post in all_posts:
            title, price, date = self._parse_post_text(post.text)
            titles.append(title)
            prices.append(price)
            dates.append(date)

        return titles, prices, dates

    def extract_post_urls(self):
        """Download the search page with urllib and return every post link.

        This issues its own request for ``self.url`` rather than reading the
        page held by the Selenium browser. Each link is printed as it is found.

        Returns:
            List of ``href`` values from the "result-title hdrlnk" anchors.
        """
        # the context manager closes the HTTP response even if parsing fails
        with urllib.request.urlopen(self.url) as html_page:
            # "lxml" selects the lxml parser
            soup = BeautifulSoup(html_page, "lxml")

        url_list = []
        for link in soup.find_all("a", {"class": "result-title hdrlnk"}):
            print(link["href"])
            url_list.append(link["href"])
        return url_list

    def quit(self):
        """Shut down the browser and end the WebDriver session."""
        # close() would only close the current window and leave the driver
        # session running; quit() ends the whole session.
        self.driver.quit()


In [38]:
#run example: build a scraper for the sfbay site with the filters below,
#load the results page, then list the link of every post
location, postal = "sfbay", "94201"
min_price, max_price = "0", "500"
radius = "5"
scraper = CraigslistScraper(
    location=location,
    postal=postal,
    min_price=min_price,
    max_price=max_price,
    radius=radius,
)
scraper.load_craigslist_url()
#alternative: all_posts=scraper.extract_post_information()
scraper.extract_post_urls()

Page is ready
https://sfbay.craigslist.org/sfc/bik/d/san-francisco-womens-cannondale-synapse/7066814592.html
https://sfbay.craigslist.org/pen/spo/d/pacifica-80s-49ers-jacket/7066814601.html
https://sfbay.craigslist.org/sfc/ele/d/san-francisco-samsung-32-tv/7061892434.html
https://sfbay.craigslist.org/sfc/msg/d/san-francisco-orangewood-guitar/7066814496.html
https://sfbay.craigslist.org/sby/pts/d/escalon-1966-chevelle-ss-factory-orig/7066814489.html
https://sfbay.craigslist.org/eby/for/d/concord-duraflame-heater/7066814453.html
https://sfbay.craigslist.org/sfc/for/d/san-francisco-mining-rig/7066814441.html
https://sfbay.craigslist.org/sby/ele/d/san-jose-fitbit-versa-smart-watch/7050932605.html
https://sfbay.craigslist.org/sby/sop/d/san-jose-geforce-rtx-2080/7066814380.html
https://sfbay.craigslist.org/eby/pho/d/oakland-canon-5d-mkii-battery-grip/7062459292.html
https://sfbay.craigslist.org/sfc/grq/d/want-free-shipping-we-have-it-rubber/7066816818.html
https://sfbay.craigslist.org/sby/tl

['https://sfbay.craigslist.org/sfc/bik/d/san-francisco-womens-cannondale-synapse/7066814592.html',
 'https://sfbay.craigslist.org/pen/spo/d/pacifica-80s-49ers-jacket/7066814601.html',
 'https://sfbay.craigslist.org/sfc/ele/d/san-francisco-samsung-32-tv/7061892434.html',
 'https://sfbay.craigslist.org/sfc/msg/d/san-francisco-orangewood-guitar/7066814496.html',
 'https://sfbay.craigslist.org/sby/pts/d/escalon-1966-chevelle-ss-factory-orig/7066814489.html',
 'https://sfbay.craigslist.org/eby/for/d/concord-duraflame-heater/7066814453.html',
 'https://sfbay.craigslist.org/sfc/for/d/san-francisco-mining-rig/7066814441.html',
 'https://sfbay.craigslist.org/sby/ele/d/san-jose-fitbit-versa-smart-watch/7050932605.html',
 'https://sfbay.craigslist.org/sby/sop/d/san-jose-geforce-rtx-2080/7066814380.html',
 'https://sfbay.craigslist.org/eby/pho/d/oakland-canon-5d-mkii-battery-grip/7062459292.html',
 'https://sfbay.craigslist.org/sfc/grq/d/want-free-shipping-we-have-it-rubber/7066816818.html',
 'htt