# Imports

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

# Website URL list construction

In [2]:
## 'target_url' is the homepage of the target website.
## 'url_prefix' is the search URL that each result offset from 'pages'
## gets appended to below.

target_url = 'https://sfbay.craigslist.org'
url_prefix = 'https://sfbay.craigslist.org/d/musical-instruments/search/msa?s='

## Result offsets as strings: '120', '240', ..., '3000' (25 values in
## steps of 120).
## NOTE(review): offset '0' is not part of this list -- confirm that
## leaving it out is intended.
pages = [str(offset) for offset in range(120, 3001, 120)]

## To check that the URL list compiler is working on just 3 pages,
## use this instead:
# pages = ['120', '240', '360']

## Every entry of 'url_list' is 'url_prefix' followed by one offset string
## from 'pages', so consecutive URLs are one step of 120 results apart.
url_list = [url_prefix + offset for offset in pages]

In [3]:
## QC check: show 'url_list' as this cell's output so the generated URLs
## can be inspected.

url_list

# Scraping for-loop

* This is what I'm calling a "dynamic" scraping function.  It's dynamic in the sense that it collects and defines the html as objects in real time.  
* Another method would be what I'm calling "static" scraping, where the entire HTML of each URL from the 'url in url_list' for-loop is first stored in a list outside of the function.  The scraping is then performed on that static object.
* Choose **ONE** approach: Dynamic or Static

## The "dynamic" method

In [6]:
##                             ****NOTE****
## The two lists 'df_list' and 'each_html_output' need to be empty.
## This cell binds both names to new empty lists; to be safe, restart the
## kernel before running this cell.

df_list, each_html_output = [], []

def attribute_scraping(starting_url, urls=None):
    """Scrape five attributes from every Craigslist result row on each page.

    Each URL is downloaded with ``requests.get`` and parsed with
    ``BeautifulSoup(page, 'html.parser')``.  Every ``<li class="result-row">``
    element found then contributes exactly one entry to each of the five
    output lists, so the lists always have the same length and stay aligned
    row-for-row (which ``pd.DataFrame`` requires for a dict of lists).

    To scrape an additional attribute:

        * add an empty list
        * add an extraction step inside the per-result loop
        * add the list to the dictionary at the end of this function
        * add it to the set of print statements at the end of this function

    Parameters
    ----------
    starting_url : str
        Not used by the scraping logic.  Kept so that existing calls such
        as ``attribute_scraping(target_url)`` keep working.
    urls : iterable of str, optional
        The result-page URLs to scrape.  When omitted, the module-level
        ``url_list`` is used.

    Returns
    -------
    dict
        Maps each key to a list holding one entry per result row:

        * ``'price'``     -- text of the row's first
          ``<span class="result-price">``, or ``None`` if the row has none.
        * ``'pics'``      -- the string ``'True'`` if the row contains a
          ``<span class="pictag">``, otherwise the string ``"False"``
          (strings, not booleans).
        * ``'hood'``      -- text of the row's ``<span class="result-hood">``,
          or the string ``"NONE"`` if the row has none.
        * ``'title'``     -- text of the row's
          ``<a class="result-title hdrlnk">``, or ``None`` if the row has none.
        * ``'datetimes'`` -- value of the ``datetime`` attribute of the row's
          first element with class ``result-date``, or ``None`` if there is
          no such element or it lacks that attribute.
    """
    if urls is None:
        urls = url_list

    has_pics_bool = []
    price = []
    just_titles = []
    HOOD_list = []
    just_posted_datetimes = []

    for url in urls:
        # NOTE(review): the HTTP status code is not checked here; a failed
        # or blocked request whose body has no 'result-row' items would
        # contribute nothing without any error -- confirm this is acceptable.
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'html.parser')

        # 'li' is the html list tag; 'class_' selects on the class attribute.
        results = soup.find_all('li', class_='result-row')

        for res in results:
            # PRICE: a row without a price span yields None rather than
            # raising AttributeError on '.text'.
            price_tag = res.find('span', class_='result-price')
            price.append(price_tag.text if price_tag is not None else None)

            # PICS: whether the post carries at least one picture tag.
            # Stored as the strings "False" / 'True'.
            if res.find('span', class_='pictag') is None:
                has_pics_bool.append("False")
            else:
                has_pics_bool.append('True')

            # NEIGHBORHOOD: the hood text of the row, or "NONE" when the
            # row has no hood span.
            hood_tag = res.find('span', class_="result-hood")
            HOOD_list.append("NONE" if hood_tag is None else hood_tag.text)

            # TITLE: looked up inside the row (not page-wide) so that a row
            # without a title cannot shift later titles onto other rows.
            title_tag = res.find('a', class_="result-title hdrlnk")
            just_titles.append(title_tag.text if title_tag is not None else None)

            # DATETIME: looked up inside the row for the same reason.
            date_tag = res.find(class_='result-date')
            if date_tag is not None and date_tag.has_attr('datetime'):
                just_posted_datetimes.append(date_tag['datetime'])
            else:
                just_posted_datetimes.append(None)

    # Compilation dictionary of the for-loop results
    comp_dict = {'price': price,
                 'pics': has_pics_bool,
                 'hood': HOOD_list,
                 'title': just_titles,
                 'datetimes': just_posted_datetimes}

    # QC: list lengths.  These must run before the return statement to have
    # any effect.
    print(len(price))
    print(len(has_pics_bool))
    print(len(HOOD_list))
    print(len(just_titles))
    print(len(just_posted_datetimes))

    return comp_dict

Run the function and check the output dictionary.

In [5]:
## Run the scraper and keep the dictionary it returns.  'target_url' is
## handed over by keyword as the function's 'starting_url' argument.
base_dict = attribute_scraping(starting_url=target_url)

## Show the dictionary as this cell's output.
base_dict

Construct dataframe using dictionary

In [6]:
## Build a DataFrame from 'base_dict'; each dictionary key becomes a column.
df_base = pd.DataFrame(data=base_dict)

## Show the frame as this cell's output.
df_base

Unnamed: 0,price,pics,hood,title,datetimes
0,$65,False,(hayward / castro valley),Amperex bugle boy mullard 5ar4 gz34,2021-01-19 18:45
1,$700,False,(hayward / castro valley),American standard fender precision bass wanted,2021-01-19 18:45
2,$75,True,(hayward / castro valley),Jbl 2118h 8” woofer/midrange,2021-01-19 18:45
3,$10,True,(hayward / castro valley),Crossovers 3 way,2021-01-19 18:45
4,$775,True,(hayward / castro valley),Fractal Axefx ultra. (Mint),2021-01-19 18:44
...,...,...,...,...,...
2875,$40,True,(marina / cow hollow),Vox VX12 Celestion G12P-80 16 ohm Guitar Speak...,2021-01-17 09:00
2876,$900,True,(oakland piedmont / montclair),King 2b+ Trombone,2021-01-17 09:00
2877,$250,True,(willow glen / cambrian),Line 6 Pod HD500 Multi-Effects Guitar Pedal,2021-01-17 08:59
2878,"$1,800",True,(novato),1921 Chickering Baby Grand Piano,2021-01-17 08:57


Sort the results by the 'datetimes' column to order them by posting time.

In [7]:
## Display a copy of 'df_base' ordered by the 'datetimes' column.  The
## result is not assigned to a name, so 'df_base' itself is left as it was.
df_base.sort_values(by='datetimes')

Unnamed: 0,price,pics,hood,title,datetimes
2879,$50,True,(san jose east),Selmar Bundy flute with case vintage,2021-01-17 08:53
2878,"$1,800",True,(novato),1921 Chickering Baby Grand Piano,2021-01-17 08:57
2877,$250,True,(willow glen / cambrian),Line 6 Pod HD500 Multi-Effects Guitar Pedal,2021-01-17 08:59
2876,$900,True,(oakland piedmont / montclair),King 2b+ Trombone,2021-01-17 09:00
2875,$40,True,(marina / cow hollow),Vox VX12 Celestion G12P-80 16 ohm Guitar Speak...,2021-01-17 09:00
...,...,...,...,...,...
4,$775,True,(hayward / castro valley),Fractal Axefx ultra. (Mint),2021-01-19 18:44
3,$10,True,(hayward / castro valley),Crossovers 3 way,2021-01-19 18:45
2,$75,True,(hayward / castro valley),Jbl 2118h 8” woofer/midrange,2021-01-19 18:45
1,$700,False,(hayward / castro valley),American standard fender precision bass wanted,2021-01-19 18:45


Convert to csv for import into regression notebook

In [13]:
## Write 'df_base' to a CSV file, leaving out the index column.
## NOTE(review): this is an absolute, machine-specific path; it will need
## adjusting before the cell can run on another machine.
df_base.to_csv(
    path_or_buf='/Users/johnmetzger/Desktop/Coding/Project2/base_scrape.csv',
    index=False,
)