In [9]:
from requests import get 
from bs4 import BeautifulSoup
from time import sleep
import re
from random import randint 
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np
import pandas as pd
from datetime import datetime

# request the contents of the page we're scraping
results = get('https://philadelphia.craigslist.org/d/apartments-housing-for-rent/search/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1&laundry=1&max_price=2050&min_bedrooms=2')

# make the content we grabbed easy to read
html_soup = BeautifulSoup(results.text, 'html.parser')

# get the macro-container containing the posts we want
post = html_soup.find_all('li', class_= 'result-row')


# find the total number of posts to find the limit of the pagination
find_total = html_soup.find('div', class_= 'search-legend')

# grab the total count of posts 
total_posts = int(find_total.find('span', class_='totalcount').text) 

# vary the value of the page parameters
pages = np.arange(0, total_posts+1, 120)

# count tracker for number of iterations
iterations = 0

# initialize empty lists where we'll store our date 
post_times = []
post_neighborhoods = []
post_titles = []
post_bedrooms = []
post_sqft = []
post_links = []
post_prices = []


# create for loop
for page in pages:
    
    # get request
    response = get("https://philadelphia.craigslist.org/d/apartments-housing-for-rent/search/apa?" 
                   + "s=" # parameter for defining the page number 
                   + str(page) # page number in the pages array 
                   + "&availabilityMode=0"
                   +"&bundleDuplicates=1"
                   + "&hasPic=1"
                   + "&laundry=1"
                   +"&max_price=2050"
                   +"&min_bedrooms=2"
                  )
    
    # control the crawl rate 
    sleep(randint(1,10))
    
    # throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    # define the html text
    html = BeautifulSoup(response.text, 'html.parser')
    
    # define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
   
   # extract data item-wise
    for post in posts:

        # if we aren't missing the neighborhood information
        if post.find('span', class_ = 'result-hood') is not None:

            # date
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_times.append(post_datetime)

            # neighborhoods
            post_hoods = post.find('span', class_= 'result-hood').text
            post_neighborhoods.append(post_hoods)

            # title 
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_titles.append(post_title_text)

            # link
            post_link = post_title['href']
            post_links.append(post_link)
            
            # removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = int(float(post.a.text.strip().replace("$", "").replace(",","")))
            post_prices.append(post_price)
            # if the number of bedrooms OR sqft aren't missing 
            if post.find('span', class_ = 'housing') is not None:
                
                # if the first element is accidentally square footage
                if 'ft' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    # make bedroom NaN
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                    
                    # make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    post_sqft.append(sqft)
                    
                # if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    post_sqft.append(sqft)
                    
                # if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    post_sqft.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                
                    sqft = np.nan
                    post_sqft.append(sqft)
            # if none of those conditions catch, make bedroom NaN 
            else:
                bedroom_count = np.nan
                post_bedrooms.append(bedroom_count)
                
                sqft = np.nan
                post_sqft.append(sqft)
                
    iterations += 1
      
    print("Page " + str(iterations) + " scraped successfully!")

    
print("\n")
print("Scrape complete!")

philly_apts = pd.DataFrame({'posted': post_times,
                       'neighborhood': post_neighborhoods,
                       'post title': post_titles,
                       'number bedrooms': post_bedrooms,
                        'sqft': post_sqft,
                        'URL': post_links,
                       'price': post_prices})

print(philly_apts.info())
philly_apts

# to move all the scraped data to a CSV file
philly_apts.to_csv('philly_apts.csv', index=False)

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!


Scrape complete!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2193 entries, 0 to 2192
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   posted           2193 non-null   object 
 1   neighborhood     2193 non-null   object 
 2   post title       2193 non-null   object 
 3   number bedrooms  2193 non-null   object 
 4   sqft             1666 non-null   float64
 5   URL              2193 non-null   object 
 6   p

In [10]:
philly_apts

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2021-12-08 20:54,(East Kensington ),Newly Built! Fabulous 2BD with Central A/C & S...,2,,https://philadelphia.craigslist.org/apa/d/phil...,1595
1,2021-12-08 20:48,(Art Museum ),Pet Friendly! Terrific 2BD with HW Flooring - ...,2,988.0,https://philadelphia.craigslist.org/apa/d/phil...,2015
2,2021-12-08 20:30,(Brewerytown ),Large 2BD in Brewerytown with Private Entrance...,2,1240.0,https://philadelphia.craigslist.org/apa/d/phil...,1995
3,2021-12-08 20:21,(West Philly ),Very cute 2bdrm 1bth apt at 50th and Baltimore,2,,https://philadelphia.craigslist.org/apa/d/phil...,1600
4,2021-12-08 20:14,(Temple Girard Art Museum Francisville Brewer...,*Spacious Bi-Level Apartment Available Updated...,2,950.0,https://philadelphia.craigslist.org/apa/d/phil...,1475
...,...,...,...,...,...,...,...
2188,2021-12-08 07:36,(West Oak Lane ),UPDATED 3 BEDROOMS AND 1.5 BATHS HOME LOCATED ...,3,,https://philadelphia.craigslist.org/apa/d/phil...,1600
2189,2021-12-08 07:36,(TEMPLE UNIVERSITY ),NEWLY RENOVATED 2 BD / 1.5 BA MINUTES AWAY FRO...,2,950.0,https://philadelphia.craigslist.org/apa/d/phil...,1000
2190,2021-12-08 07:36,(OGONTZ ),UPDATED 2 BEDROOMS / 1 BATH APARTMENT AVAILABL...,2,800.0,https://philadelphia.craigslist.org/apa/d/phil...,1100
2191,2021-12-08 07:36,(OGONTZ ),UPDATED 2 BEDROOMS / 1 BATH APARTMENT AVAILABL...,2,800.0,https://philadelphia.craigslist.org/apa/d/phil...,1100


In [12]:
philly_apts['post title'].duplicated().sum()

2082