In [None]:
# San Diego
# Albuquerque

# Colorado
# Philadelphia
# Indianapolis
# Las Vegas

# Washington
# Miami
# New York
# San Francisco

### Imports

In [66]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd 
import urllib.parse 

### HTTP Request

#### store website in variable

In [28]:
# Trulia results page for San Diego, CA (the un-paginated, first page)
website = 'https://www.trulia.com/CA/San_Diego/' 

#### Get Request

In [29]:
# requests has no default timeout: without one a stalled connection blocks the
# kernel indefinitely. With it, a stall raises requests.exceptions.Timeout instead.
response = requests.get(website, timeout=30)

#### Status Code

In [30]:
# HTTP status code of the request above (200 = OK)
response.status_code

200

### Soup Object 

In [31]:
# parse the raw response bytes with Python's built-in 'html.parser' backend
soup = BeautifulSoup(response.content, 'html.parser')

In [25]:
# display the whole parsed document to inspect its structure (very long output)
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><script>
            window.__uspapi = function(command, version, callback) {
              try {
                if (command === 'getUSPData') {
                  var cookies = document.cookie.split(';');
                  for (var i = 0; i < cookies.length; i++) {
                    var cookie = cookies[i];
                    var separatorIndex = cookie.indexOf('=');
                    separatorIndex = separatorIndex < 0 ? cookie.length : separatorIndex;
                    var cookie_name = decodeURIComponent(cookie.slice(0, separatorIndex).replace(/^\s+/, ''));
                    if (cookie_name === 'usprivacy') {
                      var uspString = decodeURIComponent(cookie.slice(separatorIndex + 1));
                      callback({ version: version, uspString: uspString }, true);
                      return;
                    }
                  }
                }
              } catch (ex) {
                

### Results

In [36]:
# select every <li> element that carries this CSS class
# NOTE(review): the class name looks auto-generated and may change when the site is rebuilt — verify before re-running
result_container = soup.find_all(
    name='li',
    attrs={'class': 'SearchResultsList__WideCell-b7y9ki-2'},
)

In [37]:
# number of <li> cells matched on the page
len(result_container)

42

### Update Results

We only want to keep the elements that have the attribute 'data-testid'.

In [38]:
# will hold only the result cells that carry a 'data-testid' attribute
results_update = []

In [39]:
# Keep only the result cells that have a 'data-testid' attribute.
# The list is rebuilt from scratch (not appended to), so re-running this cell
# cannot accumulate duplicate entries.
results_update = [r for r in result_container if r.has_attr('data-testid')]

In [40]:
# number of cells left after keeping only those with 'data-testid'
len(results_update)

40

### Concatenate 2 URL Parts to get absolute URL

#### URL Part 1

In [41]:
# we combine url part 1 (the site root) with url part 2 (each listing's href) in order to get the absolute url

url_part_1 = 'https://www.trulia.com'

#### Create List for URL Part 2

In [42]:
# href of every property card found inside the filtered result cells
url_part_2 = []

for item in results_update:

    for card in item.find_all('div', {'data-testid':'property-card-details'}):
        anchor = card.find('a')
        # find() returns None when the card has no <a>; skip such cards instead of
        # crashing on None.get(...). An empty/missing href is skipped too, because
        # urljoin would turn it into the bare site root rather than a listing URL.
        href = anchor.get('href') if anchor is not None else None
        if href:
            url_part_2.append(href)

In [47]:
# number of listing hrefs collected
len(url_part_2)

40

#### Join Url 1 and Url 2

In [48]:
# absolute listing URLs: resolve each collected href against the site root
url_joined = [
    urllib.parse.urljoin(url_part_1, relative_href)
    for relative_href in url_part_2
]

In [49]:
# display the absolute listing URLs
url_joined

['https://www.trulia.com/p/ca/san-diego/10925-polaris-dr-san-diego-ca-92126--2079929837',
 'https://www.trulia.com/p/ca/san-diego/4014-texas-st-san-diego-ca-92104--1013559461',
 'https://www.trulia.com/p/ca/la-jolla/5850-camino-de-la-costa-la-jolla-ca-92037--2079478670',
 'https://www.trulia.com/p/ca/san-diego/1652-oro-vista-rd-255-san-diego-ca-92154--2080031909',
 'https://www.trulia.com/p/ca/san-diego/6880-panamint-row-4-san-diego-ca-92139--2080026071',
 'https://www.trulia.com/p/ca/san-diego/3964-lago-di-grata-cir-san-diego-ca-92130--1063448382',
 'https://www.trulia.com/p/ca/san-diego/6765-broadway-san-diego-ca-92114--2079820208',
 'https://www.trulia.com/p/ca/san-diego/851-euclid-ave-san-diego-ca-92114--2079824167',
 'https://www.trulia.com/p/ca/san-diego/10702-ancona-ln-san-diego-ca-92131--2080009142',
 'https://www.trulia.com/p/ca/san-diego/4844-coronado-ave-san-diego-ca-92107--2079745915',
 'https://www.trulia.com/p/ca/san-diego/6256-osler-st-san-diego-ca-92111--1008770379',
 '

### Get Data from First Link 

In [86]:
# Address
# bedrooms
# bathrooms
# sqft
# year built
# parking
# price

#### Store first link in variable

In [50]:
# use the first listing URL to work out the per-listing extraction
# (raises IndexError if url_joined is empty)
first_link = url_joined[0]

#### Get Request & Soup Object

In [51]:
# requests has no default timeout: without one a stalled connection blocks the
# kernel indefinitely. With it, a stall raises requests.exceptions.Timeout instead.
response = requests.get(first_link, timeout=30)

In [52]:
# parse the listing page; note this rebinds `soup`, replacing the search-results document
soup = BeautifulSoup(response.content, 'html.parser')

In [53]:
# display the whole parsed listing page to inspect its structure (very long output)
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><script>
            window.__uspapi = function(command, version, callback) {
              try {
                if (command === 'getUSPData') {
                  var cookies = document.cookie.split(';');
                  for (var i = 0; i < cookies.length; i++) {
                    var cookie = cookies[i];
                    var separatorIndex = cookie.indexOf('=');
                    separatorIndex = separatorIndex < 0 ? cookie.length : separatorIndex;
                    var cookie_name = decodeURIComponent(cookie.slice(0, separatorIndex).replace(/^\s+/, ''));
                    if (cookie_name === 'usprivacy') {
                      var uspString = decodeURIComponent(cookie.slice(separatorIndex + 1));
                      callback({ version: version, uspString: uspString }, true);
                      return;
                    }
                  }
                }
              } catch (ex) {
                

#### Address

In [55]:
# text of the headline <span>; find() returns None if the element is absent,
# in which case .get_text() raises AttributeError
soup.find('span', {'data-testid':'home-details-summary-headline'}).get_text()

'10925 Polaris Dr'

#### Bedrooms

In [57]:
# text of the <li data-testid="bed"> element (AttributeError if it is absent)
soup.find('li', {'data-testid':'bed'}).get_text()

'3 Beds'

#### Bathrooms

In [58]:
# text of the <li data-testid="bath"> element (AttributeError if it is absent)
soup.find('li', {'data-testid':'bath'}).get_text()

'2 Baths'

#### Sqft

In [59]:
# text of the <li data-testid="floor"> element (AttributeError if it is absent)
soup.find('li', {'data-testid':'floor'}).get_text()

'1,594 sqft'

#### year built

In [60]:
# locate the <div> whose string is exactly 'Year Built', then read the next <div> in the document;
# find_next is the current name of the deprecated findNext alias
soup.find('div', string='Year Built').find_next('div').get_text()

'1974'

#### parking

In [61]:
# locate the <div> whose string is exactly 'Parking', then read the next <div> in the document;
# find_next is the current name of the deprecated findNext alias
soup.find('div', string='Parking').find_next('div').get_text()

'Garage'

#### price

In [63]:
# text of the <h3 data-testid="on-market-price-details"> element (AttributeError if it is absent)
soup.find('h3', {'data-testid':'on-market-price-details'}).get_text()

'$974,200'

### Put all together and loop through all results in page 1

In [64]:
def _text_or_blank(tag):
    """Return ``tag.get_text()``, or '' when ``tag`` is None (element not found)."""
    return tag.get_text() if tag is not None else ''


def _div_after_label(page, label):
    """Return the first <div> after the <div> whose string equals ``label``; None if no such label."""
    label_div = page.find('div', string=label)
    return label_div.find_next('div') if label_div is not None else None


# empty lists (index-aligned: position k in every list belongs to the k-th link)
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []


# loop through all joined links
for link in url_joined:
    # timeout: raise instead of hanging forever on a stalled connection
    response = requests.get(link, timeout=30)

    # create soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # Every field falls back to '' when its element is missing, so all lists keep
    # the same length. Explicit None checks replace the former bare `except:`,
    # which also swallowed KeyboardInterrupt and made the loop hard to stop.

    # address
    address.append(_text_or_blank(soup.find('span', {'data-testid':'home-details-summary-headline'})))

    # bedrooms
    bedrooms.append(_text_or_blank(soup.find('li', {'data-testid':'bed'})))

    # bathrooms
    bathrooms.append(_text_or_blank(soup.find('li', {'data-testid':'bath'})))

    # area
    area.append(_text_or_blank(soup.find('li', {'data-testid':'floor'})))

    # year_built
    year_built.append(_text_or_blank(_div_after_label(soup, 'Year Built')))

    # parking
    parking.append(_text_or_blank(_div_after_label(soup, 'Parking')))

    # price
    price.append(_text_or_blank(soup.find('h3', {'data-testid':'on-market-price-details'})))


# create a dictionary with results: built once, after the loop, so it is defined
# even when url_joined is empty
output = {'Address': address, 'Bedrooms':bedrooms, 'Bathrooms': bathrooms, 'Area':area,
         'Year Built': year_built, 'Parking':parking, 'Price':price}

In [65]:
# show output (dict mapping each column name to its list of scraped strings)
output

{'Address': ['10925 Polaris Dr',
  '4014 Texas St',
  '5850 Camino De La Costa',
  '1652 Oro Vista Rd #255',
  '6880 Panamint Row #4',
  '3964 Lago Di Grata Cir',
  '6765 Broadway',
  '851 Euclid Ave',
  '10702 Ancona Ln',
  '4844 Coronado Ave',
  '6256 Osler St',
  '308 Vista De La Playa',
  '5170 Clairemont Mesa Blvd #51-18',
  '11476 Azucena Dr',
  '5340 Calle Rockfish #20',
  '2854 Amulet St',
  '8617 Via Mallorca #A',
  '2751 Saint Laurent Pl',
  '5017 Reynolds St',
  '15742 Potomac Ridge Rd',
  '1080 Park Blvd #402',
  '4608 Monongahela St',
  '159 W Hall Ave',
  '425 W Beech St #602',
  '3163 Easy St',
  '8866 Capcano Rd',
  '5937 Dirac St',
  '1236 Grand Ave',
  '8787 Covina St',
  '4545 Collwood Blvd #15',
  '1261 Saturn Blvd',
  '1864 Sunset Blvd',
  '742 Jewell Dr',
  '7266 Oakham Way',
  '6215 Streamview Dr',
  '1320 Muirlands Dr',
  '4254 Maryland St',
  '342 Playa Del Sur',
  '3215 44th St #1',
  '5167 Edgeware Rd'],
 'Bedrooms': ['3 Beds',
  '1 Bed',
  '4 Beds',
  '3 Bed

In [69]:
# one column per key of `output`; all lists must have the same length or pandas raises ValueError
df = pd.DataFrame(output)
df 

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,10925 Polaris Dr,3 Beds,2 Baths,"1,594 sqft",1974,Garage,"$974,200"
1,4014 Texas St,1 Bed,1 Bath,384 sqft,1980,No Info,"$299,999"
2,5850 Camino De La Costa,4 Beds,5 Baths,"5,139 sqft",2016,Garage,"$13,595,000"
3,1652 Oro Vista Rd #255,3 Beds,2 Baths,"1,222 sqft",1980,No Info,"$460,200"
4,6880 Panamint Row #4,3 Beds,2 Baths,"1,183 sqft",1980,1 Carport Spaces,"$499,999"
5,3964 Lago Di Grata Cir,4 Beds,3 Baths,"3,331 sqft",2000,Garage,"$1,949,000"
6,6765 Broadway,4 Beds,3 Baths,"1,949 sqft",1958,1 Car Garage,"$650,000"
7,851 Euclid Ave,3 Beds,2 Baths,"1,120 sqft",1921,Open Parking,"$584,900"
8,10702 Ancona Ln,4 Beds,3 Baths,"2,529 sqft",1988,3 Car Garage,"$1,395,000"
9,4844 Coronado Ave,2 Beds,2 Baths,894 sqft,2008,Garage,"$899,000"


## Multiple Pages - San Diego

In [73]:
def _text_or_blank(tag):
    """Return ``tag.get_text()``, or '' when ``tag`` is None (element not found)."""
    return tag.get_text() if tag is not None else ''


def _div_after_label(page, label):
    """Return the first <div> after the <div> whose string equals ``label``; None if no such label."""
    label_div = page.find('div', string=label)
    return label_div.find_next('div') if label_div is not None else None


# empty lists (index-aligned: position k in every list belongs to the k-th listing scraped)

address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []

# url part 1
url_part_1 = 'https://www.trulia.com'

# seconds to wait on a request before raising instead of hanging forever
request_timeout = 30

# result pages 1..25
for i in range(1,26):

    # website
    website = 'https://www.trulia.com/CA/San_Diego/' +str(i) +'_p/'

    # request
    response = requests.get(website, timeout=request_timeout)

    # soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # result container
    # NOTE(review): this class name looks auto-generated and may change when the site is rebuilt — verify
    result_container = soup.find_all('li', {'class':'SearchResultsList__WideCell-b7y9ki-2'})

    # only results with attribute "data-testid"
    results_update = [r for r in result_container if r.has_attr('data-testid')]

    # relative url
    relative_url = []

    # loop through results
    for item in results_update:

        for card in item.find_all('div', {'data-testid':'property-card-details'}):
            anchor = card.find('a')
            # skip cards with no <a> or no href: None.get(...) would crash the whole
            # scrape, and urljoin would turn an empty href into the bare site root
            href = anchor.get('href') if anchor is not None else None
            if href:
                relative_url.append(href)

    # absolute urls for this page
    url_joined = [urllib.parse.urljoin(url_part_1, link_2) for link_2 in relative_url]

    # loop through all joined links
    for link in url_joined:
        response = requests.get(link, timeout=request_timeout)

        # create soup object
        soup = BeautifulSoup(response.content, 'html.parser')

        # Every field falls back to '' when its element is missing, so all lists keep
        # the same length. Explicit None checks replace the former bare `except:`,
        # which also swallowed KeyboardInterrupt and made a long scrape hard to stop.

        # address
        address.append(_text_or_blank(soup.find('span', {'data-testid':'home-details-summary-headline'})))

        # bedrooms
        bedrooms.append(_text_or_blank(soup.find('li', {'data-testid':'bed'})))

        # bathrooms
        bathrooms.append(_text_or_blank(soup.find('li', {'data-testid':'bath'})))

        # area
        area.append(_text_or_blank(soup.find('li', {'data-testid':'floor'})))

        # year_built
        year_built.append(_text_or_blank(_div_after_label(soup, 'Year Built')))

        # parking
        parking.append(_text_or_blank(_div_after_label(soup, 'Parking')))

        # price
        price.append(_text_or_blank(soup.find('h3', {'data-testid':'on-market-price-details'})))


# create a dictionary with results: built once, after all pages, so it is defined
# even when no listing link was found
output = {'Address': address, 'Bedrooms':bedrooms, 'Bathrooms': bathrooms, 'Area':area,
         'Year Built': year_built, 'Parking':parking, 'Price':price}


In [74]:
# one row per scraped listing across all pages; rebinds `df` from the page-1 frame
df = pd.DataFrame(output)
df 

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,10925 Polaris Dr,3 Beds,2 Baths,"1,594 sqft",1974,Garage,"$974,200"
1,4014 Texas St,1 Bed,1 Bath,384 sqft,1980,No Info,"$299,999"
2,851 Euclid Ave,3 Beds,2 Baths,"1,120 sqft",1921,Open Parking,"$584,900"
3,6880 Panamint Row #4,3 Beds,2 Baths,"1,183 sqft",1980,1 Carport Spaces,"$499,999"
4,4844 Coronado Ave,2 Beds,2 Baths,894 sqft,2008,Garage,"$899,000"
...,...,...,...,...,...,...,...
865,700 W East St #605,2 Beds,2 Baths,985 sqft,2007,Garage,"$665,000"
866,5641 Chelsea Ave,3 Beds,2 Baths,"1,280 sqft",1944,Garage,"$2,795,000"
867,2293 Judson St,3 Beds,2 Baths,"1,000 sqft",1960,1 Car Garage,"$711,000"
868,6356 Camino Largo,4 Beds,3 Baths,"3,112 sqft",2004,2 Car Garage,"$1,450,000"


In [75]:
# add a constant column so every row carries the city label 'San Diego'
df['Location'] = 'San Diego'

In [76]:
# display the frame, now including the Location column
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,10925 Polaris Dr,3 Beds,2 Baths,"1,594 sqft",1974,Garage,"$974,200",San Diego
1,4014 Texas St,1 Bed,1 Bath,384 sqft,1980,No Info,"$299,999",San Diego
2,851 Euclid Ave,3 Beds,2 Baths,"1,120 sqft",1921,Open Parking,"$584,900",San Diego
3,6880 Panamint Row #4,3 Beds,2 Baths,"1,183 sqft",1980,1 Carport Spaces,"$499,999",San Diego
4,4844 Coronado Ave,2 Beds,2 Baths,894 sqft,2008,Garage,"$899,000",San Diego
...,...,...,...,...,...,...,...,...
865,700 W East St #605,2 Beds,2 Baths,985 sqft,2007,Garage,"$665,000",San Diego
866,5641 Chelsea Ave,3 Beds,2 Baths,"1,280 sqft",1944,Garage,"$2,795,000",San Diego
867,2293 Judson St,3 Beds,2 Baths,"1,000 sqft",1960,1 Car Garage,"$711,000",San Diego
868,6356 Camino Largo,4 Beds,3 Baths,"3,112 sqft",2004,2 Car Garage,"$1,450,000",San Diego


In [77]:
# write the frame to an .xlsx file in the working directory, without the index column
# (pandas needs an Excel writer engine such as openpyxl installed for this)
df.to_excel('dataframe_san_diego.xlsx', index=False)