In [2]:
import requests
import pandas as pd
import time
import datetime
import random
from tqdm import tqdm
import sys, os
# Make the project root importable and switch the working directory to it;
# the CSV export at the end of the notebook writes to the relative path 'data/crawled/'.
# NOTE(review): this is an absolute, machine-specific path — os.chdir() raises
# FileNotFoundError on any machine where this directory does not exist.
this_path = '/home/ibi/Documents/GitHub/mas291-project/'
sys.path.append(this_path)
os.chdir(this_path)
print(os.getcwd())

/home/ibi/Documents/GitHub/mas291-project


In [3]:
def dynamic_delay(minimum=1.0, maximum=3.0):
    """Pick a random pause length, in the same unit as the bounds.

    Args:
        minimum: Lower bound passed to ``random.uniform``.
        maximum: Upper bound passed to ``random.uniform``.

    Returns:
        A float drawn by ``random.uniform(minimum, maximum)``.
    """
    pause_length = random.uniform(minimum, maximum)
    return pause_length

def exponential_backoff_retry(request_func, max_retries=6):
    """Call ``request_func`` until it yields an OK response, backing off between tries.

    Args:
        request_func: Zero-argument callable that performs the request and returns
            an object with a truthy ``ok`` attribute on success.
        max_retries: Maximum number of attempts.

    Returns:
        The first response whose ``ok`` is truthy, or ``None`` when every attempt
        failed (non-OK response or raised exception).
    """
    base_wait = 2
    for attempt in range(max_retries):
        try:
            response = request_func()
            if response.ok:
                # Short random pause after a success, before handing the response back.
                time.sleep(dynamic_delay(1, 2))
                return response
            # Non-OK responses used to be retried silently; report them like exceptions.
            status_code = getattr(response, 'status_code', 'unknown')
            print(f"Request returned status {status_code}. Attempt {attempt + 1}/{max_retries}.")
        except Exception as e:
            print(f"Request failed with exception {e}. Attempt {attempt + 1}/{max_retries}.")

        # Back off only when another attempt follows; waiting after the final
        # failure just delays returning None to the caller.
        if attempt < max_retries - 1:
            time.sleep(base_wait * (2 ** attempt) + dynamic_delay())

    return None

In [4]:
def scrape_properties(city_slug, city_name, status=["for_sale", "ready_to_build"]):
    """Download single-family listings for one city from the realtor.com search API.

    When ``"sold"`` is in ``status``, the sold history between 2000-01-01 and now
    is walked backwards in date windows, paging through each window, until the
    overall total reported by the API has been collected or the earliest date is
    reached. Otherwise every listing matching ``status`` is paged through in a
    single pass.

    Args:
        city_slug: Value sent as the ``geoSupportedSlug`` variable (e.g. "Chicago_IL").
        city_name: Value sent as the search location (e.g. "Chicago, IL").
        status: List of listing statuses to query. The default list is only read,
            never mutated.

    Returns:
        dict with ``"total_properties_checked"`` (number of property records
        collected) and ``"properties"`` (the raw property dicts from the API).
        If a request ultimately fails, whatever was collected so far is returned.
    """
    url = 'https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-for-sale-search&schema=vesta'
    headers = {"content-type": "application/json"}
    limit = 200
    all_properties = []
    total_checked = 0
    earliest_date = datetime.datetime.strptime('2000-01-01', "%Y-%m-%d")
    current_date = today = datetime.datetime.now()
    global_total_properties = 0
    # Ordering variables differ between the two modes.
    sold_ordering = {"sort": [{"field": "sold_date", "direction": "desc"}]}
    for_sale_ordering = {"sort_type": "relevant"}

    def send_request(body):
        """POST one GraphQL request body to the search endpoint."""
        return requests.post(url, headers=headers, json=body)
    
    graphql_query = """
        query ConsumerSearchQuery(
            $query: HomeSearchCriteria!
            $limit: Int
            $offset: Int
            $search_promotion: SearchPromotionInput
            $sort: [SearchAPISort]
            $sort_type: SearchSortType
            $client_data: JSON
            $bucket: SearchAPIBucket
            ) {
            home_search: home_search(
                query: $query
                sort: $sort
                limit: $limit
                offset: $offset
                sort_type: $sort_type
                client_data: $client_data
                bucket: $bucket
                search_promotion: $search_promotion
            ) {
                count
                total
                search_promotion {
                names
                slots
                promoted_properties {
                    id
                    from_other_page
                }
                }
                mortgage_params {
                interest_rate
                }
                properties: results {
                property_id
                list_price
                search_promotions {
                    name
                    asset_id
                }
                primary_photo(https: true) {
                    href
                }
                rent_to_own {
                    right_to_purchase
                    rent
                }
                listing_id
                matterport
                virtual_tours {
                    href
                    type
                }
                status
                products {
                    products
                    brand_name
                }
                source {
                    id
                    type
                    spec_id
                    plan_id
                    agents {
                    office_name
                    }
                }
                lead_attributes {
                    show_contact_an_agent
                    opcity_lead_attributes {
                    cashback_enabled
                    flip_the_market_enabled
                    }
                    lead_type
                    ready_connect_mortgage {
                    show_contact_a_lender
                    show_veterans_united
                    }
                }
                community {
                    description {
                    name
                    }
                    property_id
                    permalink
                    advertisers {
                    office {
                        hours
                        phones {
                        type
                        number
                        primary
                        trackable
                        }
                    }
                    }
                    promotions {
                    description
                    href
                    headline
                    }
                }
                permalink
                price_reduced_amount
                description {
                    name
                    beds
                    baths_consolidated
                    sqft
                    lot_sqft
                    baths_max
                    baths_min
                    beds_min
                    beds_max
                    sqft_min
                    sqft_max
                    type
                    sub_type
                    sold_price
                    sold_date
                }
                location {
                    street_view_url
                    address {
                    line
                    postal_code
                    state
                    state_code
                    city
                    coordinate {
                        lat
                        lon
                    }
                    }
                    county {
                    name
                    fips_code
                    }
                }
                open_houses {
                    start_date
                    end_date
                }
                branding {
                    type
                    name
                    photo
                }
                flags {
                    is_coming_soon
                    is_new_listing(days: 14)
                    is_price_reduced(days: 30)
                    is_foreclosure
                    is_new_construction
                    is_pending
                    is_contingent
                }
                list_date
                photos(limit: 2, https: true) {
                    href
                }
                advertisers {
                    type
                    builder {
                    name
                    href
                    logo
                    }
                }
                }
            }
            commute_polygon: get_commute_polygon(query: $query) {
                areas {
                id
                breakpoints {
                    width
                    height
                    zoom
                }
                radius
                center {
                    lat
                    lng
                }
                }
                boundary
            }
            }
    """

    def build_body(query, offset, ordering):
        """Assemble the request payload for one page of ``query``."""
        variables = {
            "geoSupportedSlug": city_slug,
            "query": query,
            "client_data": {"device_data": {"device_type": "desktop"}},
            "limit": limit,
            "offset": offset,
        }
        variables.update(ordering)
        variables["search_promotion"] = {"names": ["CITY"], "slots": [], "promoted_properties": []}
        return {
            "query": graphql_query,
            "variables": variables,
            "isClient": True,
            "visitor_id": "7ffa9c49-550f-4c23-aa1b-e93786671450"
        }

    def sold_query(window_start, window_end):
        """Build the search criteria for properties sold inside a date window."""
        return {
            "sold_date": {"min": window_start.strftime("%Y-%m-%d"), "max": window_end.strftime("%Y-%m-%d")},
            "status": status,
            "search_location": {"location": city_name},
            "type": ["single_family"]
        }

    def fetch_home_search(body):
        """Send ``body`` with retries and return the ``home_search`` dict, or None on any failure.

        The response JSON is parsed exactly once; a missing response, a non-JSON
        body and a missing/null ``data.home_search`` are all reported and mapped
        to None so callers need a single check.
        """
        response = exponential_backoff_retry(lambda: send_request(body))
        if response is None:
            print("Failed to fetch data after retries.")
            return None
        try:
            payload = response.json()
        except ValueError:
            print("Error: Response body is not valid JSON.")
            return None
        home_search = None
        if isinstance(payload, dict):
            home_search = (payload.get('data') or {}).get('home_search')
        if home_search is None:
            print("Error: Invalid response structure or missing data.")
            print(payload)
            return None
        return home_search

    if "sold" in status:
        # Ask once for the overall number of sold properties; it is the stop target.
        overall = fetch_home_search(build_body(sold_query(earliest_date, today), 0, sold_ordering))
        if overall is not None:
            global_total_properties = overall.get('total') or 0
            print(f"Total sold properties to track for {city_name}: {global_total_properties}")

        while current_date > earliest_date and total_checked < global_total_properties:
            start_date = current_date - datetime.timedelta(days=365)
            last_total = None
            api_total = None

            # Size the date window: shrink it by 30 days while the API reports more
            # than 10000 matches, otherwise extend it one day at a time for as long
            # as the reported total keeps growing.
            # NOTE(review): 10000 is presumably the API's paging ceiling — TODO confirm.
            while True:
                window = fetch_home_search(build_body(sold_query(start_date, current_date), 0, sold_ordering))
                if window is None:
                    # Previously this fell through to the paging code with api_total
                    # unassigned (UnboundLocalError on the first window).
                    api_total = None
                    break

                api_total = window.get('total') or 0
                print(f"Total properties in {city_name} sold between {start_date.strftime('%Y-%m-%d')} and {current_date.strftime('%Y-%m-%d')}: {api_total}")

                if api_total > 10000:
                    start_date += datetime.timedelta(days=30)
                    time.sleep(dynamic_delay(5, 10))
                elif last_total is None or api_total > last_total:
                    last_total = api_total
                    start_date -= datetime.timedelta(days=1)
                    time.sleep(dynamic_delay(5, 10))
                else:
                    break

            if api_total is None:
                print("Could not determine the size of the current date window. Stopping the scraping process.")
                break

            offset = 0
            with tqdm(total=api_total, desc=f"Fetching sold properties from {start_date.strftime('%Y-%m-%d')} to {current_date.strftime('%Y-%m-%d')}", unit="property") as pbar:
                while offset < api_total:
                    page = fetch_home_search(build_body(sold_query(start_date, current_date), offset, sold_ordering))
                    if page is None:
                        # Give up on this window only; the outer loop moves on to the next one.
                        break
                    current_batch = page.get('properties') or []
                    all_properties.extend(current_batch)
                    total_checked += len(current_batch)
                    pbar.update(len(current_batch))
                    offset += limit
                    time.sleep(dynamic_delay(2, 3))

                    if total_checked >= global_total_properties:
                        print(f"Reached total tracked properties count: {global_total_properties}. Stopping the scraping process.")
                        break

            # Continue with the window that ends the day before this one started.
            current_date = start_date - datetime.timedelta(days=1)
            time.sleep(dynamic_delay(5, 10))
            if total_checked >= global_total_properties:
                break

    else:
        query = {
            "status": status,
            "search_location": {
                "location": city_name
            },
            "type": ["single_family"]
        }

        # The first page doubles as the source of the total, so the progress bar
        # can be created with the right size and also count that first page.
        offset = 0
        home_search = fetch_home_search(build_body(query, offset, for_sale_ordering))
        if home_search is not None:
            api_total = home_search.get('total') or 0
            print(f"Total properties available for sale in {city_name}: {api_total}")

            with tqdm(total=api_total, desc="Fetching selling properties", unit="property") as pbar:
                while home_search is not None:
                    current_batch = home_search.get('properties') or []
                    all_properties.extend(current_batch)
                    total_checked += len(current_batch)
                    pbar.update(len(current_batch))
                    offset += limit
                    time.sleep(dynamic_delay(2, 3))

                    if offset >= api_total:
                        break
                    # A failed page ends the loop; fetch_home_search has already reported it.
                    home_search = fetch_home_search(build_body(query, offset, for_sale_ordering))

    return {
        "total_properties_checked": total_checked,
        "properties": all_properties
    }

# Run the four scrapes one after another: for each city first the listings with the
# default statuses (for_sale / ready_to_build), then the sold history.
# A random pause is inserted between consecutive scrapes (50-60 s after a for-sale
# run, 300-600 s after the Chicago sold run).
chicago_data = scrape_properties("Chicago_IL", "Chicago, IL")
print(f"Done fetching {chicago_data['total_properties_checked']} properties selling in Chicago.")
time.sleep(dynamic_delay(50, 60))

chicago_data_sold = scrape_properties("Chicago_IL", "Chicago, IL", ["sold"])
print(f"Done fetching {chicago_data_sold['total_properties_checked']} sold properties in Chicago.")
time.sleep(dynamic_delay(300, 600))

new_york_data = scrape_properties("New-York_NY", "New York, NY")
print(f"Done fetching {new_york_data['total_properties_checked']} properties selling in New York.")
time.sleep(dynamic_delay(50, 60))

new_york_data_sold = scrape_properties("New-York_NY", "New York, NY", ["sold"])
print(f"Done fetching {new_york_data_sold['total_properties_checked']} sold properties in New York.")

Total properties available for sale in New York, NY: 4799


Fetching selling properties:   0%|          | 0/4799 [00:02<?, ?property/s]


KeyboardInterrupt: 

In [None]:
# Raw property dicts from the Chicago default-status scrape, loaded into a
# DataFrame for a quick look at the first rows.
chicago_json = chicago_data['properties']
chicago_selling = pd.DataFrame(chicago_json)
chicago_selling.head()

In [None]:
# Raw property dicts from the Chicago "sold" scrape, loaded into a DataFrame
# for a quick look at the first rows.
chicago_sold_json = chicago_data_sold['properties']
chicago_sold = pd.DataFrame(chicago_sold_json)
chicago_sold.head()

In [None]:
# Raw property dicts from the New York default-status scrape, loaded into a
# DataFrame for a quick look at the first rows.
new_york_json = new_york_data['properties']
new_york_selling = pd.DataFrame(new_york_json)
new_york_selling.head()

In [None]:
# Raw property dicts from the New York "sold" scrape, loaded into a DataFrame
# for a quick look at the first rows.
new_york_sold_json = new_york_data_sold['properties']
new_york_sold = pd.DataFrame(new_york_sold_json)
new_york_sold.head()

In [None]:
def extract_data(properties):
    """Flatten raw listing dicts into one flat record per property.

    Args:
        properties: Iterable of property dicts as collected by the scraper.
            Nested objects (``location``, ``address``, ``description``,
            ``coordinate``) may be missing or explicitly ``None``.

    Returns:
        list of dicts with the keys 'Data Source', 'ID', 'Post link', 'List date',
        'Sold date', 'Days until sold', 'Price', 'Address', 'Status', 'Area',
        'Bedrooms', 'Bathrooms', 'Latitude' and 'Longitude'. For listings whose
        status is not SOLD, 'Sold date' and 'Days until sold' hold the string
        "Not sold yet".

    Raises:
        ValueError: If a sold listing has a list/sold date whose date part is
            not in ``YYYY-MM-DD`` form.
    """
    def date_part(value):
        """Return the text before a 'T' time separator, or None for empty input."""
        return value.split('T')[0] if value else None

    extracted_data = []

    for listing in properties:
        # `.get(key, {})` only covers a missing key; a key present with a null
        # value would return None and break the chained lookups, hence `or {}`.
        location = listing.get('location') or {}
        address_info = location.get('address') or {}
        description = listing.get('description') or {}
        coordinate = address_info.get('coordinate') or {}

        property_id = listing.get('property_id', None)
        permalink = listing.get('permalink', None)
        post_link = "https://www.realtor.com/realestateandhomes-detail/" + permalink if permalink else None
        price = listing.get('list_price', None)

        list_date = date_part(listing.get('list_date', None))

        address_line = address_info.get('line', None)
        city = address_info.get('city', None)
        state_code = address_info.get('state_code', None)
        postal_code = address_info.get('postal_code', None)
        # Only build the one-line address when every component is present.
        address = f"{address_line}, {city}, {state_code} {postal_code}" if all([address_line, city, state_code, postal_code]) else None

        status = listing.get('status', None)
        status = status.upper() if status else None

        if status == "SOLD":
            # Drop any time component so the date-only format below always applies.
            sold_date = date_part(description.get('sold_date', None))
            if list_date and sold_date:
                listed_on = datetime.datetime.strptime(list_date, "%Y-%m-%d")
                sold_on = datetime.datetime.strptime(sold_date, "%Y-%m-%d")
                days_until_sold = (sold_on - listed_on).days
                # Re-format so both dates come out zero-padded.
                sold_date = sold_on.strftime("%Y-%m-%d")
                list_date = listed_on.strftime("%Y-%m-%d")
            else:
                days_until_sold = None
        else:
            sold_date = "Not sold yet"
            days_until_sold = "Not sold yet"

        extracted_data.append({
            'Data Source': 'https://www.realtor.com/',
            'ID': property_id,
            'Post link': post_link,
            'List date': list_date,
            'Sold date': sold_date,
            'Days until sold': days_until_sold,
            'Price': price,
            'Address': address,
            'Status': status,
            'Area': description.get('sqft', None),
            'Bedrooms': description.get('beds', None),
            'Bathrooms': description.get('baths_consolidated', None),
            'Latitude': coordinate.get('lat', None),
            'Longitude': coordinate.get('lon', None)
        })

    return extracted_data

# Flatten the four raw scrapes in a fixed order:
# Chicago selling, Chicago sold, New York selling, New York sold.
batch = [chicago_json, chicago_sold_json, new_york_json, new_york_sold_json]
extracted_batches = []

for i, properties in enumerate(batch):
    # Position i in extracted_batches matches position i in batch.
    extracted_batches.append(extract_data(properties))

In [None]:
# Flattened Chicago selling records (first entry of `batch`) as a DataFrame.
chicago_selling_extracted = pd.DataFrame(extracted_batches[0])
chicago_selling_extracted.head()

In [None]:
# Flattened Chicago sold records (second entry of `batch`) as a DataFrame.
chicago_sold_extracted = pd.DataFrame(extracted_batches[1])
chicago_sold_extracted.head()

In [None]:
# Flattened New York selling records (third entry of `batch`) as a DataFrame.
new_york_selling_extracted = pd.DataFrame(extracted_batches[2])
new_york_selling_extracted.head()

In [None]:
# Flattened New York sold records (fourth entry of `batch`) as a DataFrame.
new_york_sold_extracted = pd.DataFrame(extracted_batches[3])
new_york_sold_extracted.head()

In [None]:
# Write the four flattened tables to CSV (paths are relative to the working directory).
# Create the target folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
os.makedirs('data/crawled', exist_ok=True)
chicago_selling_extracted.to_csv('data/crawled/chicago_realtor_selling.csv', index=False)
chicago_sold_extracted.to_csv('data/crawled/chicago_realtor_sold.csv', index=False)
new_york_selling_extracted.to_csv('data/crawled/new_york_realtor_selling.csv', index=False)
new_york_sold_extracted.to_csv('data/crawled/new_york_realtor_sold.csv', index=False)