In [98]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json

In [184]:
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
target_urls = [
    "https://www.redfin.com/zipcode/94065/filter/include=sold-1yr",
    "https://www.redfin.com/zipcode/94065/filter/include=sold-1yr/page-2",
    "https://www.redfin.com/zipcode/94065/filter/include=sold-1yr/page-3"
]

# Variable to store concatenated results
all_results = ""

# Loop through each URL, fetch content, and concatenate results
for url in target_urls:
    try:
        response = requests.get(url, headers=head, verify=False)
        if response.status_code == 200:
            all_results += response.text  # Append the response content
        else:
            print(f"Failed to fetch {url}, Status Code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

# Print the concatenated result length as an example
print(f"Concatenated result length: {len(all_results)}")



Concatenated result length: 3023250


In [100]:
soup=BeautifulSoup(all_results,'html.parser')
allBoxes = soup.find_all("div",{"class":"HomeCardContainer"})



In [None]:
# # Extract text or string content from allBoxes
# all_text = " ".join(str(box) for box in soup)
# file_path = "redfin_raw.txt"
# with open(file_path, 'w') as file:
#     file.write(all_text)

In [183]:
# Extract text or string content from allBoxes
all_text = " ".join(str(box) for box in soup)
# Find all occurrences of the target base URL
matches = re.findall(r"https://www\.redfin\.com/CA/Redwood-City/[^\s]+", all_text)

# Count unique occurrences
unique_count = len(matches)

print(f"Unique count: {unique_count}")

Unique count: 0


In [181]:
l=[]
for box in allBoxes:
    o = {}
    try:
        # Extract price
        price_span = box.find("span", {"class": "bp-Homecard__Price--value"})
        o["property-price"] = price_span.text.strip() if price_span else None

         # Find the date sold
        date_sold_span = box.find("span", {"data-rf-test-id": "home-sash"})
        o["date_sold"] = date_sold_span.text.strip() if date_sold_span else None
        
        # Extract property stats
        stats_div = box.find("div", {"class": "bp-Homecard__Stats"})
        if stats_div:
            o["beds"] = stats_div.find("span", {"class": "bp-Homecard__Stats--beds"}).text.strip() if stats_div.find("span", {"class": "bp-Homecard__Stats--beds"}) else None
            o["baths"] = stats_div.find("span", {"class": "bp-Homecard__Stats--baths"}).text.strip() if stats_div.find("span", {"class": "bp-Homecard__Stats--baths"}) else None
            sqft_div = stats_div.find("span", {"class": "bp-Homecard__Stats--sqft"})
            o["sqft"] = sqft_div.find("span", {"class": "bp-Homecard__LockedStat--value"}).text.strip() if sqft_div else None
        
        # Extract address
        address_div = box.find("div", {"class": "bp-Homecard__Address"})
        o["address"] = address_div.text.strip() if address_div else None

        # Extract JSON data from the script tag
        script_tag = box.find("script", {"type": "application/ld+json"})
        if script_tag and script_tag.string:  # Ensure the tag and its content exist
            data = json.loads(script_tag.string)
            
        # Extracting required fields
            o["numberOfRooms"] = data.get("numberOfRooms")
            o["floorSize"] = data.get("floorSize", {}).get("value")
            o["typeOfHouse"] = data.get("@type")
            # Extract the house URL
            o["url"] = data.get("url")
            
    except Exception as e:
        # Handle parsing errors gracefully
        o["error"] = str(e)
    l.append(o)



In [182]:
# Convert data to data frame
df = pd.DataFrame(l)

# Drop NAs
df = df[df['property-price'].notna()]

# Remove "SOLD" and "SOLD BY REDFIN" using regex
df["date_sold"] = df["date_sold"].str.replace(r"SOLD BY REDFIN |SOLD |BOUGHT WITH REDFIN", "", regex=True)

# Convert the 'date_sold' column to datetime format
df["date_sold"] = pd.to_datetime(df["date_sold"], format="mixed")

# Convert the property-price, beds, baths, sqft columns to numeric
df['property-price'] = df['property-price'].replace({'\$': '', ',': ''}, regex=True).astype(int)
df['beds'] = df['beds'].str.replace(' beds', '').str.replace(' bed', '').astype(int)
df['baths'] = df['baths'].str.replace(' baths', '').str.replace(' bath', '').astype(float)
df['sqft'] = df['sqft'].str.replace(',', '').astype(int)

# Add a new column to check if the date is before Aug 17, 2024
threshold_date = pd.Timestamp("2024-08-17")
df["buyerfee"] = df["date_sold"] < threshold_date

# Add a new column to check if the date is before Sep 18, 2024
threshold_date = pd.Timestamp("2024-09-18")
df["interestcut"] = df["date_sold"] > threshold_date

# Save to CSV
df.to_csv("output.csv", index=False) 

df

  df['property-price'] = df['property-price'].replace({'\$': '', ',': ''}, regex=True).astype(int)


KeyError: 'property-price'

In [175]:


# def fetch_property_type_with_selenium(url):
#     # Set up Selenium WebDriver
#     driver = webdriver.Chrome()  # Ensure you have the correct WebDriver installed
#     driver.get(url)

#     # Get page source after JavaScript execution
#     soup = BeautifulSoup(driver.page_source, 'html.parser')

#     # Close the driver
#     driver.quit()

#     # Extract property type as before
#     property_type = None
#     type_keys = ["Ownership Type", "Style"]
#     amenity_groups = soup.find_all("div", class_="amenity-group")
#     for group in amenity_groups:
#         entries = group.find_all("li", class_="entryItem")
#         for entry in entries:
#             for key in type_keys:
#                 if key in entry.get_text():
#                     property_type = entry.find("span").get_text(strip=True)
#                     return property_type

#     return "Unknown"


# url = "	https://www.redfin.com/CA/Redwood-City/6-Portofino-Cir-94065/home/942122"
# fetch_property_type(url)

# # # Apply the function to the DataFrame
# # df["property_type"] = df["url"].apply(fetch_property_type)

# # # Save DataFrame to a CSV file
# # df.to_csv('redfin.csv', index=False)

# # df

'Unknown'

In [172]:
# head = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# }
# url = "https://www.redfin.com/CA/Redwood-City/100-Baltic-Cir-94065/unit-132/home/1884958"
# response = requests.get(url, headers=head, verify=False)
# soup=BeautifulSoup(response.text,'html.parser')





In [176]:
# head = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# }


# url = "https://www.redfin.com/CA/Redwood-City/308-Starfish-Ln-94065/home/942964"
# response = requests.get(url, headers=head, verify=False)
# soup=BeautifulSoup(response.text,'html.parser')


# # Initialize a variable to store the property type
# property_type = None

# # Keywords to check for property type
# type_keys = ["Ownership Type", "Style"]

# # Find the section containing "Style" or "Ownership Type"
# amenity_groups = soup.find_all("div", class_="amenity-group")
# for group in amenity_groups:
#     entries = group.find_all("li", class_="entryItem")
#     for entry in entries:
#         # Check if the entry contains one of the keys
#         for key in type_keys:
#             if key in entry.get_text():
#                 property_type = entry.find("span").get_text(strip=True)
#                 break
#         if property_type:
#             break

# # Output the extracted property type
# print("Property Type:", property_type)

Property Type: None




In [177]:
# head = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# }


# url = "	https://www.redfin.com/CA/Redwood-City/6-Portofino-Cir-94065/home/942122"
# response = requests.get(url, headers=head, verify=False)
# soup=BeautifulSoup(response.text,'html.parser')


# # Initialize a variable to store the property type
# property_type = None

# # Keywords to check for property type
# type_keys = ["Ownership Type", "Style"]

# # Find the section containing "Style" or "Ownership Type"
# amenity_groups = soup.find_all("div", class_="amenity-group")
# for group in amenity_groups:
#     entries = group.find_all("li", class_="entryItem")
#     for entry in entries:
#         # Check if the entry contains one of the keys
#         for key in type_keys:
#             if key in entry.get_text():
#                 property_type = entry.find("span").get_text(strip=True)
#                 break
#         if property_type:
#             break

# # Output the extracted property type
# print("Property Type:", property_type)

Property Type: None


