# Real Estate EDA

### Imports (Pandas - Python Data Analysis Library, Numbpy - Array Proessing Package, Seaborn - Statistical Data Visualisation, Matplotlib - Static Vizualization Library)

In [1]:
# Standard Import Structure for EDA;
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import random
import csv

# Importing Requests for use in API calls.
import requests


# Import Standard Packages for Date and Time;
from datetime import datetime, timedelta

# Import Beautiful Soup, Selenium, for Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Import packages to help with parsing Information
import lxml
from lxml.html.soupparser import fromstring
import prettify
import numbers
import htmltext


#### Setup of User-Agent Randomization

In [2]:
with open ('userAgents.txt', 'r') as f:
    data = f.read()
    userAgentList = data.split('\n')
    f.close()

print(str(len(userAgentList)))

1000


In [3]:
def GET_UA():
    return random.choice(userAgentList)

#### Setup Of Proxy Randomization

In [4]:
with open ('proxies.txt','r') as f:
    data = f.read()
    proxyList = data.split('\n')
    f.close()
print(str(len(proxyList)))

300


In [5]:
def GET_PROX():
    return random.choice(proxyList)

### Setup and Initialization

In [6]:
# Set Maximum Displayed Columns to None; All Columns Listed
pd.options.display.max_columns = None;

# Set MatPlotLib to Display Elements
%matplotlib inline

# Import Cities in CSV file for 10 most and least expenive north carolina cities.csv
cities_df = pd.read_csv("cities2.csv",header=0)

#Lists for Looping through GET Requests
urls = list()
cities = list()

# Create Headers for Future GET Requests
req_headers = {"User-Agent":GET_UA(),
    "Accept-Language":"en-US,en;q=0.9",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding":"gzip, deflate, br",
    "upgrade-insecure-requests":"1"}

# Print cities list to double check correct information
print(cities_df)


       Expensive     Affordable
0       Cashiers     Greenville
1      Highlands  Winston Salem
2      Nags Head   Fayetteville
3   Holden Beach     Kannapolis
4   Emerald Isle     Greensboro
5   Blowing Rock        Concord
6     Banner Elk      Lexington
7       Davidson    Thomasville
8    Chapel Hill       Asheboro
9         Durham    Rocky Mount
10   Wake Forest        Sanford


#### Create URLs For Scraping

In [7]:
# Checking for Sale using Zillow.com
# Using 20 Cities we use 20 URLs
# Using loop to collect information from csv to pass to beautiful soup
for i in range(11):
    for j in range(2):
        # Get City List from cities_df
        cities.append(str(cities_df.iat[i,j]))

# SUCCESSFUL CHECKING FOR INCLUSION OF ALL OF CITIES FILES REQUIRED
# Uncomment for initial check and recomment for readability afterwards
# print(cities)

# Remove White Space to Make A Correct URL by Replacing with '_'
# E.g. 'Morehead City' -> 'Morehead_City'
for i in range(0,len(cities)):
    city = str(cities[i]).replace(' ','_')
    cities[i] = city


# SUCCESSFUL CHECKING FOR CORRECT FORMAT (city with space === city_with_space) REQUIRED
# Uncomment for initial check and recomment for readability afterwards
# print(cities)


# Creating the URLS based upon created cities list
for i in range(0,len(cities)):
    urls.append('https://www.zillow.com/homes/for_sale/'+cities[i]+',-NC_rb/')

# Create Additional URLS based on further page exploration...
# Placed Last so quick excel viewing gives snapshot of individual cities and in-depth numbers are furthur down.
# ***** Can Rarely Handle 5 Pages From 20 Cities *****
# # ***** Options Are Including More Cities and Less Pages, or Vice-Versa *****
for i in range(2,4):
    for j in range(0,len(cities)):
        urls.append('https://www.zillow.com/homes/for_sale/'+cities[j]+',-NC_rb/'+str(i)+'_p/')

# SUCCESSFUL CHECKING FOR APPROPRAITE URLs REQUIRED
# Uncomment for initial check and recomment for readability afterwards
print(len(urls))


66


#### Create Session and Loop Trough URLs with GET Requests

In [8]:
# Object for Individual Listings to be placed into List
# List to Store Total Scraped Information
l=list()
obj = {}
# Setup Cost to be added to data frame using list
cost = []
# Setup Beds to be added to data frame using list
beds = []
# Setup Baths to be added to data frame using list
baths = []
# Setup Square Feet to be added to data frame using list
sqft = []
# Setup Types to be added to data frame using list
types = []
# Setup Address to be added to data frame using list
address = []


# Create a DataFrame for Storage with Columns listing Wanted Information
house_df = pd.DataFrame(columns=['Price','Beds','Baths','Square_Feet','Type','Address'])

#loop through sessions to get urls used be Reqeusts Package for pulling information

for i in range(0,len(urls)):
    proxy = GET_PROX()
    try:
        r = requests.get(urls[i], headers=req_headers, proxies={'http': f"http://{proxy}"})
    except ProxyError:
        r = requests.get(urls[i], headers=req_headers, proxies={'https': f"https://{proxy}"})
    except:
        r = requests.get(urls[i], headers=req_headers)

    # Parse bs4 response into html parser
    soup = BeautifulSoup(r.text, 'html.parser')
    

    # Loop through the created soup to add to df
    for i in soup:
        properties = soup.find_all("div",{"class":"StyledPropertyCardDataWrapper-c11n-8-73-8__sc-1omp4c3-0 gXNuqr property-card-data"})
        
        for x in range(0,len(properties)):
            
            # Try and get pricing from listing excluding those without a number
            try:
                obj["pricing"]=properties[x].find("div",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 hRqIYX"}).text
            except:
                obj["pricing"]=None

            # Try to get Information Sizing
            # i.e. Beds, Baths, Square Feet, and Type (House, Apartment, Single-Family or Multi-Family Home)
            try:
                sizeInfo=properties[x].find("div",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 ghGYOB"}).text

                # Format Correctly The Sizing Information for Split

                # Steps:
                # Remove Sq Ft Unit
                # Remove Additional Info Indicator '-'
                # Remove 'For Sale' Descriptor
                # Remove WhiteSpace
                # Correctly Gap Beds and Baths (Bd, and Ba) to be Uniform
                # Remove Commas for Easier Delimitor for Later Split
                sizeInfo = sizeInfo.replace('sqft','',1)
                sizeInfo = sizeInfo.replace(' - ','')
                sizeInfo = sizeInfo.replace(' for sale',"")

                sizeInfo = sizeInfo.replace(" ","",2)

                sizeInfo = sizeInfo.replace('bds','bd')
                sizeInfo = sizeInfo.replace('bd','bd ')
                sizeInfo = sizeInfo.replace('ba','ba ')
                sizeInfo = sizeInfo.replace(',','')

                # Create split based on whitespace and only 3 split times for grouping construction details
                split = sizeInfo.split(' ',3)
                
                # Place information into object for later use
                # Edit Some Info --- If Land It is Ending up in 'Beds', and Type is Ending up in Baths

                if split[0].count('acre') or split[0].count('lot'):
                    obj['sqft'] = split[0]
                    obj['beds'] = 0
                    obj['baths'] = 0
                    obj['type'] = split[3]
                else:
                    obj['beds'] = split[0]
                    obj['baths'] = split[1]
                    obj['sqft'] = split[2]
                    obj['type'] = split[3]
                
            except:
                sizeInfo=None

            # Find Address information and exclude those without written address
            try:
                obj["address"]=properties[x].find("a",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 lhIXlm property-card-link"}).text
            except:
                obj["address"]=None
                
            l.append(obj)  
            obj={}



# If you really want to check for log output

# For Whole Log
# print(l)

# For Segment Wanted
# for i in range(0,len(l)):
#   print(l[i]['sqft'])

# Check Total Count
print("Amount of listings Scraped: "+str(len(l)))
# List Completed if number > 0 || Empty

Amount of listings Scraped: 108


In [9]:
for i in range(0,len(l)):

    cost.append(l[i]['pricing'])
    beds.append(l[i]['beds'])
    baths.append(l[i]['baths'])
    sqft.append(l[i]['sqft'])
    types.append(l[i]['type'])
    address.append(l[i]['address'])


# Create Price Column
house_df['Price'] = cost

# Create Beds Column
house_df['Beds'] = beds

# Create Baths Column
house_df['Baths'] = baths

# Create Square Feet Column
house_df['Square_Feet'] = sqft

# Create Type Column
house_df['Type'] = types

# Create Address Column
house_df['Address'] = address


# Format Columns as String
house_df['Price'] = house_df['Price'].astype('str')
house_df['Beds'] = house_df['Beds'].astype('str')
house_df['Baths'] = house_df['Baths'].astype('str')
house_df['Square_Feet'] = house_df['Square_Feet'].astype('str')
house_df['Type'] = house_df['Type'].astype('str')
house_df['Address'] = house_df['Address'].astype('str')

# Print to Check Output
print(house_df)


          Price Beds Baths   Square_Feet   Type  \
0      $739,000  3bd   4ba          1314  House   
1      $275,000  1bd   1ba           408  House   
2      $889,000  3bd   3ba          1757  House   
3    $6,500,000  4bd   9ba         11517  House   
4      $519,000  3bd   3ba          2017  House   
..          ...  ...   ...           ...    ...   
103    $499,000  4bd   4ba            --  House   
104    $270,000  3bd   3ba            --  House   
105    $225,000  4bd   3ba            --  House   
106     $10,000    0     0  0.11acreslot   Land   
107     $20,000    0     0   0.1acreslot   Land   

                                               Address  
0              140 Kettle Creek Rd, Cashiers, NC 28717  
1               7 At Last Ridge Rd, Cashiers, NC 28717  
2             760 Cashiers Lake Rd, Cashiers, NC 28717  
3                1090 Zeb Alley Rd, Cashiers, NC 28717  
4                92 Summerfield Ln, Cashiers, NC 28717  
..                                           

#### Export to Excel File

In [10]:
# Create an export to Excel File for Furthur Analysis

# Create File Name
fileName = 'RealEstateData.xlsx'

# Export to Excel; Sheet Named: 'Real Estate Data';
with pd.ExcelWriter(fileName, engine="openpyxl", mode="a", if_sheet_exists='overlay') as writer:
    house_df.to_excel(writer, 'Real Estate Data', index=False, startrow=writer.sheets['Real Estate Data'].max_row)
