# Real Estate EDA

### Imports (Pandas - Python Data Analysis Library, Numbpy - Array Proessing Package, Seaborn - Statistical Data Visualisation, Matplotlib - Static Vizualization Library)

In [245]:
# Standard Import Structure for EDA;
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re

# Importing Requests for use in API calls.
import requests


# Import Standard Packages for Date and Time;
from datetime import datetime, timedelta

# Import Beautiful Soup, Selenium, for Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Import packages to help with parsing Information
import lxml
from lxml.html.soupparser import fromstring
import prettify
import numbers
import htmltext


### Setup and Initialization

In [246]:
# Set Maximum Displayed Columns to None; All Columns Listed
pd.options.display.max_columns = None;

# Set MatPlotLib to Display Elements
%matplotlib inline

# Import Cities in CSV file for 10 most and least expenive north carolina cities.csv
cities_df = pd.read_csv("cities.csv",header=0)

print(cities_df)

# Initialize Variables to be read into.
address = ''
price = 0
beds = 0
details = 0
home_type = ''
last_updated = ''
brokerage = ''
link = ''


        Expensive   Affordable
0            Elon         Eden
1           Boone        Ayden
2         Woodfin       Maiden
3        Carrboro       Hamlet
4       Asheville       Newton
5       Pineville     Sawmills
6      Wilmington     ArchDale
7   Morehead City   Rockingham
8  Black Mountain  Winterville
9  Hendersonville  Gibsonville


In [247]:
# Insert Headers for various websites to pull information specifically.
req_headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
    "Accept-Language":"en-US,en;q=0.9","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding":"gzip, deflate, br","upgrade-insecure-requests":"1"}

# Checking for Sale using Zillow.com
# Using 20 Cities we use 20 URLs
# Using loop to collect information from csv to pass to beautiful soup
# for i in range(9):
#     for j in range(2):
#         # Get City from cities_df
#         city = cities_df.iat[i,j]
#         print(city)

with requests.Session() as s:
   city = 'asheville,-NC_rb/' #*****change this city to what you want*****
   url = 'https://www.zillow.com/homes/for_sale/'+city    
   r = s.get(url, headers=req_headers)


# Parse bs4 response into html parser
soup = BeautifulSoup(r.text, 'html.parser')

# Create a DataFrame for Storage.
house_df = pd.DataFrame(columns=['Price','Size','Address'])
l=list()
obj = {}


# Loop through the created soup to add to df
for i in soup:
    properties = soup.find_all("div",{"class":"StyledPropertyCardDataWrapper-c11n-8-73-8__sc-1omp4c3-0 gXNuqr property-card-data"})

    for x in range(0,len(properties)):
        try:
            obj["pricing"]=properties[x].find("div",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 hRqIYX"}).text
        except:
            obj["pricing"]=None
        try:
            sizeInfo=properties[x].find("div",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 ghGYOB"}).text

            # Format Correctly The Sizing Information for Split

            # Steps:
            # Remove Sq Ft Unit
            # Remove Additional Info Indicator '-'
            # Remove 'For Sale' Descriptor
            # Remove WhiteSpace
            # Correctly Gap Beds and Baths (Bd, and Ba) to be Uniform
            # Remove Commas for Easier Delimitor for Later Split
            sizeInfo = sizeInfo.replace('sqft','',1)
            sizeInfo = sizeInfo.replace(' - ','')
            sizeInfo = sizeInfo.replace(' for sale',"")

            sizeInfo = sizeInfo.replace(" ","",2)

            sizeInfo = sizeInfo.replace('bds','bd')
            sizeInfo = sizeInfo.replace('bd','bd ')
            sizeInfo = sizeInfo.replace('ba','ba ')
            sizeInfo = sizeInfo.replace(',','')

            # Create split based on whitespace and only 3 split times for grouping construction details
            split = sizeInfo.split(' ',3)
            print(split)
            
        except:
            obj["size"]=None
        try:
            obj["address"]=properties[x].find("a",{"class":"StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 lhIXlm property-card-link"}).text
        except:
            obj["address"]=None
            
        l.append(obj)  
        obj={}
print("Amount of listings Scraped: "+str(len(l)))
# List Completed if number > 0 || Empty
        

['1bd', '1ba', '786', 'House']
['2bd', '2ba', '1164', 'New construction']
['4bd', '3ba', '2233', 'House']
['1bd', '1ba', '509', 'House']
['2bd', '1ba', '1234', 'House']
['4bd', '2ba', '2141', 'House']
['3bd', '2ba', '1675', 'House']
['3bd', '2ba', '1790', 'House']
['4bd', '3ba', '2702', 'House']
['1bd', '1ba', '786', 'House']
['2bd', '2ba', '1164', 'New construction']
['4bd', '3ba', '2233', 'House']
['1bd', '1ba', '509', 'House']
['2bd', '1ba', '1234', 'House']
['4bd', '2ba', '2141', 'House']
['3bd', '2ba', '1675', 'House']
['3bd', '2ba', '1790', 'House']
['4bd', '3ba', '2702', 'House']
['1bd', '1ba', '786', 'House']
['2bd', '2ba', '1164', 'New construction']
['4bd', '3ba', '2233', 'House']
['1bd', '1ba', '509', 'House']
['2bd', '1ba', '1234', 'House']
['4bd', '2ba', '2141', 'House']
['3bd', '2ba', '1675', 'House']
['3bd', '2ba', '1790', 'House']
['4bd', '3ba', '2702', 'House']
Amount of listings Scraped: 27


In [248]:
# Setup cost to be added to data frame using list
cost = []

for i in range(0,len(l)):

    cost.append(l[i]['pricing'])

# Create DataFrame Column
house_df['Price'] = cost

# Setup size to be added to data frame using list
size = []

for j in range(0,len(l)):

    size.append(l[j]['size'])

# Create DataFrame Column
house_df['Size'] = size

# Setup address to be added to data frame using list
address = []

for k in range(0,len(l)):

    address.append(l[k]['address'])

# Create DataFrame Column
house_df['Address'] = address


# Format Columns as String
house_df['Price'] = house_df['Price'].astype('str')
house_df['Size'] = house_df['Size'].astype('str')
house_df['Address'] = house_df['Address'].astype('str')

print(house_df)


KeyError: 'size'