# Real Estate EDA - South Carolina

### Imports (Pandas - Python Data Analysis Library, NumPy - Array Processing Package, Seaborn - Statistical Data Visualisation, Matplotlib - Static Visualization Library)

In [None]:
# Standard Import Structure for EDA;
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import random
import time

# Importing Requests for use in API calls.
import requests


# Import Standard Packages for Date and Time;
from datetime import datetime, timedelta

# Import Beautiful Soup, Selenium, for Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Import packages to help with parsing Information
import lxml
from lxml.html.soupparser import fromstring
import prettify
import numbers
import htmltext


#### Random User Agent

In [None]:
def GET_UA():
    """Return one User-Agent string chosen at random from a fixed pool.

    The pool holds ten hard-coded browser User-Agent strings; the pick is
    made with ``random.choice``, so it follows the global ``random`` state.
    """
    user_agent_pool = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    ]

    chosen_agent = random.choice(user_agent_pool)
    return chosen_agent

### Setup and Initialization

In [None]:
# Set Maximum Displayed Columns to None; All Columns Listed
pd.options.display.max_columns = None;

# Set MatPlotLib to Display Elements
%matplotlib inline

# Import Cities in CSV file for 10 most and least expenive north carolina cities.csv
cities_df = pd.read_csv("scCities.csv",header=0)

#Lists for Looping through GET Requests
urls = list()
cities = list()

# Create Headers for Future GET Requests
req_headers = {"User-Agent":GET_UA(),
    "Accept-Language":"en-US,en;q=0.9",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding":"gzip, deflate, br",
    "upgrade-insecure-requests":"1"}

# Print cities list to double check correct information
print(cities_df)


#### Create URLs For Scraping

In [None]:
# Build the Zillow "for sale" search URLs for every city in cities_df.
# At most the first 10 rows x 2 columns of the CSV are used (up to 20 cities).
#
# Both lists are rebuilt from scratch rather than appended to, so re-running
# this cell does not add a second copy of every city / URL.

# Clamp to the actual table size so a shorter or narrower CSV does not raise IndexError
city_row_count = min(10, cities_df.shape[0])
city_col_count = min(2, cities_df.shape[1])

# Read the cells row by row and replace spaces with '_' to make a valid URL segment
# E.g. 'Morehead City' -> 'Morehead_City'
cities = [
    str(cities_df.iat[row, col]).replace(' ', '_')
    for row in range(city_row_count)
    for col in range(city_col_count)
]

# Uncomment for an initial check of the city names and their formatting
# print(cities)

# First results page for every city
urls = ['https://www.zillow.com/homes/for_sale/' + city + ',-SC_rb/' for city in cities]

# Additional result pages (2 and 3) for every city.
# Placed last so quick excel viewing gives a snapshot of individual cities and in-depth numbers are further down.
# ***** Can Rarely Handle 5 Pages From 20 Cities *****
# ***** Options Are Including More Cities and Less Pages, or Vice-Versa *****
for page in range(2, 4):
    urls.extend(
        'https://www.zillow.com/homes/for_sale/' + city + ',-SC_rb/' + str(page) + '_p/'
        for city in cities
    )

# Check that the expected number of URLs was produced and inspect them
print(str(len(urls)))
print(urls)


#### Create Session and Loop Through URLs with GET Requests

In [None]:
# Scrape every URL in `urls` and collect one dict per property card in `l`.
# Each dict has the keys 'pricing', 'beds', 'baths', 'sqft', 'type', 'address'.

# List to Store Total Scraped Information
l = []

# Create a DataFrame for Storage with Columns listing Wanted Information
house_df = pd.DataFrame(columns=['Price','Beds','Baths','Square_Feet','Type','Address'])

# CSS class strings used to locate the pieces of a property card
CARD_CLASS = "StyledPropertyCardDataWrapper-c11n-8-73-8__sc-1omp4c3-0 gXNuqr property-card-data"
PRICE_CLASS = "StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 hRqIYX"
SIZE_CLASS = "StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 ghGYOB"
ADDRESS_CLASS = "StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 lhIXlm property-card-link"


def _card_text(card, tag, css_class):
    """Return the text of the first `tag` with `css_class` inside `card`, or 'Unknown' if absent."""
    node = card.find(tag, {"class": css_class})
    return node.text if node is not None else 'Unknown'


def _parse_size_info(size_info):
    """Split a card's size text into a dict with 'beds', 'baths', 'sqft' and 'type'.

    Raises IndexError when the cleaned text has fewer space-separated parts
    than the fields being read; the caller maps that to 'Unknown' values.
    """
    # ***** Check for Studios ***** ***** Remove Them ******
    if 'studio' in size_info:
        return {'beds': 'Unknown', 'baths': 'Unknown', 'sqft': 'Unknown', 'type': 'Unknown'}

    # Normalise the text so a split on spaces lines up with the fields:
    # drop the first 'sqft' unit, the ' - ' separator and ' for sale',
    # remove the first two spaces, make 'bds' / 'bd' / 'ba' uniform and
    # followed by a space, and strip commas.
    size_info = size_info.replace('sqft', '', 1)
    size_info = size_info.replace(' - ', '')
    size_info = size_info.replace(' for sale', "")
    size_info = size_info.replace(" ", "", 2)
    size_info = size_info.replace('bds', 'bd')
    size_info = size_info.replace('bd', 'bd ')
    size_info = size_info.replace('ba', 'ba ')
    size_info = size_info.replace(',', '')

    # At most 3 splits so everything after the third space stays together as the type
    parts = size_info.split(' ', 3)

    # Land listings: the lot size lands in the first part, so there are no beds / baths
    if 'acre' in parts[0] or 'lot' in parts[0]:
        return {'sqft': parts[0], 'beds': 0, 'baths': 0, 'type': parts[3]}

    return {'beds': parts[0], 'baths': parts[1], 'sqft': parts[2], 'type': parts[3]}


def _parse_card(card):
    """Build the listing dict for one property card, using 'Unknown' for anything missing."""
    listing = {"pricing": _card_text(card, "div", PRICE_CLASS)}

    size_node = card.find("div", {"class": SIZE_CLASS})
    try:
        # AttributeError: size element missing (size_node is None)
        # IndexError: text did not split into the expected number of parts
        listing.update(_parse_size_info(size_node.text))
    except (AttributeError, IndexError):
        listing.update({'sqft': 'Unknown', 'beds': 'Unknown', 'baths': 'Unknown', 'type': 'Unknown'})

    listing["address"] = _card_text(card, "a", ADDRESS_CLASS)
    return listing


def _scrape_page(session, url):
    """Fetch `url` with `session` and return the listing dicts found on it ([] on failure)."""
    try:
        # A timeout keeps one stalled connection from hanging the whole run
        r = session.get(url, headers=req_headers, timeout=30)
    except requests.RequestException as err:
        print("Request failed for " + url + ": " + str(err))
        return []

    if not r.ok:
        print("Skipping " + url + " (HTTP " + str(r.status_code) + ")")
        return []

    # Parse the response body with the built-in html parser
    soup = BeautifulSoup(r.text, 'html.parser')

    # Search the document once. The previous version wrapped this in
    # `for i in soup:`, which repeated the same whole-document search for every
    # top-level node and therefore stored each listing several times.
    return [_parse_card(card) for card in soup.find_all("div", {"class": CARD_CLASS})]


with requests.Session() as s:

    for url in urls:
        for listing in _scrape_page(s, url):
            print(listing)
            l.append(listing)

        # Random pause between page requests. Sleeping once per request (not
        # once per parsed listing) also throttles pages that returned no
        # listings, which previously triggered the next request immediately.
        time.sleep(random.randint(5, 20))


# Turn the scraped listing dicts in `l` into the `house_df` DataFrame.

# Check Total Count (0 means nothing was scraped)
print("Amount of listings Scraped: "+str(len(l)))

# For the whole log, uncomment:
# print(l)

# One list per output column, pulled from the matching key of every listing
cost = [listing['pricing'] for listing in l]
beds = [listing['beds'] for listing in l]
baths = [listing['baths'] for listing in l]
sqft = [listing['sqft'] for listing in l]
types = [listing['type'] for listing in l]
address = [listing['address'] for listing in l]

# Assemble the DataFrame and format every column as string
house_df = pd.DataFrame({
    'Price': cost,
    'Beds': beds,
    'Baths': baths,
    'Square_Feet': sqft,
    'Type': types,
    'Address': address,
}).astype('str')

# drop_duplicates returns a new frame, so the result has to be assigned back
# (the bare call previously discarded it); reset the index so it stays contiguous
house_df = house_df.drop_duplicates().reset_index(drop=True)

# Print to Check Output
display(house_df)


#### Export to Excel File

In [None]:
# Create an export to Excel File for Further Analysis

# Create File Name
fileName = 'SC - RealEstateData.xlsx'

# Export to Excel; Sheet Named: 'SC - Real Estate Data'; the index is not written.
# sheet_name is passed by keyword: pandas deprecated passing it positionally
# to to_excel (only the writer may be positional in newer versions).
with pd.ExcelWriter(fileName, engine="openpyxl", mode="w") as writer:
    house_df.to_excel(writer, sheet_name='SC - Real Estate Data', index=False)
