## Google URL Scraper
This script automates the extraction of Google Maps URLs for BMO, TD, RBC, CIBC, and Scotiabank listings in Calgary. It was used in a group project to obtain the coordinates of these locations for geospatial analysis; the results appear in the final report of our 'Analysis of the Canadian Banking Industry', available at https://www.lukes-lab.com/.

In [1]:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import math as math
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
import pyautogui
import pandas as pd

In [7]:
# Start a Firefox session. url_scraper() reads this module-level `driver`,
# so every scrape reuses the same browser window.
driver = webdriver.Firefox()

def url_scraper(bank_query, max_scroll_seconds=600):
    """Collect the Google Maps result links shown for a search query.

    Loads Google Maps at a fixed Calgary viewport, types ``bank_query`` into
    the search box, scrolls the results panel until the end-of-list message
    appears (or the time budget runs out), and then parses the page source
    for the result anchors.

    The function drives the module-level ``driver`` and sends real F12 key
    presses through ``pyautogui``, so the browser window needs keyboard focus
    while it runs.

    Args:
        bank_query: Text typed into the Maps search box, e.g. ``"BMO"``.
        max_scroll_seconds: Upper bound on the time spent scrolling the
            results panel. When it is exceeded a warning is printed and the
            links loaded so far are returned.

    Returns:
        list[str]: The ``href`` of every ``<a class="hfpxzc">`` element in the
        page, in document order. Anchors without an ``href`` are skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: If the results panel is
            not present within 20 seconds of submitting the search.
    """
    url = "https://www.google.com/maps/@51.0803529,-114.140266,13z?entry=ttu"
    driver.get(url)
    time.sleep(3)  # fixed pause to let the page load before looking for the search box

    search_bar = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
    search_bar.click()
    time.sleep(3)

    search_bar.send_keys(bank_query)
    search_bar.send_keys(Keys.RETURN)

    wait = WebDriverWait(driver, 20)  # wait for up to 20 seconds
    scroll_container = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m6QErb.DxyBCb.kA9KIf.dS8AEf.ecceSd')
        )
    )

    # Number of result links present before any scrolling (diagnostic output only).
    current_count = len(scroll_container.find_elements(By.CSS_SELECTOR, '.hfpxzc'))
    print(current_count)

    actions = ActionChains(driver)
    deadline = time.monotonic() + max_scroll_seconds

    while True:
        actions.move_to_element(scroll_container).click().send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(1)
        # NOTE(review): F12 is pressed twice (presumably toggling the browser
        # dev tools open and closed to nudge the panel into loading more
        # results) -- confirm this is still required.
        pyautogui.press('f12')
        time.sleep(1)
        pyautogui.press('f12')
        try:
            end_of_page = driver.find_element(By.CSS_SELECTOR, ".HlvSq")
            if "You've reached the end of the list." in end_of_page.text:
                break
        except NoSuchElementException:
            # The end-of-list element is not in the page yet; keep scrolling.
            pass

        # Without this bound the loop would spin forever whenever the
        # end-of-list message never shows up.
        if time.monotonic() >= deadline:
            print(
                f"Stopped scrolling for {bank_query!r} after {max_scroll_seconds} "
                "seconds without reaching the end of the list; results may be incomplete."
            )
            break

    response = BeautifulSoup(driver.page_source, 'html.parser')
    hrefs = (anchor.get('href') for anchor in response.find_all('a', class_='hfpxzc'))
    # Skip anchors that carry no href so callers only ever receive URL strings.
    return [href for href in hrefs if href]

In [None]:
# Run one scrape per bank; each call returns that bank's list of Maps URLs.
# Each assignment is its own statement, so results already collected stay
# bound if a later scrape raises.
bmo = url_scraper("BMO")
td = url_scraper("TD")
scotiabank = url_scraper("scotiabank")
rbc = url_scraper("RBC")
cibc = url_scraper("CIBC")

In [None]:
# Combine the per-bank URL lists into one table with a bank label on every row.

# Order used both for concatenation and for the ordered categorical below.
order = ['bmo', 'cibc', 'rbc', 'scotiabank', 'td']

# Map each label to the list it belongs to. Building both columns from this
# single mapping guarantees that every URL is labelled with the bank whose
# scrape produced it; concatenating URLs and labels in two hand-written
# orders lets the columns drift apart and mislabels rows.
urls_by_bank = {
    'bmo': bmo,
    'cibc': cibc,
    'rbc': rbc,
    'scotiabank': scotiabank,
    'td': td,
}

df = pd.DataFrame({
    'url': [url for bank in order for url in urls_by_bank[bank]],
    'bank': [bank for bank in order for _ in urls_by_bank[bank]],
})

# Convert the bank name column to a categorical type with a defined order
df['bank'] = pd.Categorical(df['bank'], categories=order, ordered=True)

# Sort by the bank name; a stable sort keeps the scrape order within each bank.
df_sorted = df.sort_values(by='bank', kind='mergesort').reset_index(drop=True)

print(df_sorted)

In [None]:
# Print the whole table with pandas' row, column and column-width limits lifted
# so that long URLs are shown untruncated.
full_display_options = [
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
]
with pd.option_context(*full_display_options):
    print(df_sorted)

In [None]:
# Derive a name, latitude and longitude from each URL, then collapse the name
# into a coarse category:
# branch, atm, wealth, branch_advisor, mortgage specialist, ...
import re

# Compiled once; each URL is searched a single time per field.
_NAME_RE = re.compile(r'place/([^/]+)')
_LAT_RE = re.compile(r'!3d([\d\.-]+)')
_LNG_RE = re.compile(r'!4d([\d\.-]+)')


def _first_group(pattern, url):
    """Return the first capture group of ``pattern`` found in ``url``, else None."""
    if not isinstance(url, str):
        # A missing URL (None/NaN) has nothing to extract.
        return None
    match = pattern.search(url)
    return match.group(1) if match else None


def _place_name(url):
    """Return the ``place/<name>`` URL segment with '+' turned into spaces, else None."""
    raw_name = _first_group(_NAME_RE, url)
    return None if raw_name is None else raw_name.replace('+', ' ')


# Extract name/description, latitude, and longitude (all kept as strings).
df_sorted['name'] = df_sorted['url'].apply(_place_name)
df_sorted['latitude'] = df_sorted['url'].apply(lambda u: _first_group(_LAT_RE, u))
df_sorted['longitude'] = df_sorted['url'].apply(lambda u: _first_group(_LNG_RE, u))

# (pattern, label) pairs, matched case-insensitively against the name.
# Order matters: a matching rule overwrites the name with its label, and no
# label matches any later pattern, so the first matching rule wins.
# NOTE(review): 'bank' also matches brand names such as "Scotiabank", so any
# listing whose name contains the brand is labelled 'branch' before the 'atm'
# rule is tried -- confirm this is intended.
_CATEGORY_RULES = [
    ('bank|branch|Advice', 'branch'),
    ('atm', 'ATM'),
    ('Nesbitt|wealth|Dominion|Counsel', 'wealth'),
    ('mortgage', 'mortgage_specialist'),
    ('planner|Investment Specialist', 'financial_planner'),
    ('Financial Advisor', 'financial_advisor'),
    ('Business Advisor|Small Business', 'business_advisor'),
    ('Direct Investing', 'broker'),
    ('Mellon|World Markets', 'Investment Banking'),
]

for name_pattern, category_label in _CATEGORY_RULES:
    matches = df_sorted['name'].str.contains(name_pattern, case=False, na=False)
    df_sorted.loc[matches, 'name'] = category_label

df_sorted


In [None]:
# Flag every row whose name was not collapsed into a recognised category,
# print those rows for manual inspection, then remove them from the table.

# Labels that count as recognised categories
known_categories = ['branch', 'ATM', 'wealth', 'mortgage_specialist', 'financial_planner', 'financial_advisor', 'business_advisor','broker','Investment Banking']

# Temporary column: the name where it is a recognised label, 'unknown' otherwise
is_recognised = df_sorted['name'].isin(known_categories)
df_sorted['category'] = df_sorted['name'].where(is_recognised, 'unknown')

# Rows that still need a manual look
is_unknown = df_sorted['category'] == 'unknown'
subset_to_investigate = df_sorted[is_unknown]

# Show the unrecognised rows in full (no row/column/width truncation)
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*display_options):
    print(subset_to_investigate)

# Remove the unrecognised rows and the temporary column in a single drop
df_sorted = df_sorted.drop(index=subset_to_investigate.index, columns="category")

df_sorted

In [86]:
# Write the cleaned table to disk without the DataFrame index.
# The destination is an absolute path on the author's machine.
output_path = r'C:\Users\lgbra\OneDrive\Desktop\VScode\Urls.csv'
df_sorted.to_csv(output_path, index=False)