# In [None]:
#Import the needed libraries
import time
import random
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Random sleep between 2 and 5 seconds by default (bounds are configurable)
def random_sleep(min_seconds=2, max_seconds=5):
    """Pause for a random duration drawn uniformly between the two bounds.

    Args:
        min_seconds: Lower bound of the pause, in seconds. Defaults to 2.
        max_seconds: Upper bound of the pause, in seconds. Defaults to 5.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))

def click_load_more(driver):
    """Click the directory's "Load More" link repeatedly until it is gone.

    Each round waits up to 10 seconds for the link to become clickable,
    scrolls it into view, clicks it and then pauses via ``random_sleep``.
    The loop stops as soon as the wait times out, the element is missing,
    or any other exception is raised (which is printed first).

    Args:
        driver: The Selenium WebDriver showing the directory page.
    """
    button_locator = (By.XPATH, "//a[@id='storage-site-directory-load-more-link']")
    keep_clicking = True
    while keep_clicking:
        try:
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(button_locator)
            )
            # Bring the button into the viewport before clicking it
            driver.execute_script("arguments[0].scrollIntoView(true);", button)
            button.click()
            # Pause between clicks to avoid being blocked / scrape politely
            random_sleep()
        except (TimeoutException, NoSuchElementException):
            # Button never became clickable or is no longer in the DOM
            keep_clicking = False
        except Exception as e:
            print(f"An error occurred: {e}")
            keep_clicking = False
# Module-level accumulators filled in by the scraping functions below
storagecenter = []      # store names
storagelocation = []    # store addresses
links_to_follow = []    # detail-page URLs to crawl
contact_details = []    # contact text taken from each crawled page
website_links = []      # "visit" link hrefs taken from each crawled page

#Use Selenium to get the full page source
DEFAULT_DIRECTORY_URL = 'https://www.ssauk.com/storage-site-directory.html?q=London&radius=5'


def getpagesource(url=DEFAULT_DIRECTORY_URL, headless=False):
    """Load the directory page in Chrome, expand it fully and return it parsed.

    Args:
        url: Page to load. Defaults to the London (5 mile radius) search.
        headless: When True, start Chrome with the ``--headless`` argument.
            Defaults to False, i.e. a visible browser window.

    Returns:
        A BeautifulSoup document built from the page source after every
        "Load More" click has been performed.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    # Setup ChromeDriver
    chromedriver_path = ChromeDriverManager().install()
    driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

    try:
        # Load the page
        driver.get(url)
        # NOTE(review): click_load_more() uses an explicit WebDriverWait; the
        # Selenium docs advise against mixing implicit and explicit waits.
        # Kept to preserve the existing timing behaviour.
        driver.implicitly_wait(6)

        # Click "Load More" until all content is loaded
        click_load_more(driver)

        # Grab the HTML while the browser session is still alive
        pagesource = driver.page_source
    finally:
        # Always close the browser, even if loading or clicking failed
        driver.quit()

    return BeautifulSoup(pagesource, 'html.parser')
# Parsed listing page; read as a module-level global by the extraction functions below
bs=getpagesource() #call the getpagesource function

#Extract each store name and its address
def get_store_and_address():
    """Append every store name and its address to the module-level lists.

    Walks the ``h3`` and ``p`` tags of the global ``bs`` in document order.
    Each ``h3.article-heading`` contributes one entry to ``storagecenter``
    and exactly one entry to ``storagelocation``: the text of the first
    ``p.article-text`` found before the next heading, or the placeholder
    "No location address" when there is none. This keeps both lists the
    same length and row-aligned.

    NOTE(review): assumes a store's address paragraph follows its heading
    in the page markup -- confirm against the live DOM.
    """
    awaiting_address = False  # True while the latest heading has no address yet
    for tag in bs.find_all(['h3', 'p']):
        classes = tag.get('class') or []
        if tag.name == 'h3' and 'article-heading' in classes:
            if awaiting_address:
                # Previous store had no address paragraph
                storagelocation.append("No location address")
            storagecenter.append(tag.get_text())
            awaiting_address = True
        elif tag.name == 'p' and 'article-text' in classes and awaiting_address:
            storagelocation.append(tag.get_text())
            awaiting_address = False
    if awaiting_address:
        # Last store on the page had no address paragraph
        storagelocation.append("No location address")
# Fill storagecenter and storagelocation from the parsed listing page
get_store_and_address() #call the get_store_and_address function

# Define the links to crawl
def links():
    """Collect the unique store detail-page URLs from the parsed listing.

    Looks through the global ``bs`` for anchors whose ``href`` matches the
    storage-site detail pattern and appends each href not already present
    to ``links_to_follow``, in order of first appearance.
    """
    detail_href = re.compile(r'^https://www.ssauk.com/storage-site-directory/_storage-site')
    for anchor in bs.find_all('a', {'href': detail_href}):
        target = anchor.get('href')
        # Skip empty hrefs and anything already queued
        if not target or target in links_to_follow:
            continue
        links_to_follow.append(target)
# Fill links_to_follow with the detail-page URLs found on the listing page
links() #call the links() function

#Generate the crawler
def _fetch_detail_page(link, headers, retries=3):
    """Return the parsed page at *link*, or None if it could not be loaded."""
    for attempt in range(retries):
        try:
            # Request with a timeout
            response = requests.get(link, headers=headers, timeout=10)
        except requests.exceptions.Timeout:
            print(f"Timeout on attempt {attempt + 1} for {link}. Retrying...")
            if attempt == retries - 1:
                print(f"Failed to load {link} after {retries} attempts.")
            continue
        except requests.exceptions.RequestException as e:
            # Connection errors etc. must not abort the whole crawl
            print(f"Request failed for {link}: {e}")
            return None
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
        print(f"Error {response.status_code} for {link}")
        return None
    return None


def _extract_contact(soup):
    """Return the text of the first <p> on the page, or a placeholder."""
    p_tag = soup.find('p')
    if p_tag is None:
        return "no contact given"
    return p_tag.get_text()


def _extract_website(soup):
    """Return the href of the first <a> whose text contains "visit", or a placeholder."""
    for anchor in soup.find_all('a'):
        if anchor.text and 'visit' in anchor.text.lower():
            return anchor.get('href')
    return "no web_link given"


def crawl():
    """Visit every URL in ``links_to_follow`` and record contact and website.

    For each link exactly one entry is appended to ``contact_details`` and
    one to ``website_links``, so both lists stay the same length as
    ``links_to_follow``. Pages that cannot be loaded (non-200 status,
    repeated timeouts or any other request error) get the placeholder
    values instead of reusing data from a previous page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    for link in links_to_follow:
        time.sleep(2)  # Adding a delay between requests to avoid overloading the server
        soup = _fetch_detail_page(link, headers)
        if soup is None:
            # Keep the rows aligned even when the page is unavailable
            contact_details.append("no contact given")
            website_links.append("no web_link given")
            continue
        contact_details.append(_extract_contact(soup))
        website_links.append(_extract_website(soup))
# Fill contact_details and website_links by visiting each URL in links_to_follow
crawl() #call the crawl function

# Write the scraped data to an Excel file
# Change this to reflect your local storage
output_path = r"E:\PYTHON- DATA SCIENCE\Data hub\Scraping-ssauk2.xlsx"
df = pd.DataFrame({
    'Store Name': storagecenter,
    'Store Address': storagelocation,
    'Contact Details': contact_details,
    'Website Link': website_links,
})
df.to_excel(output_path)
# Print success message using the same path the file was written to
print(f"Data downloaded successfully and stored in {output_path}")
             