In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Listing endpoint and the browser-like headers sent with every request
BASE_URL = "https://oneclick.az/business/Restaurant/Restaurant/Restoranlar"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

# Create the output folder up front; an already existing folder is left alone
output_dir = "./data"
os.makedirs(output_dir, exist_ok=True)

# Entries whose site URL is longer than this many characters are skipped.
_MAX_SITE_URL_LENGTH = 100

# Placeholders used for address parts missing from the comma-separated address.
_ADDRESS_DEFAULTS = ("No City", "No District", "No Street", "No Postal Address")


def _node_text(node, default):
    """Return the stripped text of a parsed node, or *default* if it is missing."""
    return node.text.strip() if node is not None else default


def _parse_restaurant(item):
    """Build one restaurant record from a listing element.

    Returns a dict keyed by the output column names, or None when the entry
    must be skipped because its site URL exceeds ``_MAX_SITE_URL_LENGTH``.
    """
    # Name lives in the first <h3>, inside "a span"
    h3 = item.select_one("h3")
    name = _node_text(h3.select_one("a span") if h3 is not None else None, "No Name")

    address = _node_text(item.select_one(".f16"), "No Address")

    # Phone and site are both looked up inside the first <ul> of the entry
    ul = item.select_one("ul")
    phone = _node_text(ul.select_one(".phone") if ul is not None else None, "No Phone")

    # .get() instead of ['href']: an anchor without an href must not raise and
    # take the whole page down with it.
    site_link = ul.select_one(".site a") if ul is not None else None
    href = site_link.get("href") if site_link is not None else None
    site = href.strip() if href is not None else "No Site"

    if len(site) > _MAX_SITE_URL_LENGTH:
        return None

    # Rating is read from the data-star attribute of the star widget
    star_container = item.select_one(".right .star.hover")
    star_span = (
        star_container.select_one("span[data-star]")
        if star_container is not None
        else None
    )
    rating_stars = star_span["data-star"] if star_span is not None else "No Stars"

    # Votes: text of the first element matching .star.hover.star0 .. star5
    votes = "No Votes"
    right = item.select_one(".right")
    if right is not None:
        for star_num in range(6):
            star_element = right.select_one(f".star.hover.star{star_num}")
            if star_element is not None:
                votes = star_element.text.strip()
                break

    # Restaurant type (Kind): first "a:nth-of-type(2)" match inside .right
    kind_link = right.select_one("a:nth-of-type(2)") if right is not None else None
    kind = _node_text(kind_link, "No Kind")

    # Split the address into up to four positional parts, filling the gaps
    parts = [part.strip() for part in address.split(",")]
    city, district, street, postal_address = (
        parts[i] if i < len(parts) else default
        for i, default in enumerate(_ADDRESS_DEFAULTS)
    )

    return {
        "Name": name,
        "City": city,
        "District": district,
        "Street": street,
        "Postal Address": postal_address,
        "Phone": phone,
        "Votes": votes,
        "Rating Stars": rating_stars,
        "Kind": kind,
        "Site": site,
    }


def scrape_page(page, timeout=15):
    """Scrape a single listing page.

    Args:
        page: Page number placed in the ``page`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        A list of restaurant dicts (possibly empty), or None when the page
        could not be fetched (non-200 status) or any error occurred.
    """
    url = f"{BASE_URL}?city=784&page={page}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        restaurants = []

        # Adjust the ".wrap" selector (and those in _parse_restaurant) if the
        # site's markup changes.
        for item in soup.select(".wrap"):
            record = _parse_restaurant(item)
            if record is not None:
                restaurants.append(record)

        return restaurants

    except Exception as e:
        # Deliberate best-effort boundary: a failing page is reported and
        # skipped so the remaining pages can still be collected.
        print(f"Error scraping page {page}: {e}")
        return None

def scrape_pages_in_parallel(start_page, end_page, max_threads=10):
    """Fetch pages start_page..end_page (inclusive) concurrently.

    Each page is handed to ``scrape_page`` on a thread pool of ``max_threads``
    workers. Results are gathered in submission order and merged into one
    flat list, which is returned. Pages that yield nothing or raise are
    reported on stdout and skipped.
    """
    collected = []
    page_numbers = range(start_page, end_page + 1)

    with ThreadPoolExecutor(max_threads) as pool:
        # Queue every page first, remembering which page each future belongs to
        pending = [(number, pool.submit(scrape_page, number)) for number in page_numbers]

        for number, task in pending:
            try:
                rows = task.result()
                if not rows:
                    # None (failed page) and an empty list are treated alike
                    print(f"No data returned for page {number}.")
                    continue
                collected.extend(rows)
            except Exception as e:
                print(f"Error processing page {number}: {e}")

    return collected

# Run configuration
start_page = 1
end_page = 50  # Last page to request; adjust the range as necessary
max_threads = 10  # Size of the worker thread pool

# Destination file and the column order of the exported table
output_path = os.path.join(output_dir, "scraped_data_team_13.csv")
columns = [
    "Name",
    "City",
    "District",
    "Street",
    "Postal Address",
    "Phone",
    "Votes",
    "Rating Stars",
    "Kind",
    "Site",
]

# Fetch every page in the configured range
print(f"Scraping pages {start_page} to {end_page} with {max_threads} threads...")
all_restaurants = scrape_pages_in_parallel(start_page, end_page, max_threads)

# Build the table and write it straight to a CSV file
df = pd.DataFrame(all_restaurants, columns=columns)
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"Scraped data saved to {output_path}. Total restaurants: {len(df)}.")


Scraping pages 1 to 50 with 10 threads...
Scraped data saved to ./data/scraped_data_team_13.csv. Total restaurants: 471.
