In [2]:

import mysql.connector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
import time

# Open a connection to the local MySQL server and obtain a buffered cursor.
mydb = mysql.connector.connect(host="localhost", user="root", password="")
mycursor = mydb.cursor(buffered=True)

# Bootstrap statements, executed in order: create the database, select it,
# then create the bus_details table if it does not exist yet.
for _bootstrap_sql in (
    "CREATE DATABASE IF NOT EXISTS Redbus_Scrapping",
    "USE Redbus_Scrapping",
    """
CREATE TABLE IF NOT EXISTS bus_details (
    id INT AUTO_INCREMENT PRIMARY KEY,
    route_name TEXT,
    route_link TEXT,
    bus_name TEXT,
    bus_type TEXT,
    departure_time TIME,
    reaching_time TIME,
    total_duration TEXT,
    price DECIMAL(10, 2),
    seats_available INT,
    rating FLOAT
);
""",
):
    mycursor.execute(_bootstrap_sql)

# Persist a single scraped bus record into the bus_details table
def insert_bus_details(mycursor, route_name, route_link, bus_name, bus_type, departure_time, arrival_time, total_duration, price, seats_available, rating):
    """Insert one bus row into ``bus_details`` and commit it.

    The statement is executed on ``mycursor`` with bound parameters;
    ``arrival_time`` is stored in the ``reaching_time`` column. The commit
    (and, on failure, the rollback) is issued on the module-level ``mydb``
    connection. A ``mysql.connector.Error`` is printed and rolled back
    rather than propagated, so the function always returns ``None``.
    """
    row = (
        route_name,
        route_link,
        bus_name,
        bus_type,
        departure_time,
        arrival_time,
        total_duration,
        price,
        seats_available,
        rating,
    )
    insert_stmt = """INSERT INTO bus_details
             (route_name, route_link, bus_name, bus_type, departure_time, reaching_time, total_duration, price, seats_available, rating)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    try:
        mycursor.execute(insert_stmt, row)
        mydb.commit()
    except mysql.connector.Error as db_error:
        print(f"Error: {db_error}")
        mydb.rollback()

# Launch a Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Landing pages of the state transport operators to scrape. Every URL shares
# the same prefix and tracking query; only the operator slug differs.
state_links = [
    f"https://www.redbus.in/online-booking/{operator_slug}/?utm_source=rtchometile"
    for operator_slug in (
        "apsrtc",
        "ksrtc-kerala",
        "tsrtc",
        "ktcl",
        "rsrtc",
        "south-bengal-state-transport-corporation-sbstc",
        "hrtc",
        "astc",
        "uttar-pradesh-state-road-transport-corporation-upsrtc",
        "wbtc-ctc",
    )
]

# Function to retrieve route links and names from the state page
def state_link_route(path):
    """Collect route links and titles from every page of the current listing.

    Operates on the page currently loaded in the module-level ``driver``.
    On each page, the elements matching ``path`` are read, then the
    pagination control is used to move to the next numbered page until no
    further page button can be found.

    Args:
        path: XPath expression selecting the route anchor elements.

    Returns:
        A ``(links, titles)`` tuple of two lists. Both lists always have the
        same length and are index-aligned: ``titles[i]`` is the visible text
        of the anchor whose ``href`` is ``links[i]``. Anchors without an
        ``href`` are skipped entirely.

    Any unexpected exception while reading a page is printed and ends the
    pagination; whatever was collected so far is returned.
    """
    LINKS_ROUTES = []
    ROUTE_TITLE = []
    wait = WebDriverWait(driver, 10)

    while True:
        try:
            for anchor in driver.find_elements(By.XPATH, path):
                href = anchor.get_attribute("href")
                # Record link and title together. Appending titles for
                # anchors whose href was skipped would shift every later
                # title against its link when the caller zips the lists.
                if href:
                    LINKS_ROUTES.append(href)
                    ROUTE_TITLE.append(anchor.text)

            # Paginate through route pages
            try:
                active_page_element = driver.find_element(By.XPATH, "//div[@class='DC_117_pageTabs DC_117_pageActive']")
                active_page_number = active_page_element.text
                next_page_number = str(int(active_page_number) + 1)
                next_page_button_xpath = f"//div[@class='DC_117_paginationTable']//div[text()='{next_page_number}']"
                next_page_button = wait.until(EC.presence_of_element_located((By.XPATH, next_page_button_xpath)))
                # Bring the button into view before clicking it.
                driver.execute_script("arguments[0].scrollIntoView(true);", next_page_button)
                time.sleep(1)
                next_page_button.click()
                print(f"Navigating to page {next_page_number}")
                # Fixed pause to let the next page of routes render.
                time.sleep(10)
            except (NoSuchElementException, TimeoutException):
                # No active-page marker or no button for the next number:
                # the last page has been reached.
                print("No more pages to paginate.")
                break
        except Exception as e:
            print(f"Error occurred: {str(e)}")
            break

    return LINKS_ROUTES, ROUTE_TITLE

# Iterate over each state link and scrape bus data.
# The whole run is wrapped in try/finally so the cursor, the connection and
# the browser are released even when scraping aborts with an exception.
try:
    for state_link in state_links:
        driver.get(state_link)
        time.sleep(3)

        # Retrieve route links and names
        Route_links, Route_names = state_link_route("//a[@class='route']")

        for route_name, link in zip(Route_names, Route_links):
            driver.get(link)  # Load the route page
            time.sleep(2)  # Wait for the page to load

            # Click "View Buses" button; routes where it cannot be clicked
            # are skipped. Exception (not a bare except) keeps this
            # best-effort while still letting Ctrl-C / SystemExit through.
            try:
                driver.find_element(By.XPATH, "//div[@class='button']").click()
            except Exception:
                continue
            time.sleep(2)

            # Scroll through the page to reveal more bus details; stop once a
            # PAGE_DOWN no longer changes the page source.
            while True:
                old_page_source = driver.page_source
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(5)
                if driver.page_source == old_page_source:
                    break

            # Scrape bus details and insert into MySQL
            bus_names = [elem.text for elem in driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']") if elem.text.strip() != '']
            bus_types = [elem.text for elem in driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']") if elem.text.strip() != '']
            start_times = [elem.text for elem in driver.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']") if elem.text.strip() != '']
            end_times = [elem.text for elem in driver.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']") if elem.text.strip() != '']
            total_durations = [elem.text for elem in driver.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']") if elem.text.strip() != '']
            prices = [elem.text for elem in driver.find_elements(By.XPATH, '//div[@class="fare d-block"]//span') if elem.text.strip() != '']
            seats = [elem.text for elem in driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]") if elem.text.strip() != '']
            ratings = [elem.text for elem in driver.find_elements(By.XPATH, "//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']") if elem.text.strip() != '']

            # NOTE(review): the columns are collected independently and then
            # zipped, so a bus missing one field (e.g. no rating) would shift
            # the later values of that column - confirm against the page
            # markup whether per-bus extraction is needed.
            for bus_name, bus_type, dep_time, arr_time, duration, price_text, seat_text, rating_text in zip(bus_names, bus_types, start_times, end_times, total_durations, prices, seats, ratings):
                # Leading token of the seat text is the seat count; the price
                # may carry thousands separators. A malformed row is skipped
                # instead of aborting the whole scrape.
                try:
                    seats_available = int(seat_text.split()[0])
                    price = float(price_text.replace(",", ""))
                except (ValueError, IndexError):
                    print(f"Skipping row with unparseable price/seats: {price_text!r}, {seat_text!r}")
                    continue

                # Attempt to clean the rating string and convert to float
                try:
                    rating_cleaned = float(rating_text.split()[0])
                except (ValueError, IndexError):
                    rating_cleaned = 0.0  # Default to 0.0 if rating is not a number

                insert_bus_details(
                    mycursor, route_name, link, bus_name, bus_type,
                    dep_time, arr_time, duration,
                    price, seats_available,
                    rating_cleaned
                )
finally:
    # Close MySQL connection and WebDriver; the browser is quit even if
    # closing the database handles fails.
    try:
        mycursor.close()
        mydb.close()
    finally:
        driver.quit()

print("Scraping completed successfully.")



Navigating to page 2
Navigating to page 3
Navigating to page 4
Navigating to page 5
No more pages to paginate.
Navigating to page 2
No more pages to paginate.
Navigating to page 2
Navigating to page 3
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
No more pages to paginate.
Navigating to page 2
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
Navigating to page 5
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
Navigating to page 5
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
Navigating to page 5
No more pages to paginate.
Navigating to page 2
Navigating to page 3
Navigating to page 4
No more pages to paginate.
Scraping completed successfully.
