In [None]:
# Scrape the routes and route links of 10 state transport operators and store them in a MySQL database.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
import time
import logging

# Send INFO-and-above log records to scraping.log, each prefixed with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='scraping.log',
)

def setup_database(host="localhost", user="gokuld", password="Guvi", database="redbus"):
    """Connect to MySQL and make sure the ``states`` and ``routes`` tables exist.

    Args:
        host: MySQL server host name.
        user: MySQL account name.
        password: Password for ``user``.
        database: Name of the database to connect to.

    Returns:
        A ``(conn, cursor)`` tuple. The cursor is a ``DictCursor``, so fetched
        rows are dicts keyed by column name.

    Raises:
        Exception: Whatever pymysql raises while connecting or creating the
            tables is re-raised; the connection is closed first so it is not
            leaked.
    """
    # NOTE(review): the defaults preserve the credentials that used to be
    # hard-coded here; prefer passing real credentials in from configuration.
    conn = pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=database,
        charset="utf8mb4",
        cursorclass=pymysql.cursors.DictCursor
    )

    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS states (
                state_id INT AUTO_INCREMENT PRIMARY KEY,
                state_name VARCHAR(255) UNIQUE
            )
        ''')
        # routes references states, so it is created second.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS routes (
                route_id INT AUTO_INCREMENT PRIMARY KEY,
                state_id INT,
                route_name VARCHAR(255),
                route_link TEXT,
                UNIQUE(state_id, route_name),
                FOREIGN KEY (state_id) REFERENCES states(state_id) ON DELETE CASCADE
            )
        ''')
        conn.commit()
    except Exception:
        # Do not leave an open connection behind when schema setup fails.
        conn.close()
        raise

    return conn, cursor

def insert_state(cursor, conn, state_name):
    """Upsert ``state_name`` into ``states`` and return its ``state_id``.

    The insert is committed before the id is read back with a separate
    SELECT on the same name.

    Args:
        cursor: Cursor whose ``fetchone()`` returns a dict-like row.
        conn: Connection used to commit the insert.
        state_name: Value for the ``state_name`` column.

    Returns:
        The ``state_id`` of the row holding ``state_name``.
    """
    upsert_sql = '''
        INSERT INTO states (state_name)
        VALUES (%s)
        ON DUPLICATE KEY UPDATE state_name = VALUES(state_name)
    '''
    lookup_sql = 'SELECT state_id FROM states WHERE state_name = %s'
    params = (state_name,)

    cursor.execute(upsert_sql, params)
    conn.commit()

    cursor.execute(lookup_sql, params)
    row = cursor.fetchone()
    return row['state_id']

def insert_route(cursor, conn, state_id, route_name, route_link):
    """Insert one route row and report whether it was stored.

    Args:
        cursor: Cursor used to run the INSERT.
        conn: Connection used to commit it.
        state_id: Id of the owning row in ``states``.
        route_name: Route label.
        route_link: URL of the route page.

    Returns:
        True when the row was inserted and committed; False when the insert
        raised ``pymysql.IntegrityError``.
    """
    row = (state_id, route_name, route_link)
    try:
        cursor.execute('''
            INSERT INTO routes (state_id, route_name, route_link)
            VALUES (%s, %s, %s)
        ''', row)
        conn.commit()
    except pymysql.IntegrityError:
        return False
    else:
        logging.info(f"Added route: {route_name} with link: {route_link}")
        return True

def scrape_routes(state_name, conn, cursor):
    """Store every route shown on the currently open operator page.

    Reads the module-level ``driver``, which is expected to already be on the
    operator's route listing. Each element with class ``route`` contributes
    its text and ``href``; the function then clicks through the numbered
    pagination tabs until no tab for the next page number exists.

    Args:
        state_name: Operator name; upserted into ``states`` and used in logs.
        conn: Open database connection.
        cursor: Cursor belonging to ``conn``.

    Returns:
        None. Progress and failures are written to the log; scraping errors
        are logged rather than raised.
    """
    # Imported locally so the block does not depend on a file-level import of
    # selenium's exception classes.
    from selenium.common.exceptions import TimeoutException

    state_id = insert_state(cursor, conn, state_name)
    routes_added = 0
    pages_scraped = 0

    try:
        wait = WebDriverWait(driver, 15)

        while True:
            # until() raises TimeoutException instead of returning an empty
            # list, so the "nothing found" case has to be caught explicitly.
            try:
                route_elements = wait.until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'route'))
                )
            except TimeoutException:
                route_elements = []

            if not route_elements:
                if pages_scraped == 0:
                    logging.info(f"No routes found for state: {state_name}")
                    print(f"No routes found for state: {state_name}")
                    return
                # Earlier pages were stored; stop here and still report the total.
                logging.warning(
                    f"Routes did not load on page {pages_scraped + 1} for {state_name}."
                )
                break

            for element in route_elements:
                route_name = element.text.strip()
                route_link = element.get_attribute("href")
                # insert_route() returns False for rows it could not insert
                # (e.g. duplicates), so only new rows are counted.
                if route_name and route_link and insert_route(cursor, conn, state_id, route_name, route_link):
                    routes_added += 1
            pages_scraped += 1

            try:
                pagination_container = driver.find_element(By.CLASS_NAME, 'DC_117_paginationTable')
                page_tabs = pagination_container.find_elements(By.CLASS_NAME, 'DC_117_pageTabs')
                current_page = pagination_container.find_element(By.CLASS_NAME, 'DC_117_pageActive').text.strip()
                next_label = str(int(current_page) + 1)
                # Pick the tab whose label is the next page number, if any.
                next_page = next(
                    (tab for tab in page_tabs if tab.text.strip() == next_label),
                    None,
                )

                if next_page:
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page)
                    time.sleep(1)
                    next_page.click()
                    # Fixed pause to let the next page of routes render.
                    time.sleep(3)
                else:
                    logging.info(f"Pagination ended for {state_name}.")
                    break
            except Exception as e:
                # Pages without a pagination table end up here; treated as the last page.
                logging.info(f"Pagination not found or ended for {state_name}: {e}")
                break

        logging.info(f"Total routes added for {state_name}: {routes_added}")

    except Exception as e:
        logging.error(f"Error scraping routes for {state_name}: {e}")

if __name__ == "__main__":
    # Script entry point: open redbus, follow the link located by the XPath
    # below, then visit each listed operator and store its routes.
    conn, cursor = setup_database()
    # Must stay a module-level name: scrape_routes() reads the global `driver`.
    driver = None

    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-notifications')
        chrome_options.add_argument('--disable-popup-blocking')
        # Created inside the try so a failed browser start still closes the
        # database connection in the finally block.
        driver = webdriver.Chrome(options=chrome_options)

        driver.get('https://www.redbus.in/')
        driver.maximize_window()
        time.sleep(2)
        driver.find_element(By.XPATH, '//*[@id="homeV2-root"]/div[3]/div[1]/div[2]/a').click()
        # Continue in the most recently opened window/tab.
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(2)
        states = [
            "Sikkim Nationalised Transport (SNT)", "KSRTC (Kerala)", "KAAC TRANSPORT",
            "Meghalaya Transport Corporation(MTC)", "RSRTC", "PEPSU (Punjab)",
            "Chandigarh Transport Undertaking (CTU)", "TGSRTC", "HRTC",
            "Assam State Transport Corporation (ASTC)"
        ]

        for state in states:
            try:
                print(f"\nCollecting routes for {state}")
                state_element = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, state))
                )
                print(f"Found state element for: {state}")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", state_element)
                time.sleep(1)
                state_element.click()
                scrape_routes(state, conn, cursor)
                # Return to the operator directory for the next iteration.
                driver.back()
                time.sleep(2)

            except Exception as e:
                # One failing operator must not stop the remaining ones.
                logging.error(f"Error processing state {state}: {e}")
                print(f"Error processing state {state}: {e}")
                continue

    finally:
        # Release both resources even if closing the first one fails.
        try:
            conn.close()
        finally:
            if driver is not None:
                driver.quit()


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pymysql
import time

def setup_database(host="localhost", user="gokuld", password="Guvi", database="redbus"):
    """Connect to MySQL and make sure the ``bus_details`` table exists.

    The table declares a foreign key to ``routes(route_id)``.

    Args:
        host: MySQL server host name.
        user: MySQL account name.
        password: Password for ``user``.
        database: Name of the database to connect to.

    Returns:
        A ``(conn, cursor)`` tuple. The cursor is a ``DictCursor``, so fetched
        rows are dicts keyed by column name.

    Raises:
        Exception: Whatever pymysql raises while connecting or creating the
            table is re-raised; the connection is closed first so it is not
            leaked.
    """
    # NOTE(review): the defaults preserve the credentials that used to be
    # hard-coded here; prefer passing real credentials in from configuration.
    conn = pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=database,
        charset="utf8mb4",
        cursorclass=pymysql.cursors.DictCursor
    )

    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS bus_details (
                id INT AUTO_INCREMENT PRIMARY KEY,
                route_id INT,
                bus_name VARCHAR(255),
                bus_type VARCHAR(255),
                departing_time VARCHAR(50),
                duration VARCHAR(50),
                reaching_time VARCHAR(50),
                star_rating DECIMAL(3,2),
                price DECIMAL(10,2),
                seat_availability INT,
                FOREIGN KEY (route_id) REFERENCES routes(route_id) ON DELETE CASCADE
            )
        ''')
        conn.commit()
    except Exception:
        # Do not leave an open connection behind when schema setup fails.
        conn.close()
        raise

    return conn, cursor

def fetch_routes(cursor):
    """Return every row of the ``routes`` table.

    Args:
        cursor: Cursor to run the query on.

    Returns:
        Whatever ``cursor.fetchall()`` yields for ``SELECT * FROM routes``.
    """
    select_all_routes = "SELECT * FROM routes"
    cursor.execute(select_all_routes)
    rows = cursor.fetchall()
    return rows

def insert_bus_details(cursor, conn, route_id, bus_data):
    """Insert one scraped bus into ``bus_details`` and commit.

    Args:
        cursor: Cursor used to run the INSERT.
        conn: Connection used to commit it.
        route_id: Id of the route the bus belongs to.
        bus_data: Mapping holding the keys listed in ``field_order`` below.

    Returns:
        True when the row was inserted and committed; False when anything
        raised (the error is printed, not re-raised).
    """
    # Column order after route_id; must line up with the INSERT column list.
    field_order = (
        'bus_name', 'bus_type', 'departing_time', 'duration',
        'reaching_time', 'star_rating', 'price', 'seat_availability',
    )
    try:
        query = '''
            INSERT INTO bus_details (
                route_id, bus_name, bus_type, departing_time, duration,
                reaching_time, star_rating, price, seat_availability
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        '''
        row = (route_id,) + tuple(bus_data[field] for field in field_order)
        cursor.execute(query, row)
        conn.commit()
    except Exception as e:
        print(f"Error inserting bus details: {e}")
        return False
    return True

def click_view_buses(driver, wait):
    """Click the first clickable 'View Buses' button, trying several locators.

    Args:
        driver: WebDriver for the page; not used directly, kept so existing
            callers keep working.
        wait: ``WebDriverWait`` used to wait for each locator to be clickable.

    Returns:
        True after the first successful click (followed by a 3-second pause);
        False when no locator produced a clickable element.
    """
    selectors = [
        (By.CLASS_NAME, "button"),
        (By.XPATH, "//div[contains(@class, 'button') and contains(text(), 'View Buses')]"),
        # XPath form of the jQuery-style "div.button:contains('View Buses')":
        # `:contains` is not valid CSS, so the old CSS locator could never match.
        (By.XPATH, "//div[contains(concat(' ', normalize-space(@class), ' '), ' button ')"
                   " and contains(., 'View Buses')]"),
    ]

    for by, selector in selectors:
        try:
            button = wait.until(EC.element_to_be_clickable((by, selector)))
            button.click()
        except Exception:
            # Best effort: fall through to the next locator. `Exception`
            # (not a bare except) lets Ctrl-C still interrupt the run.
            continue
        # Fixed pause to let the bus list expand after the click.
        time.sleep(3)
        return True
    return False

def _scroll_to_page_bottom(driver):
    """Scroll down repeatedly until the document height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _extract_bus_data(bus_item):
    """Build the bus_details field dict from a single bus element.

    Every lookup is scoped to ``bus_item`` so each row describes its own bus.
    The five text fields are required (a missing one raises
    ``NoSuchElementException``); rating, price and seats fall back to 0.
    """
    def text_of(class_name):
        return bus_item.find_element(By.CLASS_NAME, class_name).text

    bus_data = {
        'bus_name': text_of("travels"),
        'bus_type': text_of("bus-type"),
        'departing_time': text_of("dp-time"),
        'duration': text_of("dur"),
        'reaching_time': text_of("bp-time"),
        'star_rating': 0.0,
        'price': 0.0,
        'seat_availability': 0
    }

    try:
        rating_element = bus_item.find_element(By.CLASS_NAME, "rating")
        rating_text = rating_element.find_element(By.TAG_NAME, "span").text
        bus_data['star_rating'] = float(rating_text) if rating_text else 0.0
    except (NoSuchElementException, ValueError):
        pass  # optional field: keep the 0.0 default

    try:
        price_element = bus_item.find_element(By.CLASS_NAME, "fare")
        price_text = price_element.text.replace("INR ", "").replace(",", "")
        bus_data['price'] = float(price_text) if price_text else 0.0
    except (NoSuchElementException, ValueError):
        pass  # optional field: keep the 0.0 default

    try:
        seats_element = bus_item.find_element(By.CLASS_NAME, "seat-left")
        # The leading token of the text is taken as the seat count.
        seats_text = seats_element.text.split()[0]
        bus_data['seat_availability'] = int(seats_text) if seats_text else 0
    except (NoSuchElementException, ValueError, IndexError):
        pass  # optional field: keep the 0 default

    return bus_data


def scrape_bus_details(driver, route_id, route_link, cursor, conn):
    """Open a route page and store every bus listed on it.

    Loads ``route_link``, tries to click a 'View Buses' button, scrolls until
    the page stops growing, then inserts one ``bus_details`` row per bus
    element found.

    Args:
        driver: Selenium WebDriver used to load and read the page.
        route_id: Id of the route the buses belong to.
        route_link: URL of the route page.
        cursor: Database cursor passed on to ``insert_bus_details``.
        conn: Database connection passed on to ``insert_bus_details``.

    Returns:
        None. Progress and errors are printed; no exception is propagated.
    """
    try:
        driver.get(route_link)
        time.sleep(5)

        wait = WebDriverWait(driver, 5)

        if click_view_buses(driver, wait):
            print(f"Clicked 'View Buses' button for route ID: {route_id}")
        else:
            print(f"No 'View Buses' button found for route ID: {route_id}")

        _scroll_to_page_bottom(driver)

        try:
            bus_items = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "div.bus-item, li.row-sec.clearfix")
            ))

            if not bus_items:
                print(f"No buses found for route ID: {route_id}")
                return

            print(f"Found {len(bus_items)} buses for route ID: {route_id}")

            buses_processed = 0
            for bus_item in bus_items:
                try:
                    # Read fields from this bus element only; a page-wide
                    # lookup would return the first bus's values every time.
                    bus_data = _extract_bus_data(bus_item)

                    if insert_bus_details(cursor, conn, route_id, bus_data):
                        buses_processed += 1
                        print(f"Processed {buses_processed}/{len(bus_items)} buses for route ID: {route_id}")

                except Exception as e:
                    # One unreadable bus must not abort the rest of the route.
                    print(f"Error processing bus: {e}")
                    continue

            print(f"Successfully processed {buses_processed} buses for route ID: {route_id}")

        except TimeoutException:
            print(f"Timeout waiting for buses to load for route ID: {route_id}")
        except Exception as e:
            print(f"Error finding bus items: {e}")

    except Exception as e:
        print(f"Error processing route ID {route_id}: {e}")

def main():
    """Scrape bus details for every route stored in the database.

    Opens the database and a Chrome session, calls ``scrape_bus_details`` for
    each row returned by ``fetch_routes``, and always releases both resources.
    Errors during the scraping loop are printed rather than raised.
    """
    conn, cursor = setup_database()

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-notifications')
    chrome_options.add_argument('--disable-popup-blocking')
    chrome_options.add_argument('--start-maximized')

    try:
        driver = webdriver.Chrome(options=chrome_options)
    except Exception:
        # The browser failed to start: close the connection opened above
        # before letting the error propagate.
        conn.close()
        raise

    try:
        routes = fetch_routes(cursor)
        total_routes = len(routes)
        print(f"Found {total_routes} routes in the database.")

        for index, route in enumerate(routes, 1):
            route_id = route['route_id']
            route_name = route['route_name']
            route_link = route['route_link']

            print(f"\nProcessing route {index}/{total_routes}: {route_name} (ID: {route_id})")
            scrape_bus_details(driver, route_id, route_link, cursor, conn)
            # Fixed pause between consecutive route pages.
            time.sleep(3)

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        # Quit the browser even if closing the connection raises.
        try:
            conn.close()
        finally:
            driver.quit()


if __name__ == "__main__":
    main()
