In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import re
from urllib.parse import urljoin

# Listing URL template; "{}" is filled with the 1-based page number.
BASE_URL = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={}&sort_by=4"

# Headers to mimic a browser request
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Regex patterns for extracting course details.
# FEE_PATTERN: optional rupee sign, an amount with up to two decimals, then a
# unit letter L or K (case-insensitive). The unit is the character class [LK];
# writing it as [L|K] would also accept a literal "|" as a unit.
FEE_PATTERN = re.compile(r'₹?\s?\d+(?:\.\d{1,2})?\s?[LK]', re.IGNORECASE)
# DURATION_PATTERN: any text mentioning "year", "years" or "duration".
DURATION_PATTERN = re.compile(r'(years?|duration)', re.IGNORECASE)
# SEATS_PATTERN: one or more digits.
SEATS_PATTERN = re.compile(r'\d+')

# Total pages to scrape
TOTAL_PAGES = 180  # Change this for more pages

# Retry policy used by the session adapter: GET requests that hit one of the
# listed server-error / rate-limit status codes (or a connection problem) are
# re-attempted up to five times, with backoff_factor=1 spacing the attempts.
RETRY_STRATEGY = Retry(
    total=5,
    status_forcelist=[500, 502, 503, 504, 429],
    allowed_methods=["GET"],
    backoff_factor=1,
)

# Create a session with retry adapter
def create_session():
    """Build a ``requests.Session`` configured for the scraper.

    The session sends the module-level ``HEADERS`` on every request and
    routes requests through an ``HTTPAdapter`` that applies
    ``RETRY_STRATEGY``.

    Returns:
        The configured ``requests.Session``. The caller is responsible for
        closing it.
    """
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=RETRY_STRATEGY)
    # Mount on both schemes: a link or redirect that uses plain http://
    # would otherwise fall back to the default adapter and skip the retries.
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    session.headers.update(HEADERS)
    return session

# Download a URL and parse the response body as HTML
def fetch_soup(url, session):
    """Fetch ``url`` through ``session`` and return it as a BeautifulSoup tree.

    The request uses a 10 second timeout and non-success HTTP statuses are
    treated as failures. Any ``requests.RequestException`` is reported on
    stdout and turned into a ``None`` return instead of propagating.
    """
    soup = None
    try:
        page = session.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")
    except requests.RequestException as e:
        print(f"Request failed: {e}")
    return soup

# Extract text utility function
def extract_text(tag, selector, attr=None):
    """Return the text (or an attribute) of the first match of ``selector``.

    Args:
        tag: Object exposing ``select_one(selector)`` (a BeautifulSoup tag).
        selector: CSS selector evaluated relative to ``tag``.
        attr: Optional attribute name. When given, the attribute value is
            returned instead of the element text.

    Returns:
        ``"N/A"`` when nothing matches ``selector``. Otherwise the stripped
        element text, or, when ``attr`` is given, whatever
        ``element.get(attr)`` returns for that attribute.
    """
    element = tag.select_one(selector)
    # Check for a missing element first: the attribute branch must not be
    # reached with element=None, or .get() raises AttributeError.
    if element is None:
        return "N/A"
    if attr:
        return element.get(attr)
    return element.text.strip()

# Parse one course block into its (course, fee, duration, seats) fields
def _parse_course_detail(detail):
    """Extract the course fields from a single ``div.detail`` element.

    Each ``div.course_detail span`` is classified by the module regexes:
    the first span matching ``FEE_PATTERN`` is the fee, the last span
    matching ``DURATION_PATTERN`` is the duration, and the last remaining
    span that starts with digits is the seat count.

    Returns:
        A ``(course, fee, duration, seats)`` tuple of strings, with
        ``"N/A"`` for any field that was not found.
    """
    course_name = extract_text(detail, "h4 a")
    fee = None
    duration, seat_count = "N/A", "N/A"

    for span in detail.select("div.course_detail span"):
        span_text = span.text.strip()
        is_fee = FEE_PATTERN.match(span_text) is not None
        if is_fee and fee is None:
            fee = span_text  # keep the first fee-looking span only

        if DURATION_PATTERN.search(span_text):
            duration = span_text
        elif not is_fee and SEATS_PATTERN.match(span_text):
            # A fee such as "1.5 L" also starts with digits; without the
            # is_fee guard it would overwrite the seat count.
            seat_count = span_text

    return course_name, fee if fee is not None else "N/A", duration, seat_count


# Function to scrape a single page
def scrape_page(page, session):
    """Scrape one listing page and the course pages linked from its cards.

    Args:
        page: Page number substituted into ``BASE_URL``.
        session: Session object passed through to ``fetch_soup``.

    Returns:
        A list of dicts, one per course found, with the keys
        ``"College Name"``, ``"Courses"``, ``"Duration"``,
        ``"Fee Structure"`` and ``"Seats"``. The list is empty when the
        listing page could not be fetched.
    """
    soup = fetch_soup(BASE_URL.format(page), session)
    if soup is None:
        print(f"Page {page} returned no data. Check if the website structure has changed.")
        return []

    data = []
    for card in soup.select("div.card_block"):
        name = extract_text(card, "h3 a")

        for link in card.select("div.combined_block.d-md-none a"):
            href = link.get("href")
            if not href:
                # Anchor without a target: nothing to follow.
                continue

            course_url = urljoin(BASE_URL, href)  # Ensure absolute URL
            course_soup = fetch_soup(course_url, session)
            if course_soup is None:
                continue

            for detail in course_soup.select("div.detail"):
                course_name, fee, duration, seat_count = _parse_course_detail(detail)
                data.append({
                    "College Name": name,
                    "Courses": course_name,
                    "Duration": duration,
                    "Fee Structure": fee,
                    "Seats": seat_count,
                })

    return data


# Run the scraper with a session and ThreadPoolExecutor
def main():
    """Scrape every listing page concurrently and save the rows to CSV.

    Pages ``1..TOTAL_PAGES`` are scraped by a pool of 10 worker threads that
    share one session. The collected rows are written to
    ``Course Details.csv`` and the first rows are printed. When nothing was
    extracted, a diagnostic is printed and no file is written.
    """
    # The context manager closes the session even if a worker raises.
    with create_session() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(
                executor.map(lambda p: scrape_page(p, session), range(1, TOTAL_PAGES + 1))
            )

    # Flatten list and convert to DataFrame
    df = pd.DataFrame([item for sublist in results for item in sublist])

    if df.empty:
        # Do not write an empty CSV or report success when nothing was scraped.
        print("No data extracted! Check selectors or if the website is blocking requests.")
        return

    print(f"Extracted {df.shape[0]} rows.")

    # Save to CSV
    df.to_csv("Course Details.csv", index=False, encoding="utf-8-sig")
    print("Scraping completed! Data saved to Course Details.csv")
    print(df.head())

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/agnel-polytechnic-goa/all-questions
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/azad-polytechnic-azamgarh/all-questions
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/university/centurion-university-of-technology-and-management-bhubaneswar/admission
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/university/centurion-university-of-technology-and-management-bhubaneswar/courses
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/chamarajendra-government-visual-arts-college-mysore/all-questions
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/college-of-veterinary-and-animal-sciences-wayanad/all-questions
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/dpg-polytechnic-gurgaon/a