In [1]:
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import re
from urllib.parse import urljoin

# Listing URL template for engineering colleges; "{}" is filled with the page
# number via str.format() (pages are requested starting from 1).
BASE_URL = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={}&sort_by=4"

# Default headers applied to every request made through the session.
# A browser-like User-Agent is sent instead of the library default.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Number of listing pages to scrape (pages 1..TOTAL_PAGES inclusive).
TOTAL_PAGES = 180  # Change this for more pages

# Retry policy handed to the HTTPAdapter built in create_session().
RETRY_STRATEGY = Retry(
    total=5,  # Retry up to 5 times
    backoff_factor=1,  # Exponential backoff between attempts (exact delays depend on the installed urllib3 version)
    status_forcelist=[500, 502, 503, 504, 429],  # Retry on these HTTP errors
    allowed_methods=["GET"],  # Only apply retries to GET requests
)

# Create a session with retry adapter
def create_session():
    """Build a requests session preconfigured for scraping.

    The session carries the default HEADERS and an HTTPAdapter that applies
    RETRY_STRATEGY. The adapter is mounted for both "https://" and "http://"
    so that plain-HTTP requests (e.g. a redirect or an absolute http link)
    get the same retry behaviour as HTTPS ones.

    Returns:
        requests.Session: the configured session. The caller is responsible
        for closing it.
    """
    session = requests.Session()
    session.headers.update(HEADERS)

    adapter = HTTPAdapter(max_retries=RETRY_STRATEGY)
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    return session

# Function to fetch and parse HTML content
def fetch_soup(url, session):
    """Download ``url`` with ``session`` and parse the body as HTML.

    Args:
        url: Address of the page to fetch.
        session: Object exposing ``get(url, timeout=...)``, normally the
            requests session built by create_session().

    Returns:
        A BeautifulSoup tree built with the "html.parser" backend, or None
        when the request fails or the response has an error status (the
        failure is printed, not raised).
    """
    try:
        # Give up on a single attempt after 10 seconds.
        page = session.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        page.raise_for_status()
        markup = page.text
        return BeautifulSoup(markup, "html.parser")
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return None

# Extract text utility function
def extract_text(tag, selector, attr=None):
    """Return the text or an attribute of the first element matching a selector.

    Args:
        tag: Parsed element (or document) exposing ``select_one``.
        selector: CSS selector passed to ``tag.select_one``.
        attr: Optional attribute name. When given, the attribute value is
            returned instead of the element's text.

    Returns:
        The stripped text of the matched element, or the value of ``attr``
        when ``attr`` is given. "N/A" is returned when nothing matches the
        selector or when the requested attribute is absent.
    """
    element = tag.select_one(selector)
    # Check for a missing element first: the previous one-line conditional
    # only guarded the text branch, so a missing element combined with
    # ``attr`` raised AttributeError on None.
    if element is None:
        return "N/A"
    if attr:
        return element.get(attr, "N/A")
    return element.text.strip()

# Extract first <p> element from the college description page
def extract_first_paragraph(soup):
    """Return the text of the first <p> inside the page's "about" block.

    The block is looked up as a <div> whose ``data-testid`` attribute equals
    "college_about".

    Args:
        soup: Parsed page exposing ``find``.

    Returns:
        The stripped paragraph text, or "N/A" when either the block or a
        paragraph inside it cannot be found.
    """
    about_block = soup.find("div", {"data-testid": "college_about"})
    paragraph = about_block.find("p") if about_block else None
    if not paragraph:
        return "N/A"
    return paragraph.text.strip()

# Function to scrape a single page
def scrape_page(page, session):
    """Scrape one listing page and the detail page of every college on it.

    Args:
        page: Page number substituted into BASE_URL.
        session: Session used for the listing request and for each
            per-college detail request.

    Returns:
        A list of dicts with the keys "College Name" and "Description", one
        per ``div.card_block`` found on the listing page. An empty list is
        returned when the listing page itself could not be fetched.
    """
    soup = fetch_soup(BASE_URL.format(page), session)
    if not soup:
        print(f"Page {page} returned no data. Check if the website structure has changed.")
        return []

    data = []
    for card in soup.select("div.card_block"):
        # Look the title link up once; both the name and the detail URL come
        # from it. A card without a link must not abort the whole page.
        link = card.select_one("h3 a")
        name = link.text.strip() if link is not None else "N/A"
        href = link.get("href") if link is not None else None

        college_description = "N/A"
        if href:
            # urljoin resolves relative hrefs against the listing URL and
            # leaves absolute ones untouched.
            college_soup = fetch_soup(urljoin(BASE_URL, href), session)
            if college_soup:
                college_description = extract_first_paragraph(college_soup)
        # Without an href there is no detail page to request, so the
        # description stays "N/A" and no network call is made.

        # Store extracted data in a structured format
        data.append({
            "College Name": name,
            "Description": college_description,  # Extracted first <p> element
        })

    return data

# Run the scraper with a session and ThreadPoolExecutor
def main():
    """Scrape all listing pages concurrently and write the result to CSV.

    Pages 1..TOTAL_PAGES are scraped on a pool of 10 worker threads that
    share one session. The rows are collected into a DataFrame, saved as
    "College Info.csv" and the first rows are printed.
    """
    # The session is used as a context manager so it is closed even when a
    # worker raises; the executor is the inner context, so all workers have
    # finished before the session is released.
    with create_session() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(lambda p: scrape_page(p, session), range(1, TOTAL_PAGES + 1)))

    # Flatten the per-page lists and convert to a DataFrame
    df = pd.DataFrame([item for sublist in results for item in sublist])

    # Debug: Check if DataFrame is empty
    if df.empty:
        print("No data extracted! Check selectors or if the website is blocking requests.")
    else:
        print(f"Extracted {df.shape[0]} rows.")

    # Save to CSV ("utf-8-sig" writes a BOM at the start of the file)
    df.to_csv("College Info.csv", index=False, encoding="utf-8-sig")
    print("Scraping completed! Data saved to College Info.csv")
    print(df.head())

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Request failed: HTTPSConnectionPool(host='www.careers360.com', port=443): Max retries exceeded with url: /colleges/dev-samaj-college-for-women-chandigarh (Caused by ResponseError('too many 500 error responses'))
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/ha-degree-college-farrukhabad
Request failed: HTTPSConnectionPool(host='www.careers360.com', port=443): Max retries exceeded with url: /colleges/indo-global-college-of-engineering-mohali (Caused by ResponseError('too many 500 error responses'))
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/ramappa-engineering-college-waranga
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/colleges/school-of-architecture-and-planning-gd-goenka-university-gurugram
Request failed: 404 Client Error: Not Found for url: https://www.careers360.com/university/7-star-academy-kolkata
Request failed: HTTPSConnectionPool(host='www.careers360.com