In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

# Listing URL template; "{}" is replaced with the page number to fetch.
BASE_URL = (
    "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={}&sort_by=4"
)

# Request headers sent with every fetch (browser-like User-Agent).
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

# Number of listing pages to walk.
TOTAL_PAGES = 180

# Retry policy handed to the HTTPAdapter built in create_session().
RETRY_STRATEGY = Retry(
    total=5,  # Retry up to 5 times
    backoff_factor=1,  # Exponential backoff between retries; the exact delay schedule depends on the installed urllib3 version -- TODO confirm
    status_forcelist=[500, 502, 503, 504, 429],  # Retry on these HTTP status codes
    allowed_methods=["GET"],  # Apply only to GET requests
)

def create_session():
    """Build a ``requests.Session`` whose requests follow ``RETRY_STRATEGY``.

    The retry adapter is mounted for both ``https://`` and ``http://`` so that
    absolute links using either scheme get the same retry policy.

    Returns:
        A new ``requests.Session``. The caller is responsible for closing it.
    """
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=RETRY_STRATEGY)
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    return session

def fetch_soup(url, session):
    """Fetch ``url`` with ``session`` and parse the response body as HTML.

    Args:
        url: Address to request with GET.
        session: Object exposing ``get(url, headers=..., timeout=...)``,
            normally the session returned by ``create_session()``.

    Returns:
        A ``BeautifulSoup`` tree for a 200 response, otherwise ``None``. Both
        request failures and non-200 responses are reported on stdout, so a
        ``None`` result is never silent.
    """
    # Keep the try body to the one call that can raise RequestException.
    try:
        response = session.get(url, headers=HEADERS, timeout=10)  # 10s timeout
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"Unexpected status {response.status_code} for {url}")
        return None

    return BeautifulSoup(response.text, "html.parser")

def extract_text(tag, selector, attr=None):
    """Return text or an attribute of the first element matching ``selector``.

    Args:
        tag: Object exposing ``select_one(selector)`` (e.g. a BeautifulSoup tag).
        selector: CSS selector passed to ``tag.select_one``.
        attr: Optional attribute name. When given, the attribute value is
            returned instead of the element text.

    Returns:
        ``"N/A"`` when no element matches. Otherwise the stripped element text,
        or ``element.get(attr)`` when ``attr`` is given (which is whatever
        ``get`` returns for an absent attribute).
    """
    element = tag.select_one(selector)
    # Check for a missing element first: the original one-line conditional
    # tested ``attr`` before ``element`` and crashed on ``None.get(attr)``.
    if element is None:
        return "N/A"
    if attr:
        return element.get(attr)
    return element.text.strip()

def _fetch_facilities(card, session):
    """Return the comma-separated facilities for one college card, or "N/A"."""
    # Use .get so anchors without an href are skipped instead of raising KeyError.
    hrefs = (a.get("href") for a in card.select("div.d-none.d-md-block a"))
    link = next((href for href in hrefs if href and "facilities" in href), None)
    if not link:
        return "N/A"

    # Relative links are resolved against the main site host.
    url = link if link.startswith("http") else f"https://www.careers360.com{link}"
    facilities_soup = fetch_soup(url, session)
    if not facilities_soup:
        return "N/A"

    names = [f.text.strip() for f in facilities_soup.select("span.facilities_name")]
    # A facilities page with no matching spans gets the same placeholder as a
    # missing page, rather than an empty string.
    return ", ".join(names) or "N/A"


def scrape_page(page, session):
    """Scrape one listing page and return a row dict per college card.

    Args:
        page: Page number substituted into ``BASE_URL``.
        session: Session used for the listing page and for each card's
            facilities page.

    Returns:
        A list of dicts with the keys "College Name", "Location",
        "College Type", "Facilities" and "Rating". The list is empty when the
        listing page could not be fetched. Fields that cannot be found hold
        "N/A".
    """
    soup = fetch_soup(BASE_URL.format(page), session)
    if not soup:
        return []

    data = []
    for card in soup.select("div.card_block"):
        data.append({
            "College Name": extract_text(card, "h3 a"),
            "Location": extract_text(card, "div.content_block.d-block.d-md-none span"),
            "College Type": extract_text(card, "div.content_block.d-none.d-md-block span:nth-of-type(2)"),
            # Costs one extra request per card, and only when a link exists.
            "Facilities": _fetch_facilities(card, session),
            "Rating": extract_text(card, "span.star_text"),
        })

    return data

def main():
    """Scrape every listing page concurrently and save the rows to a CSV file.

    Pages 1..TOTAL_PAGES are fetched on a thread pool that shares one session.
    The combined rows are written to "College Details.csv" and the first rows
    are printed. When nothing was extracted, a diagnostic is printed and no
    file is written, so an earlier good export is not replaced by an empty one.
    """
    # The context managers guarantee the session and the pool are released
    # even if a worker raises.
    with create_session() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(lambda p: scrape_page(p, session), range(1, TOTAL_PAGES + 1)))

    # Flatten the per-page lists into one table.
    df = pd.DataFrame([item for sublist in results for item in sublist])

    if df.empty:
        print("No data extracted! Check selectors or if the website is blocking requests.")
        return

    print(f"Extracted {df.shape[0]} rows.")

    # Save to CSV
    df.to_csv("College Details.csv", index=False, encoding="utf-8-sig")
    print("Scraping completed! Data saved to College Details.csv")
    print(df.head())

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


✅ Extracted 440 rows.
Scraping completed! Data saved to Course Detail.csv
                                        College Name  \
0  AAA College of Engineering and Technology, Siv...   
1  AAA College of Engineering and Technology, Siv...   
2  AAA College of Engineering and Technology, Siv...   
3  AAA College of Engineering and Technology, Siv...   
4  AAA College of Engineering and Technology, Siv...   

                   Location College Type Rating  \
0  Virudhunagar, Tamil Nadu      Private  5.0/5   
1  Virudhunagar, Tamil Nadu      Private  5.0/5   
2  Virudhunagar, Tamil Nadu      Private  5.0/5   
3  Virudhunagar, Tamil Nadu      Private  5.0/5   
4  Virudhunagar, Tamil Nadu      Private  5.0/5   

                                        Courses Duration Fee Structure  \
0           BE Computer Science and Engineering  4 Years           N/A   
1                     BE Mechanical Engineering  4 Years           N/A   
2  BE Electronics and Communication Engineering  4 Years    