In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

# Base URL for pagination; the "{}" placeholder is filled with the page number
# via str.format() when a page is scraped.
BASE_URL = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={}&sort_by=3&stream=1"

# Headers to mimic a real browser (to prevent blocking)
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Total number of pages to scrape (pages 1..TOTAL_PAGES inclusive)
TOTAL_PAGES = 3

# Retry strategy shared by every session built from this module
RETRY_STRATEGY = Retry(
    total=5,  # Retry up to 5 times
    # Exponential backoff between retries.
    # NOTE(review): urllib3 documents no delay before the first retry, so the
    # schedule is closer to 0s, 2s, 4s, 8s, ... — confirm for the installed version.
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504, 429],  # Retries on these HTTP errors
    allowed_methods=["GET"],  # Only GET requests are retried
)

def create_session():
    """Return a ``requests.Session`` that retries failed HTTPS requests.

    Every ``https://`` URL requested through the returned session goes
    through an ``HTTPAdapter`` configured with ``RETRY_STRATEGY``.
    """
    retrying_session = requests.Session()
    retrying_session.mount("https://", HTTPAdapter(max_retries=RETRY_STRATEGY))
    return retrying_session

def scrape_page(page, session):
    """Fetch one listing page and extract college names and NIRF ranks.

    Args:
        page: Page number substituted into ``BASE_URL``.
        session: Object whose ``get(url, headers=..., timeout=...)`` returns
            a response exposing ``status_code`` and ``text``.

    Returns:
        A list of dicts with the keys ``"College Name"`` and
        ``"NIRF Ranking"``, one per ``div`` carrying the ``card_block``
        class. A missing name or rank is stored as ``"N/A"``. An empty list
        is returned when the request raises ``requests.RequestException``
        or the response status is not 200.
    """
    page_url = BASE_URL.format(page)

    try:
        response = session.get(page_url, headers=HEADERS, timeout=10)  # 10s timeout
    except requests.RequestException as e:
        print(f"Request failed for page {page}: {e}")
        return []

    # Anything other than a plain 200 is treated as a failed page.
    if response.status_code != 200:
        print(f"Failed to fetch page {page} - Status Code: {response.status_code}")
        return []

    def text_or_na(tag):
        # Missing elements are reported as "N/A" instead of being dropped.
        if tag:
            return tag.get_text(strip=True)
        return "N/A"

    document = BeautifulSoup(response.text, "html.parser")
    rows = []

    for block in document.find_all("div", class_="card_block"):
        name_tag = block.find("h3", class_="college_name d-md-none")
        rank_strip = block.find("div", class_="ranking_strip d-md-none")
        # The rank number sits in a <strong> inside the ranking strip.
        rank_tag = rank_strip.find("strong") if rank_strip else None

        rows.append({
            "College Name": text_or_na(name_tag),
            "NIRF Ranking": text_or_na(rank_tag),
        })

    return rows

# Run scraper with session and ThreadPoolExecutor
def main():
    """Scrape every listing page concurrently and save the results as CSV.

    Pages 1..TOTAL_PAGES are fetched with ``scrape_page`` on a thread pool
    that shares a single session from ``create_session``. The per-page rows
    are flattened, in page order, into a DataFrame that is written to
    ``Ranking.csv`` and previewed on stdout.
    """
    pages = range(1, TOTAL_PAGES + 1)

    # Using the session as a context manager closes it even if a worker raises.
    with create_session() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Materialise the lazy map() iterator here so worker exceptions
            # surface while the session is still open; map() keeps page order.
            results = list(executor.map(lambda p: scrape_page(p, session), pages))

    # Explicit columns keep the CSV header present even when every page failed
    # and no rows were collected.
    df = pd.DataFrame(
        [college for result in results for college in result],
        columns=["College Name", "NIRF Ranking"],
    )
    df.to_csv("Ranking.csv", index=False, encoding="utf-8-sig")

    if df.empty:
        # Make a fully failed run visible instead of reporting silent success.
        print("Warning: no colleges were scraped; Ranking.csv contains only the header row")

    print("Scraping completed! Data saved to Ranking.csv")
    print(df.head())

# Entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Scraping completed! Data saved to Ranking.csv
                                        College Name NIRF Ranking
0  IIT Madras (IITM) - Indian Institute of Techno...            1
1   IIT Delhi - Indian Institute of Technology Delhi            2
2  IIT Bombay - Indian Institute of Technology Bo...            3
3  IIT Kanpur - Indian Institute of Technology Ka...            4
4  IIT Kharagpur - Indian Institute of Technology...            5
