In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
def get_details(college_name, timeout=10):
    """
    Looks up a college on NCES College Navigator and scrapes its basic details.

    Sends a GET request to the College Navigator URL with the college name as
    the ``q`` query parameter, then reads the ID and location elements from the
    returned HTML.

    Args:
        college_name: Name of the college to search for.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection blocks the caller forever.

    Returns:
        A dict with the keys "college_id", "zipcode", "city" and "state", or
        None when the request fails, the expected elements are missing, or the
        location text cannot be split into city and state. A message is
        printed in every None case.
    """
    base_url = "https://nces.ed.gov/collegenavigator/?"
    params = {
        "q": college_name,
        "s": 1}

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses.
        print(f"Error fetching details for '{college_name}': {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    college_id_element = soup.find("div", id="school_id_num")
    # NOTE(review): the zipcode and the city/state were both read from the
    # first <span class="primary_info_text">, so "zipcode" holds that span's
    # full text rather than an isolated ZIP. The lookup is done once here;
    # confirm the real selectors against the live page markup.
    location_element = soup.find("span", class_="primary_info_text")

    if college_id_element is not None and location_element is not None:
        location_text = location_element.text.strip()
        city_state = location_text.split(", ")

        # Text without a ", " separator would otherwise raise IndexError and
        # abort the caller's whole loop; treat it as "details unavailable".
        if len(city_state) >= 2:
            return {
                "college_id": college_id_element.text.strip(),
                "zipcode": location_text,
                "city": city_state[0],
                "state": city_state[1]
            }

    print(f"College '{college_name}' not found or details unavailable on NCES website.")
    return None


In [5]:
def main():
    """
    Reads college names from two CSV files, scrapes details from NCES College Navigator, and saves combined data to a new CSV file.

    Names come from the "INSTNM" column of List1.csv and the "inst" column of
    processed_List2.csv. Missing names are skipped and duplicates are removed
    while keeping first-seen order, so repeated runs query the same names in
    the same order. Colleges for which get_details returns nothing are left
    out of the output file.
    """

    # Read college names from CSV files (replace with your file paths)
    df1 = pd.read_csv("List1.csv")
    df2 = pd.read_csv("processed_List2.csv")

    # Combine college names from both files, dropping blanks (NaN) and repeats
    college_names = (
        pd.concat([df1["INSTNM"], df2["inst"]], ignore_index=True)
        .dropna()
        .drop_duplicates()
        .tolist()
    )

    # Collect one record per successfully scraped college. Building the
    # DataFrame once at the end avoids DataFrame.append, which was removed in
    # pandas 2.0 and copied the whole frame on every call.
    records = []
    for college_name in college_names:
        details = get_details(college_name)

        if details:  # Add details only if successfully scraped
            records.append({"college_name": college_name, **details})

    # Explicit columns keep the header and column order even when nothing was scraped
    college_data = pd.DataFrame(
        records,
        columns=["college_name", "college_id", "zipcode", "city", "state"],
    )

    # Save combined data to a new CSV file
    college_data.to_csv("combined_college_data.csv", index=False)

    print("College details scraped and saved to 'combined_college_data.csv'.")

# Entry point: run the full scrape only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    main()


College 'Western Connecticut State University' not found or details unavailable on NCES website.
College 'Atlanta Metropolitan State College' not found or details unavailable on NCES website.
College 'Carlow University' not found or details unavailable on NCES website.
College 'Ferrum College' not found or details unavailable on NCES website.
College 'Marymount Manhattan College' not found or details unavailable on NCES website.
College 'Premier Barber School' not found or details unavailable on NCES website.
College 'Academy of Professional Cosmetology' not found or details unavailable on NCES website.
College 'Beal University' not found or details unavailable on NCES website.
College 'United Theological Seminary' not found or details unavailable on NCES website.
College 'Monmouth University' not found or details unavailable on NCES website.
College 'Fountain of Youth Academy of Cosmetology' not found or details unavailable on NCES website.
College 'University of the Cumberlands' not 

KeyboardInterrupt: 