In [None]:
!pip install requests beautifulsoup4




In [None]:
import requests
from bs4 import BeautifulSoup
import json

# URL of the UC Davis Electrical Engineering course catalog
URL = "https://catalog.ucdavis.edu/courses-subject-code/eec/"

# Request the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of parsing
# an error page into an empty course list and saving it.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# List to store extracted course data
course_data = []

# Find all course containers (adjust class if needed)
course_sections = soup.find_all("div", class_="courseblock")

for course in course_sections:
    # Each lookup falls back to "N/A" when the element is missing from the block.

    # Extract Course ID (EEC 234A, EEC 150, etc.)
    course_id_section = course.find("span", class_="text courseblockdetail detail-code margin--span text--semibold text--big")
    course_id = course_id_section.text.strip() if course_id_section else "N/A"

    # Extract Course Title
    title_section = course.find("span", class_="text courseblockdetail detail-title margin--span text--semibold text--big")
    course_title = title_section.text.strip() if title_section else "N/A"

    # Extract Course Units
    units_section = course.find("span", class_="text courseblockdetail detail-hours_html margin--span text--semibold text--big")
    course_units = units_section.text.strip() if units_section else "N/A"

    # Extract Course Description
    desc_section = course.find("div", class_="courseblockextra noindent")
    course_desc = desc_section.text.strip() if desc_section else "N/A"

    # Extract Prerequisites, dropping the leading "Prerequisite(s):" label.
    # replace() is a no-op when the label is absent, so one expression covers
    # both the labelled and unlabelled cases.
    prereq_section = course.find("p", class_="text courseblockdetail detail-prerequisite")
    if prereq_section:
        course_prereqs = prereq_section.text.replace("Prerequisite(s):", "").strip()
    else:
        course_prereqs = "N/A"

    # Store structured course data
    course_data.append({
        "course_id": course_id,
        "title": course_title,
        "units": course_units,
        "description": course_desc,
        "prerequisites": course_prereqs
    })

# Save to JSON for later use (explicit encoding so the file does not depend on
# the platform's default locale)
with open("ucd_eec_courses.json", "w", encoding="utf-8") as f:
    json.dump(course_data, f, indent=4)

print("Scraping complete. Data saved to 'ucd_eec_courses.json'.")


Scraping complete. Data saved to 'ucd_eec_courses.json'.


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Three-letter codes (upper case) that are substituted, one at a time, into the
# catalog URL template below. Kept as a whitespace-separated table and split
# into a list of strings.
majors = """
    AAS ABG ABI ABT ACC AED AGC AHI AMS ANB ANE ANG ANS ANT APC ARB ARE ART ASA AST ATM AVS BAX BCB BCM BIM BIS BIT BPH BPT BST
    CAR CDM CGS CHA CHE CHI CHN CLA CLH CMN CNS COM CRD CRI CST CTS DEB DER DES DRA DSC DVM EAD EAE EAP EAS EBS ECH ECI ECL ECN
    ECS EDU EEC EGG EME EMR EMS ENG ENH ENL ENM ENT ENV EPI ESM ESP ETX EVE EVH EXB EXS FAH FAP FMS FOR FPS FRE FRS FSE FSM FST
    GAS GDB GEL GEO GER GGG GLO GMD GRD GRK GSW HDE HEB HIN HIS HMR HND HNR HON HPH HRT HUM HYD IAD ICL IDI IMD IMM IRE IST ITA
    JPN JST LAT LAW LDA LED LIN LTS MAE MAT MCB MCN MCP MDD MDS MGB MGP MGT MGV MHI MIB MIC MMG MMI MPM MSA MSC MST MUS NAS NEM
    NEP NEU NPB NRS NSC NSU NUB NUT OBG OPT OSU OTO PAS PBG PBI PDF PED PER PFS PHA PHI PHR PHY PLB PLP PLS PMD PMI PMR POL POR
    PSC PSU PSY PTX PUL PUN RAL RDI REL RNU RON RST RUS SAF SAS SOC SPA SPH SSB SSC STA STH STS SUR TAE TCS TTP TXC URD URO UWP
    VEN VET VMB VME VSR WAS WFC WLD WMS
""".split()


# Catalog URL template; the placeholder receives one code from the list above.
BASE_URL = "https://catalog.ucdavis.edu/courses-subject-code/{}/"

def _detail_text(course, tag, css_class):
    """Return the stripped text of the first `tag` with `css_class` inside `course`, or "N/A"."""
    node = course.find(tag, class_=css_class)
    return node.text.strip() if node else "N/A"


def scrape_major_courses(major):
    """Scrapes all courses for a given major and saves to a JSON file.

    Args:
        major: Subject code such as "EEC"; it is lower-cased to build the URL
            and used verbatim in the output filename.

    Returns:
        A list of dicts with the keys "course_id", "title", "units",
        "description" and "prerequisites" (missing fields are "N/A"), or None
        when the page could not be retrieved. On success the same list is
        written to ``ucd_courses_<major>.json`` in the working directory.
    """
    url = BASE_URL.format(major.lower())  # Convert major abbreviation to lowercase for URL
    print(f"Scraping {major} from {url}...")

    # Request the page. A timeout prevents one stalled connection from hanging
    # the whole run, and network errors are reported and skipped the same way
    # as a bad HTTP status instead of aborting every remaining subject.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {major} ({exc}). Skipping...")
        return None

    # Check if page is valid
    if response.status_code != 200:
        print(f"Failed to retrieve {major}. Skipping...")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    course_data = []

    # Every course on the page sits in its own "courseblock" container
    for course in soup.find_all("div", class_="courseblock"):
        # Prerequisites: drop the leading label. replace() is a no-op when the
        # label is absent, so no separate membership check is needed.
        prereq_section = course.find("p", class_="text courseblockdetail detail-prerequisite")
        if prereq_section:
            course_prereqs = prereq_section.text.replace("Prerequisite(s):", "").strip()
        else:
            course_prereqs = "N/A"

        # Store structured data
        course_data.append({
            # Course ID (e.g., ECS 150, EEC 234A)
            "course_id": _detail_text(course, "span", "text courseblockdetail detail-code margin--span text--semibold text--big"),
            "title": _detail_text(course, "span", "text courseblockdetail detail-title margin--span text--semibold text--big"),
            "units": _detail_text(course, "span", "text courseblockdetail detail-hours_html margin--span text--semibold text--big"),
            "description": _detail_text(course, "div", "courseblockextra noindent"),
            "prerequisites": course_prereqs
        })

    # Save each major's course data to a JSON file (explicit encoding so the
    # file does not depend on the platform's default locale)
    filename = f"ucd_courses_{major}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(course_data, f, indent=4)

    print(f"✅ Saved {len(course_data)} courses for {major} in {filename}")

    return course_data

# Loop through each major and scrape courses. scrape_major_courses returns
# None when a page could not be retrieved; collect those codes so the final
# message does not claim success for subjects that were skipped.
failed_majors = []
for major in majors:
    if scrape_major_courses(major) is None:
        failed_majors.append(major)
    time.sleep(1)  # Prevent rate-limiting

print("🎉 Scraping complete for all majors!")
if failed_majors:
    print(f"Skipped {len(failed_majors)} major(s) that could not be retrieved: {', '.join(failed_majors)}")


Scraping AAS from https://catalog.ucdavis.edu/courses-subject-code/aas/...
✅ Saved 75 courses for AAS in ucd_courses_AAS.json
Scraping ABG from https://catalog.ucdavis.edu/courses-subject-code/abg/...
✅ Saved 16 courses for ABG in ucd_courses_ABG.json
Scraping ABI from https://catalog.ucdavis.edu/courses-subject-code/abi/...
✅ Saved 14 courses for ABI in ucd_courses_ABI.json
Scraping ABT from https://catalog.ucdavis.edu/courses-subject-code/abt/...
✅ Saved 27 courses for ABT in ucd_courses_ABT.json
Scraping ACC from https://catalog.ucdavis.edu/courses-subject-code/acc/...
✅ Saved 17 courses for ACC in ucd_courses_ACC.json
Scraping AED from https://catalog.ucdavis.edu/courses-subject-code/aed/...
✅ Saved 19 courses for AED in ucd_courses_AED.json
Scraping AGC from https://catalog.ucdavis.edu/courses-subject-code/agc/...
✅ Saved 3 courses for AGC in ucd_courses_AGC.json
Scraping AHI from https://catalog.ucdavis.edu/courses-subject-code/ahi/...
✅ Saved 94 courses for AHI in ucd_courses_AH

In [None]:
import pandas as pd
import json
import glob
import os

# Directory where the JSON files are stored. The scraper writes them with
# relative filenames, i.e. into the current working directory, so read from
# there instead of a hard-coded "/content/" path that only exists on Colab.
data_dir = os.getcwd()

# Function to clean and process data
def clean_units(units):
    """Reduce a units label such as "(4 units)" or "(1 unit)" to its bare value.

    Parentheses and the " units" / " unit" suffix are removed; any other text
    (for example "N/A") is returned unchanged. Non-string input yields None.
    """
    if not isinstance(units, str):
        return None
    bare = units.replace('(', '').replace(')', '')
    # Remove the plural label first; the singular pass then catches "(1 unit)",
    # which the plural-only replacement would otherwise leave as "1 unit".
    return bare.replace(' units', '').replace(' unit', '')

def clean_prerequisites(prerequisites):
    """Swap every non-breaking space for a plain space; non-string input yields None."""
    if not isinstance(prerequisites, str):
        return None
    # Splitting on the non-breaking space and re-joining with ' ' substitutes
    # each occurrence one-for-one.
    pieces = prerequisites.split('\u00a0')
    return ' '.join(pieces)

def clean_title(title):
    """Delete em dashes from a title and trim surrounding whitespace; non-string input yields None."""
    if isinstance(title, str):
        without_dash = title.replace('\u2014', '')
        return without_dash.strip()
    return None

def classify_course_level(course_id):
    """Classify a course by the number embedded in its ID.

    All digit characters in `course_id` are concatenated and read as one
    integer: below 100 is 'Lower Division', 100-199 is 'Upper Division' and
    200 or above is 'Graduate Level'. Returns 'Unknown' when `course_id` is
    not a string or contains no usable digits (e.g. "N/A").
    """
    # None/NaN are not iterable; without this guard the digit scan raises a
    # TypeError, which the ValueError handler below would not catch.
    if not isinstance(course_id, str):
        return 'Unknown'

    digits = ''.join(ch for ch in course_id if ch.isdigit())
    try:
        course_number = int(digits)
    except ValueError:
        # No digits at all, or digit-like characters that int() rejects
        return 'Unknown'

    if course_number < 100:
        return 'Lower Division'
    if course_number < 200:
        return 'Upper Division'
    return 'Graduate Level'

# Function to process all JSON files
def process_all_courses(data_dir):
    """Load every ``ucd_courses_*.json`` file in `data_dir` into one cleaned DataFrame.

    For each file the 'units', 'title' and 'prerequisites' columns are cleaned,
    a 'course_level' column is derived from 'course_id', and a 'major' column
    is set from the code embedded in the filename. A column whose source is
    missing from the file is filled with None.

    Args:
        data_dir: Directory to search for the per-major JSON files.

    Returns:
        The per-file frames concatenated with a fresh index, ordered by
        filename, or an empty DataFrame when no file matches.
    """
    # glob yields files in arbitrary filesystem order; sorting makes the row
    # order of the result reproducible from run to run.
    json_files = sorted(glob.glob(os.path.join(data_dir, "ucd_courses_*.json")))

    all_courses = []
    for file_path in json_files:
        # "ucd_courses_EEC.json" -> "EEC"
        major = os.path.basename(file_path).replace("ucd_courses_", "").replace(".json", "")

        with open(file_path, 'r', encoding='utf-8') as f:
            courses = json.load(f)

        df = pd.DataFrame(courses)

        # Ensure required columns exist before processing
        df['units'] = df['units'].apply(clean_units) if 'units' in df else None
        df['title'] = df['title'].apply(clean_title) if 'title' in df else None
        df['prerequisites'] = df['prerequisites'].apply(clean_prerequisites) if 'prerequisites' in df else None
        df['course_level'] = df['course_id'].apply(classify_course_level) if 'course_id' in df else None
        df['major'] = major  # Add the major column

        all_courses.append(df)

    return pd.concat(all_courses, ignore_index=True) if all_courses else pd.DataFrame()

# Load every per-major JSON file found in data_dir and combine the cleaned
# records into a single DataFrame.
df_all_courses = process_all_courses(data_dir)

# Preview the first rows of the combined frame; as the cell's last expression,
# the result of head() is what the notebook displays.
df_all_courses.head()


Unnamed: 0,course_id,title,units,description,prerequisites,course_level,major
0,MST 020A,Early Medieval Culture,4,"Learning Activities: Lecture 3 hour(s), Extens...",,Lower Division,MST
1,MST 020B,The Culture of the High Middle Ages,4,"Learning Activities: Lecture 3 hour(s), Extens...",,Lower Division,MST
2,MST 098,Directed Group Study,1-5,Learning Activities: Variable.\nGrade Mode: Pa...,,Lower Division,MST
3,MST 098F,Student Facilitated Course,1-4,Learning Activities: Variable 1-4 hour(s).\nGr...,Consent of instructor.,Lower Division,MST
4,MST 099,Special Study for Undergraduates,1-5,Learning Activities: Variable.\nGrade Mode: Pa...,,Lower Division,MST


In [None]:
# NOTE(review): google.colab is presumably only importable when this notebook
# runs inside Google Colab — confirm before running elsewhere.
from google.colab import sheets
# Build an InteractiveSheet from the combined course DataFrame.
# NOTE(review): the recorded output of this cell is a Google Sheets URL, so this
# presumably creates a spreadsheet in the user's account — confirm.
sheet = sheets.InteractiveSheet(df=df_all_courses)

https://docs.google.com/spreadsheets/d/1uADnqDvhh2fyQLQJNoZCCKQ3wsgZQRcogbvnk_9tJ6w#gid=0
