In [21]:
import re

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Endpoint of the catalog's advanced-search page that every request targets.
base_url = "https://catalog.clemson.edu/search_advanced.php"

# Query-string fields sent with every request. Insertion order is kept the
# same as the order the fields are listed here, since it determines the
# order of the parameters in the generated URL.
common_params = {
    "cur_cat_oid": "40",
    # All of these pagination fields are pinned to "1".
    **dict.fromkeys(("cpage", "ppage", "pcpage", "spage", "tpage"), "1"),
    "search_database": "Search",
    "filter[keyword]": "cpsc",
    "filter[3]": "1",
    "filter[31]": "1",
}

# Course prefixes whose presence causes a line to be dropped from the output.
# Written as whitespace-separated text and split into a set.
prefixes_to_remove = set(
    """
    AAH ACCT AGED AGM AGRB AGSC AL AMFG ANTH APEC
    ARCH ART ASTR AUD AUE AVS BCHM BDSI BE BIOE
    BIOL BMOL BSHS CE CH CHE COMM COOP CRP
    CSM DPA DSA ECE ECON ED EDC EDEC EDEL EDF EDHD
    EDIS EDL EDLL EDLT EDML EDSA EDSC EDSP EES ELE
    ENGL ENR ENSP ENT ESED ETOX FCS FDSC FIN FNPS
    FNR FOR FR GBS GC GEN GEOG GEOL GER GRAD HCC
    HCG HIST HLTH HORT HP HRD HSPV HUM IE INT JUST
    LANG LARC LAW MATH MBA MBIO ME MGT MHA MICR MKT
    MSE MTSA MUSC NURS NUTR PADM PDBE PES PHIL PHYS
    PKSC PLPA POSC POST PRTM PSYC RCID RED REL RIES
    RS RUD SAP SOC SPAN SSCI STAT THEA TSAP TTT VTED
    WCIN WFB WS YDP
    """.split()
)

# Set to store unique CPSC-related lines gathered across all result pages
unique_cpsc_lines = set()

# Splits a line into its maximal runs of ASCII letters so that prefixes are
# compared against whole tokens. A plain substring test would also hit
# prefixes buried inside longer words (e.g. "IE" inside "IEEE") and drop
# valid CPSC lines. This relies on every prefix being purely alphabetic.
letter_runs = re.compile(r"[A-Za-z]+")

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block the script indefinitely.
request_timeout = 30

# Reuse one session so the connection is shared across the page requests.
with requests.Session() as session:
    # Loop through result pages 1 to 15
    for ecpage in range(1, 16):
        # Per-page query: the common parameters plus the page selector
        params = {**common_params, "ecpage": str(ecpage)}

        try:
            response = session.get(base_url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network failure on one page should not abort the whole run.
            print(f"Failed to fetch the website content for ecpage={ecpage}: {exc}")
            continue

        # Only a 200 response is treated as a usable page.
        if response.status_code != 200:
            print(f"Failed to fetch the website content for ecpage={ecpage}.")
            continue

        # Flatten the HTML to text and inspect it line by line.
        soup = BeautifulSoup(response.content, "html.parser")
        page_text = soup.get_text()

        for raw_line in page_text.split('\n'):
            if "CPSC" not in raw_line:
                continue
            line = raw_line.strip()
            # Keep the line only if none of its letter-only tokens is one of
            # the prefixes to remove.
            if prefixes_to_remove.isdisjoint(letter_runs.findall(line)):
                unique_cpsc_lines.add(line)

# Put the collected lines into alphabetical order.
sorted_lines = list(unique_cpsc_lines)
sorted_lines.sort()

# Echo every line on its own row; nothing is printed when no lines were found.
if sorted_lines:
    print("\n".join(sorted_lines))

# Destination workbook for the export.
excel_file_path = "cpsc_related_data.xlsx"

# Single-column table holding the sorted lines.
data = {"CPSC_Related_Lines": sorted_lines}
df = pd.DataFrame(data)

# Write the table without the DataFrame index column.
df.to_excel(excel_file_path, index=False)

print(f"Data saved to {excel_file_path}")


CPSC  6030 - Data Visualization
CPSC  6040 - Computer Graphics Images
CPSC  6050 - Computer Graphics
CPSC  6070 - Applied Computer Vision
CPSC  6110 - Virtual Reality Systems
CPSC  6120 - Eye Tracking Methodology and Applications
CPSC  6140 - Human and Computer Interaction
CPSC  6150 - Mobile Device Software Development
CPSC  6160 - 2-D Game Engine Construction
CPSC  6170 - 3D Game Programming: Algorithms and Techniques
CPSC  6180 - Usable Privacy and Security
CPSC  6190 - Physical Modeling and Animation
CPSC  6200 - Computer Security Principles
CPSC  6240 - System Administration and Security
CPSC  6280 - Design and Implementation of Programming Languages
CPSC  6300 - Applied Data Science
CPSC  6420 - Artificial Intelligence
CPSC  6430 - Machine Learning: Implementation and Evaluation
CPSC  6440 - Cloud Computing Architecture
CPSC  6550 - Computational Science
CPSC  6620 - Database Management Systems
CPSC  6720 - Software Development Methodology
CPSC  6770 - Distributed and Cluster Com