In [11]:
!pip install reportlab





**Web Scraping Data from** https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/

In [31]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Function to fetch hyperlinks from a URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract hyperlinks using BeautifulSoup
        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href and not href.startswith('#'):
                full_url = urllib.parse.urljoin(url, href)
                if full_url.startswith(('http://', 'https://')):
                    links.add(full_url)
        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Main page URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'

# Fetch hyperlinks from the main page only
all_hyperlinks = fetch_hyperlinks(main_url)
num_hyperlinks = len(all_hyperlinks)
print(f"Total unique hyperlinks found on the main page: {num_hyperlinks}")

# Save hyperlinks to a text file
output_file = 'hyperlinks.txt'
with open(output_file, 'w') as file:
    for hyperlink in all_hyperlinks:
        file.write(hyperlink + "\n")

print(f"All hyperlinks saved to {output_file}")


Total unique hyperlinks found on the main page: 87
All hyperlinks saved to hyperlinks.txt


In [29]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Function to fetch and filter specific hyperlinks from a URL
def fetch_specific_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract specific hyperlinks based on patterns or attributes
        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Filter links based on specific criteria
            if href and not href.startswith('#') and 'faculty' in href:
                full_url = urllib.parse.urljoin(url, href)
                if full_url.startswith(('http://', 'https://')):
                    links.add(full_url)

        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Main page URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'

# Fetch specific hyperlinks from the main page
specific_hyperlinks = fetch_specific_hyperlinks(main_url)
num_specific_hyperlinks = len(specific_hyperlinks)
print(f"Total specific hyperlinks fetched from the main page: {num_specific_hyperlinks}")

# Print a sample of the hyperlinks
print("Sample specific hyperlinks:")
for hyperlink in specific_hyperlinks[:10]:  # Print first 10 hyperlinks
    print(hyperlink)
output_file = '/content/website_hyperlinks.txt'
with open(output_file, 'w') as file:
    for hyperlink in specific_hyperlinks:
        file.write(hyperlink + "\n")

print(f"All specific hyperlinks saved to {output_file}")


Total specific hyperlinks fetched from the main page: 10
Sample specific hyperlinks:
https://www.csus.edu/faculty/m/pmuyan/
https://www.csus.edu/faculty/j/jiny/
https://www.csus.edu/faculty/s/asalem/
https://www.csus.edu/faculty/b/badruddoja/
https://www.csus.edu/faculty/o/jouyang/
https://www.csus.edu/faculty/f/faroughi/
https://www.csus.edu/faculty/z/zhangc/
https://www.csus.edu/faculty/a/arad/
https://www.csus.edu/faculty/s/shankar.swamy/
https://www.csus.edu/faculty/s/ghassan.shobaki
All specific hyperlinks saved to /content/website_hyperlinks.txt


In [30]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Function to fetch and filter specific hyperlinks from a URL
def fetch_specific_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract specific hyperlinks based on patterns or attributes
        links = set()
        for a in soup.find_all('a', href=True, attrs={'aria-describedby': True}):
            href = a['href']
            aria_describedby = a.get('aria-describedby', '')
            # Filter links based on specific criteria
            if href and not href.startswith('#'):
                full_url = urllib.parse.urljoin(url, href)
                # Include URLs with specific patterns in the aria-describedby attribute
                if 'department' in aria_describedby:
                    if full_url.startswith(('http://', 'https://')):
                        links.add(full_url)

        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Main page URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'

# Fetch specific hyperlinks from the main page
specific_hyperlinks = fetch_specific_hyperlinks(main_url)
num_specific_hyperlinks = len(specific_hyperlinks)
print(f"Total specific hyperlinks fetched from the main page: {num_specific_hyperlinks}")

# Print a sample of the hyperlinks
print("Sample specific hyperlinks:")
for hyperlink in specific_hyperlinks[:10]:  # Print first 10 hyperlinks
    print(hyperlink)
output_file = '/content/specific_hyperlinks.txt'
with open(output_file, 'w') as file:
    for hyperlink in specific_hyperlinks:
        file.write(hyperlink + "\n")

print(f"All specific hyperlinks saved to {output_file}")

Total specific hyperlinks fetched from the main page: 2
Sample specific hyperlinks:
https://athena.ecs.csus.edu/~chenh/index.html
https://www.csus.edu/faculty/o/jouyang/
All specific hyperlinks saved to /content/specific_hyperlinks.txt


In [32]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Function to read URLs from a text file
def read_urls_from_file(filename):
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        return urls
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return []

# Function to fetch and parse data from a URL
def scrape_data_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        # Example of extracting main content - adjust as needed
        main_content = soup.get_text(separator='\n', strip=True)
        return main_content
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return ""
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return ""

# File names
files = ['specific_hyperlinks.txt', 'website_hyperlinks.txt', 'hyperlinks.txt']
urls = []

# Collect all URLs from the files
for file in files:
    urls.extend(read_urls_from_file(file))

# Add the main URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'
urls.append(main_url)

# Scrape data from all URLs and save to a file
output_file = 'scraped_data.txt'
with open(output_file, 'w') as file:
    for url in urls:
        print(f"Scraping data from {url}")
        data = scrape_data_from_url(url)
        if data:
            file.write(f"URL: {url}\n")
            file.write(data + "\n\n")

print(f"Scraped data saved to {output_file}")


Scraping data from https://athena.ecs.csus.edu/~chenh/index.html
Scraping data from https://www.csus.edu/faculty/o/jouyang/
Scraping data from https://www.csus.edu/faculty/m/pmuyan/
Scraping data from https://www.csus.edu/faculty/j/jiny/
Scraping data from https://www.csus.edu/faculty/s/asalem/
Scraping data from https://www.csus.edu/faculty/b/badruddoja/
Scraping data from https://www.csus.edu/faculty/o/jouyang/
Scraping data from https://www.csus.edu/faculty/f/faroughi/
Scraping data from https://www.csus.edu/faculty/z/zhangc/
Scraping data from https://www.csus.edu/faculty/a/arad/
Scraping data from https://www.csus.edu/faculty/s/shankar.swamy/
Scraping data from https://www.csus.edu/faculty/s/ghassan.shobaki
Scraping data from https://www.csus.edu/faculty/m/pmuyan/
Scraping data from https://www.tiktok.com/@sacstate
Scraping data from https://www.csus.edu/student-life/student-organizations/sports-recreation/index.html
Scraping data from http://directory.csus.edu
Scraping data from 

In [41]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

def fetch_professor_details(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        professors = []

        # Adjust this selector to match the actual professor information container
        for group_member in soup.find_all('div', class_='group-member faculty-full-time'):
            name = group_member.find('h3').get_text(strip=True) if group_member.find('h3') else 'N/A'
            job_title = group_member.find('p', class_='job-title').get_text(strip=True) if group_member.find('p', class_='job-title') else 'N/A'

            contact_block = group_member.find('ul', class_='contact-block')
            img_container = group_member.find('div', class_='img-container')

            if contact_block:
                email = contact_block.find('li', class_='email').find('a').get_text(strip=True) if contact_block.find('li', class_='email') else 'N/A'
                website = contact_block.find('li', class_='website').find('a')['href'] if contact_block.find('li', class_='website') else 'N/A'
                location = contact_block.find('li', class_='location').get_text(strip=True).replace('Location:', '').strip() if contact_block.find('li', class_='location') else 'N/A'
                phone = contact_block.find('li', class_='phone').get_text(strip=True).replace('Phone:', '').strip() if contact_block.find('li', class_='phone') else 'N/A'

                # Extract image URL
                img_url = img_container.find('img')['src'] if img_container and img_container.find('img') else 'N/A'
                # Construct full image URL if it's a relative path
                img_url = urllib.parse.urljoin(url, img_url)

                professors.append((name, job_title, email, website, location, phone, img_url))

        return professors
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Main page URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'

# Fetch professor details from the main page
professor_data = fetch_professor_details(main_url)

# Print the number of fetched professors and a sample
print(f"Fetched data for {len(professor_data)} professors.")
for prof in professor_data[:20]:  # Print first 5 records
    print(prof)


Fetched data for 20 professors.
('Jinsong Ouyang', 'Department Chair', 'ouyangj@ecs.csus.edu', 'https://www.csus.edu/faculty/o/jouyang/', 'RVR 3018G', '(916) 278-7628', 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/_internal/photos/ouyang-sm.jpg')
('Anna Baynes', 'Associate Chair', 'abaynes@ecs.csus.edu', 'N/A', 'RVR 3018H', '(916) 278-7947', 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/_internal/photos/baynes-sm.jpg')
('Haiquan (Victor) Chen', 'Graduate Coordinator', 'chenh@ecs.csus.edu', 'https://athena.ecs.csus.edu/~chenh/index.html', 'RVR 5018 / 3018I', '(916) 278-6087 / 278-5769', 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/_internal/photos/chen-sm.jpg')
('Behnam Arad', 'Professor', 'arad@csus.edu', 'https://www.csus.edu/faculty/a/arad/', 'RVR 5044', '(916) 278-7160', 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/_internal/photo

In [43]:
import sqlite3

def update_database_schema():
    conn = sqlite3.connect('professors.db')
    cursor = conn.cursor()

    # Drop the table if it exists
    cursor.execute('DROP TABLE IF EXISTS professors')

    # Create the table with the correct schema
    cursor.execute('''
        CREATE TABLE professors (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            job_title TEXT,
            email TEXT,
            website TEXT,
            location TEXT,
            phone TEXT,
            img_url TEXT
        )
    ''')

    conn.commit()
    conn.close()

# Update the schema
update_database_schema()


In [44]:
import sqlite3

def save_to_database(data):
    conn = sqlite3.connect('professors.db')
    cursor = conn.cursor()

    # Insert data into the table
    cursor.executemany('''
        INSERT INTO professors (name, job_title, email, website, location, phone, img_url)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', data)

    conn.commit()
    conn.close()

# Save data to database
save_to_database(professor_data)


In [46]:
import requests
from bs4 import BeautifulSoup
import sqlite3

def fetch_faculty_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract data for full-time faculty
        full_time_faculty = []
        for faculty_div in soup.find_all('div', class_='group-member faculty-full-time'):
            name = faculty_div.find('h3').text.strip() if faculty_div.find('h3') else 'N/A'
            job_title = faculty_div.find('p', class_='job-title').text.strip() if faculty_div.find('p', class_='job-title') else 'N/A'
            img_url = faculty_div.find('div', class_='img-container').find('img')['src'] if faculty_div.find('div', class_='img-container') else 'N/A'
            location = faculty_div.find('li', class_='location').text.split(':')[1].strip() if faculty_div.find('li', class_='location') else 'N/A'
            phone = faculty_div.find('li', class_='phone').text.split(':')[1].strip() if faculty_div.find('li', class_='phone') else 'N/A'
            email = faculty_div.find('li', class_='email').find('a')['href'].replace('mailto:', '') if faculty_div.find('li', class_='email') else 'N/A'
            website = faculty_div.find('li', class_='website').find('a')['href'] if faculty_div.find('li', class_='website') else 'N/A'

            full_time_faculty.append({
                'name': name,
                'job_title': job_title,
                'email': email,
                'website': website,
                'location': location,
                'phone': phone,
                'img_url': img_url
            })

        # Extract data for part-time faculty
        part_time_faculty = []
        for faculty_div in soup.find_all('div', class_='group-member faculty-part-time'):
            name = faculty_div.find('h3').text.strip() if faculty_div.find('h3') else 'N/A'
            job_title = faculty_div.find('p', class_='job-title').text.strip() if faculty_div.find('p', class_='job-title') else 'N/A'
            img_url = faculty_div.find('div', class_='img-container').find('img')['src'] if faculty_div.find('div', class_='img-container') else 'N/A'
            location = faculty_div.find('li', class_='location').text.split(':')[1].strip() if faculty_div.find('li', class_='location') else 'N/A'
            email = faculty_div.find('li', class_='email').find('a')['href'].replace('mailto:', '') if faculty_div.find('li', class_='email') else 'N/A'
            specialty = faculty_div.find('ul', class_='sec-info-block').find('li').text.split(':')[1].strip() if faculty_div.find('ul', class_='sec-info-block') else 'N/A'

            part_time_faculty.append({
                'name': name,
                'job_title': job_title,
                'email': email,
                'website': 'N/A',
                'location': location,
                'phone': 'N/A',
                'img_url': img_url,
                'specialty': specialty
            })

        return full_time_faculty, part_time_faculty
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return [], []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return [], []

def save_to_database(full_time_faculty, part_time_faculty):
    # Connect to SQLite database (or create it)
    conn = sqlite3.connect('faculty_data.db')
    cursor = conn.cursor()

    # Create table if not exists
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS faculty (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            job_title TEXT,
            email TEXT,
            website TEXT,
            location TEXT,
            phone TEXT,
            img_url TEXT,
            specialty TEXT
        )
    ''')

    # Insert data into the table
    for faculty in full_time_faculty + part_time_faculty:
        cursor.execute('''
            INSERT INTO faculty (name, job_title, email, website, location, phone, img_url, specialty)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            faculty['name'],
            faculty['job_title'],
            faculty['email'],
            faculty.get('website', 'N/A'),
            faculty['location'],
            faculty.get('phone', 'N/A'),
            faculty['img_url'],
            faculty.get('specialty', 'N/A')
        ))

    # Commit and close the connection
    conn.commit()
    conn.close()

# Main URL
main_url = 'https://www.csus.edu/college/engineering-computer-science/computer-science/meet-us/'

# Fetch faculty data and save to database
full_time_faculty, part_time_faculty = fetch_faculty_data(main_url)
save_to_database(full_time_faculty, part_time_faculty)
print(f"Fetched and saved data for {len(full_time_faculty) + len(part_time_faculty)} faculty members.")


Fetched and saved data for 42 faculty members.


In [47]:
import sqlite3

def display_table():
    # Connect to SQLite database
    conn = sqlite3.connect('faculty_data.db')
    cursor = conn.cursor()

    # Fetch all rows from the table
    cursor.execute('SELECT * FROM faculty')
    rows = cursor.fetchall()

    # Print table contents
    print(f"{'ID':<5} {'Name':<30} {'Job Title':<20} {'Email':<30} {'Website':<30} {'Location':<20} {'Phone':<15} {'Image URL':<50} {'Specialty':<30}")
    print("-" * 150)
    for row in rows:
        print(f"{row[0]:<5} {row[1]:<30} {row[2]:<20} {row[3]:<30} {row[4]:<30} {row[5]:<20} {row[6]:<15} {row[7]:<50} {row[8]:<30}")

    # Close the connection
    conn.close()

# Display the table
display_table()


ID    Name                           Job Title            Email                          Website                        Location             Phone           Image URL                                          Specialty                     
------------------------------------------------------------------------------------------------------------------------------------------------------
1     Jinsong Ouyang                 Department Chair     ouyangj@ecs.csus.edu           https://www.csus.edu/faculty/o/jouyang/ RVR 3018G            (916) 278-7628  _internal/photos/ouyang-sm.jpg                     N/A                           
2     Anna Baynes                    Associate Chair      abaynes@ecs.csus.edu           N/A                            RVR 3018H            (916) 278-7947  _internal/photos/baynes-sm.jpg                     N/A                           
3     Haiquan (Victor) Chen          Graduate Coordinator chenh@ecs.csus.edu             https://athena.ecs.csus.edu/~chenh

In [48]:
!pip install fpdf2 Pillow requests




In [51]:
import sqlite3
import requests
from fpdf import FPDF
from PIL import Image
from io import BytesIO

# Define the PDF generation function
def save_table_to_pdf(pdf_path):
    # Connect to SQLite database
    conn = sqlite3.connect('faculty_data.db')
    cursor = conn.cursor()

    # Fetch all rows from the table
    cursor.execute('SELECT * FROM faculty')
    rows = cursor.fetchall()

    # Create a PDF document
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)  # Use Helvetica as Arial is deprecated

    # Define column widths
    col_widths = [10, 40, 30, 40, 40, 30, 20, 50, 30]
    headers = ["ID", "Name", "Job Title", "Email", "Website", "Location", "Phone", "Image URL", "Specialty"]

    # Add header row
    for i, header in enumerate(headers):
        pdf.cell(col_widths[i], 10, header, border=1, align='C')
    pdf.ln()

    # Add data rows
    for row in rows:
        for i, cell in enumerate(row):
            if i == 7:  # Image URL column
                # Download the image
                try:
                    response = requests.get(cell)
                    img = Image.open(BytesIO(response.content))
                    img_path = 'temp_image.jpg'
                    img.save(img_path)
                    pdf.image(img_path, x=pdf.get_x(), y=pdf.get_y(), w=30)  # Adjust image width
                    pdf.cell(col_widths[i], 30, "", border=1, align='C')  # Adjust cell height
                except Exception as e:
                    pdf.cell(col_widths[i], 30, "Image Error", border=1, align='C')
            else:
                pdf.cell(col_widths[i], 10, str(cell), border=1, align='C')
        pdf.ln()

    # Save the PDF file
    pdf.output(pdf_path)

    # Close the database connection
    conn.close()

# Specify the path to save the PDF
pdf_output_path = "/content/faculty_data.pdf"
save_table_to_pdf(pdf_output_path)
print(f"PDF saved to {pdf_output_path}")


PDF saved to /content/faculty_data.pdf


**Web Scraping Data from** - https://www.csus.edu/information-resources-technology/my-sac-state-portal/

In [57]:
!pip uninstall -y fpdf
!pip install fpdf2


Found existing installation: fpdf 1.7.2
Uninstalling fpdf-1.7.2:
  Successfully uninstalled fpdf-1.7.2


In [58]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Function to fetch hyperlinks from a URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract hyperlinks using BeautifulSoup
        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href and not href.startswith('#'):
                full_url = urllib.parse.urljoin(url, href)
                if full_url.startswith(('http://', 'https://')):
                    links.add(full_url)

        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Main page URL
main_url = 'https://www.csus.edu/information-resources-technology/my-sac-state-portal/'

# Fetch hyperlinks from the main page
hyperlinks = fetch_hyperlinks(main_url)
num_hyperlinks = len(hyperlinks)
print(f"Total unique hyperlinks found on the main page: {num_hyperlinks}")

# Save hyperlinks to a text file
with open("my_sac_state_hyperlinks.txt", "w") as file:
    for hyperlink in hyperlinks:
        file.write(hyperlink + "\n")

print("All hyperlinks saved to my_sac_state_hyperlinks.txt")


Total unique hyperlinks found on the main page: 73
All hyperlinks saved to my_sac_state_hyperlinks.txt


In [63]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
from google.colab import files

# Function to fetch hyperlinks from a URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href and not href.startswith('#'):
                full_url = urllib.parse.urljoin(url, href)
                if full_url.startswith(('http://', 'https://')):
                    links.add(full_url)
        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Function to scrape data from a URL
def scrape_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        data = {
            'url': url,
            'title': soup.title.string if soup.title else 'No title',
            'content': soup.get_text()
        }
        return data
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return None

# Function to save data to a text file
def save_to_txt(data, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for item in data:
            file.write(f"URL: {item['url']}\n")
            file.write(f"Title: {item['title']}\n")
            file.write(f"Content:\n{item['content']}\n")
            file.write("-" * 80 + "\n")
    print(f"Data saved as {filename}")

# Main URL
main_url = 'https://www.csus.edu/information-resources-technology/my-sac-state-portal/'

# Fetch all hyperlinks from the main URL
all_hyperlinks = fetch_hyperlinks(main_url)
all_hyperlinks.append(main_url)
print(f"Fetched {len(all_hyperlinks)} URLs")

# Scrape data from each URL
all_data = []
for url in all_hyperlinks:
    data = scrape_data(url)
    if data:
        all_data.append(data)
    time.sleep(1)

# Save data to a text file
txt_filename = "/content/MySacPortal_data.txt"
save_to_txt(all_data, txt_filename)

# Download the text file
files.download(txt_filename)


Fetched 74 URLs
Error fetching URL https://hornetsports.com/landing/index: 404 Client Error: Not Found for url: https://hornetsports.com/landing/index
Error fetching URL https://www.hornetsports.com: 404 Client Error: Not Found for url: https://hornetsports.com/
Error fetching URL https://www.trumba.com/calendars/sacramento-state-events?trumbaEmbed=filterview%3DFeaturedEvents: HTTPSConnectionPool(host='events.csus.edu', port=443): Max retries exceeded with url: /?trumbaEmbed=filterview%3DFeaturedEvents (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
Error fetching URL https://www.enterprises.csus.edu/: HTTPSConnectionPool(host='www.enterprises.csus.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
Data save

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [64]:
import sqlite3
from bs4 import BeautifulSoup
import requests
import urllib.parse
import time

# Connect to SQLite database
conn = sqlite3.connect('sac_state_data.db')
cursor = conn.cursor()

# Create tables
cursor.execute('''
CREATE TABLE IF NOT EXISTS departments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    location TEXT,
    phone TEXT,
    email TEXT,
    website TEXT
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS sac_state_sites (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT,
    title TEXT
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS holiday_list (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT,
    holiday_name TEXT
)
''')

# Function to fetch hyperlinks
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href and not href.startswith('#'):
                full_url = urllib.parse.urljoin(url, href)
                if full_url.startswith(('http://', 'https://')):
                    links.add(full_url)
        return list(links)
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

# Function to scrape data from a URL
def scrape_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        data = {
            'url': url,
            'title': soup.title.string if soup.title else 'No title',
            'content': soup.get_text()
        }
        return data
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return None

# Function to save departments info
def save_departments(departments):
    cursor.executemany('''
    INSERT INTO departments (name, location, phone, email, website)
    VALUES (?, ?, ?, ?, ?)
    ''', departments)
    conn.commit()

# Function to save Sacramento State sites
def save_sac_state_sites(sites):
    cursor.executemany('''
    INSERT INTO sac_state_sites (url, title)
    VALUES (?, ?)
    ''', sites)
    conn.commit()

# Function to save holiday list
def save_holiday_list(holidays):
    cursor.executemany('''
    INSERT INTO holiday_list (date, holiday_name)
    VALUES (?, ?)
    ''', holidays)
    conn.commit()

# Main URL
main_url = 'https://www.csus.edu/information-resources-technology/my-sac-state-portal/'

# Fetch all hyperlinks from the main URL
all_hyperlinks = fetch_hyperlinks(main_url)
all_hyperlinks.append(main_url)
print(f"Fetched {len(all_hyperlinks)} URLs")

# Scrape data from each URL
departments = []
sac_state_sites = []
holidays = []

for url in all_hyperlinks:
    data = scrape_data(url)
    if data:
        # Example logic to extract specific information
        # This part will vary depending on the structure of the website's HTML
        if 'department' in data['title'].lower():
            # Extract department information (for demonstration)
            departments.append(('Department Name', 'Location', 'Phone', 'Email', 'Website'))
        elif 'holiday' in data['title'].lower():
            # Extract holiday list (for demonstration)
            holidays.append(('2024-12-25', 'Christmas'))
        else:
            sac_state_sites.append((url, data['title']))

    time.sleep(1)

# Save data to the database
save_departments(departments)
save_sac_state_sites(sac_state_sites)
save_holiday_list(holidays)

# Close database connection
conn.close()


Fetched 74 URLs
Error fetching URL https://hornetsports.com/landing/index: 404 Client Error: Not Found for url: https://hornetsports.com/landing/index
Error fetching URL https://www.hornetsports.com: 404 Client Error: Not Found for url: https://hornetsports.com/
Error fetching URL https://www.trumba.com/calendars/sacramento-state-events?trumbaEmbed=filterview%3DFeaturedEvents: HTTPSConnectionPool(host='events.csus.edu', port=443): Max retries exceeded with url: /?trumbaEmbed=filterview%3DFeaturedEvents (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
Error fetching URL https://www.enterprises.csus.edu/: HTTPSConnectionPool(host='www.enterprises.csus.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


In [65]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('sac_state_data.db')
cursor = conn.cursor()

# Function to fetch and display data from a table
def display_table(table_name):
    print(f"\nContents of table: {table_name}")
    cursor.execute(f"SELECT * FROM {table_name}")
    rows = cursor.fetchall()

    # Get column names
    columns = [description[0] for description in cursor.description]
    print(" | ".join(columns))

    # Print rows
    for row in rows:
        print(" | ".join(str(cell) for cell in row))

# Display contents of each table
display_table('departments')
display_table('sac_state_sites')
display_table('holiday_list')

# Close the database connection
conn.close()



Contents of table: departments
id | name | location | phone | email | website

Contents of table: sac_state_sites
id | url | title
1 | https://www.tiktok.com/@sacstate | TikTok - Make Your Day
2 | https://www.csus.edu/information-resources-technology/my-sac-state-portal/index.html | My Sac State Portal | Sacramento State
3 | https://www.csus.edu/give/ | Sacramento State Giving | Sacramento State
4 | https://scholars.csus.edu | Research Portal
5 | https://www.csus.edu/information-resources-technology/ati/accessibility-statement.html | Accessibility Statement | Sacramento State
6 | https://www.csus.edu/student-life/student-organizations/sports-recreation/index.html | Sports & Recreation | Sacramento State
7 | https://www.csus.edu/compliance/index.html | Compliance | Sacramento State
8 | https://www.csus.edu/student-affairs/index.html | Student Affairs | Sacramento State
9 | https://www.csus.edu/information-resources-technology/accounts-access/index.html | Accounts & Access | Sacramento 

Web Scraping from -- https://catalog.csus.edu/academic-calendar/#spring2024text

In [66]:
import requests
from bs4 import BeautifulSoup

# Function to fetch and parse the academic calendar page
def fetch_academic_calendar(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract holidays data
    holidays = []
    calendar_section = soup.find('section', {'id': 'academic-calendar'})

    if calendar_section:
        for item in calendar_section.find_all('li'):
            date = item.find('strong').text.strip()
            description = item.get_text(strip=True).split('\n', 1)[-1].strip()
            holidays.append((date, description))

    return holidays

# URL for academic calendar
calendar_url = 'https://catalog.csus.edu/academic-calendar/#spring2024text'
holidays = fetch_academic_calendar(calendar_url)
print("Holidays:", holidays)


Holidays: []


In [67]:
def fetch_links_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('http'):
            links.append(href)

    return links

# Fetch links from the academic calendar page
links = fetch_links_from_page(calendar_url)
print("Links:", links)


Links: ['http://www.csus.edu/', 'https://sacramentostate.policystat.com/', 'http://www.csus.edu/commencement/', 'http://www.csus.edu/commencement/', 'http://www.csus.edu/commencement/', 'https://twitter.com/sacstate', 'https://www.facebook.com/sacstate', 'https://www.linkedin.com/school/sacstate', 'https://www.instagram.com/sacstate/', 'https://www.csus.edu/', 'https://www2.calstate.edu/', 'https://www.csus.edu/compliance/', 'https://www.csus.edu/campus-safety/', 'https://www.csus.edu/title-ix/']


In [68]:
def fetch_course_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    courses = []
    for course_section in soup.find_all('div', class_='course-section'):
        course_name = course_section.find('h3').text.strip()
        course_description = course_section.find('p', class_='description').text.strip()
        courses.append((course_name, course_description))

    return courses

# Fetch course information from all links
all_courses = []
for link in links:
    courses = fetch_course_info(link)
    all_courses.extend(courses)
    print("Courses from link:", link, courses)


Courses from link: http://www.csus.edu/ []
Courses from link: https://sacramentostate.policystat.com/ []
Courses from link: http://www.csus.edu/commencement/ []
Courses from link: http://www.csus.edu/commencement/ []
Courses from link: http://www.csus.edu/commencement/ []
Courses from link: https://twitter.com/sacstate []
Courses from link: https://www.facebook.com/sacstate []
Courses from link: https://www.linkedin.com/school/sacstate []
Courses from link: https://www.instagram.com/sacstate/ []
Courses from link: https://www.csus.edu/ []
Courses from link: https://www2.calstate.edu/ []
Courses from link: https://www.csus.edu/compliance/ []
Courses from link: https://www.csus.edu/campus-safety/ []
Courses from link: https://www.csus.edu/title-ix/ []


In [69]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('csus_data.db')
cursor = conn.cursor()

# Create tables for holidays and courses
cursor.execute('''
    CREATE TABLE IF NOT EXISTS holidays (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        date TEXT,
        description TEXT
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS courses (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT,
        description TEXT,
        link TEXT
    )
''')

# Insert holidays into database
cursor.executemany('''
    INSERT INTO holidays (date, description)
    VALUES (?, ?)
''', holidays)

# Insert courses into database
cursor.executemany('''
    INSERT INTO courses (name, description, link)
    VALUES (?, ?, ?)
''', [(course_name, course_description, link) for course_name, course_description in all_courses])

# Commit changes and close connection
conn.commit()
conn.close()


In [72]:
import requests
from bs4 import BeautifulSoup

# Function to fetch and parse the academic calendar page
def fetch_academic_calendar(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract holidays data
    holidays = []

    # Find all rows in the table
    rows = soup.find_all('tr', {'class': 'even firstrow'})  # Adjust class as necessary
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 2:  # Ensure there are enough columns
            date = cols[0].get_text(strip=True)
            description = cols[1].get_text(strip=True)
            holidays.append({'date': date, 'description': description})

    return holidays

# Example usage
url = 'https://catalog.csus.edu/academic-calendar/#spring2024text'
holidays = fetch_academic_calendar(url)

# Display the holidays
for holiday in holidays:
    print(f"Date: {holiday['date']}, Description: {holiday['description']}")


Date: June 2023, Description: Spring 2024 Schedule Available at My Sac State
Date: March 2024, Description: Fall 2024 Schedule Available at My Sac State
Date: June 2024, Description: Spring 2025 Schedule Available at My Sac State


In [73]:
import requests
from bs4 import BeautifulSoup

# Function to fetch all hyperlinks from a given URL
def fetch_hyperlinks(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags
    links = soup.find_all('a', href=True)

    # Extract and normalize the href attributes
    hyperlinks = []
    for link in links:
        href = link['href']
        # Normalize relative URLs to absolute URLs if needed
        if not href.startswith('http'):
            href = requests.compat.urljoin(url, href)
        hyperlinks.append(href)

    return hyperlinks

# Example usage
url = 'https://www.csus.edu/'  # Replace with the actual URL containing the link
hyperlinks = fetch_hyperlinks(url)

# Display the hyperlinks
for hyperlink in hyperlinks:
    print(hyperlink)


https://www.csus.edu/#skip
https://www.csus.edu
https://www.csus.edu/apply/index.html
https://www.csus.edu/experience/index.html
https://www.csus.edu/giving
https://my.csus.edu
https://www.csus.edu/return-to-campus/
https://www.csus.edu/experience/innovation-creativity/oried/index.html
https://www.csus.edu/apply/financial-aid-scholarships/scholarships/index.html
https://www.trumba.com/calendars/sacramento-state-events?trumbaEmbed=filterview%3DFeaturedEvents
https://www.csus.edu/parking-transportation/
https://www.csus.edu/campusmap/
https://www.csus.edu/#accordion-student-life
https://www.csus.edu/student-life/academic-advising/index.html
http://www.asi.csus.edu
https://www.csus.edu/student-life/career-center/index.html
https://www.csus.edu/student-life/class-schedules/index.html
https://www.csus.edu/student-life/health-counseling/index.html
https://www.csus.edu/student-life/housing/index.html
https://www.csus.edu/student-life/records-transcripts/index.html
https://www.csus.edu/student

In [76]:
import requests
from bs4 import BeautifulSoup

# Function to fetch hyperlinks from a given URL
def fetch_hyperlinks(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags with href attributes
    links = soup.find_all('a', href=True)

    # Extract and normalize the href attributes
    hyperlinks = []
    for link in links:
        href = link['href']
        # Normalize relative URLs to absolute URLs if needed
        if not href.startswith('http'):
            href = requests.compat.urljoin(url, href)
        hyperlinks.append(href)

    return hyperlinks

# Function to fetch text content from a URL
def fetch_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()  # Extract text content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to save data to a .txt file
def save_data_to_txt(filename, data):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)

# Function to fetch and save text data from all hyperlinks on a page
def fetch_and_save_text_from_hyperlinks(main_url, txt_filename):
    # Fetch and parse the main page
    hyperlinks = fetch_hyperlinks(main_url)

    all_text_data = []

    for hyperlink in hyperlinks:
        print(f"Fetching text from: {hyperlink}")
        text_data = fetch_text_from_url(hyperlink)
        if text_data:
            all_text_data.append(f"URL: {hyperlink}\n{text_data}\n{'='*80}\n")

    # Save all fetched text data to a .txt file
    save_data_to_txt(txt_filename, '\n'.join(all_text_data))
    print(f"Data saved to {txt_filename}")

# Example usage
main_url = 'https://www.csus.edu/'  # Replace with the actual URL
txt_filename = '/content/fetched_data.txt'
fetch_and_save_text_from_hyperlinks(main_url, txt_filename)


Fetching text from: https://www.csus.edu/#skip
Fetching text from: https://www.csus.edu
Fetching text from: https://www.csus.edu/apply/index.html
Fetching text from: https://www.csus.edu/experience/index.html
Fetching text from: https://www.csus.edu/giving
Fetching text from: https://my.csus.edu
Fetching text from: https://www.csus.edu/return-to-campus/
Fetching text from: https://www.csus.edu/experience/innovation-creativity/oried/index.html
Fetching text from: https://www.csus.edu/apply/financial-aid-scholarships/scholarships/index.html
Fetching text from: https://www.trumba.com/calendars/sacramento-state-events?trumbaEmbed=filterview%3DFeaturedEvents
Error fetching https://www.trumba.com/calendars/sacramento-state-events?trumbaEmbed=filterview%3DFeaturedEvents: HTTPSConnectionPool(host='events.csus.edu', port=443): Max retries exceeded with url: /?trumbaEmbed=filterview%3DFeaturedEvents (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate ver

In [79]:
import sqlite3

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('academic_calendar.db')
cursor = conn.cursor()

# Create a table for storing calendar events
cursor.execute('''
CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT,
    description TEXT
)
''')
conn.commit()

# Sample data fetched
data = [
    {"date": "June 2023", "description": "Spring 2024 Schedule Available at My Sac State"},
    {"date": "March 2024", "description": "Fall 2024 Schedule Available at My Sac State"},
    {"date": "June 2024", "description": "Spring 2025 Schedule Available at My Sac State"}
]

def insert_data(data):
    conn = sqlite3.connect('academic_calendar.db')
    cursor = conn.cursor()

    for entry in data:
        cursor.execute('''
        INSERT INTO events (date, description)
        VALUES (?, ?)
        ''', (entry['date'], entry['description']))

    conn.commit()
    conn.close()

# Insert the data
insert_data(data)


In [80]:
import sqlite3

def display_table():
    # Connect to the SQLite database
    conn = sqlite3.connect('academic_calendar.db')
    cursor = conn.cursor()

    # Execute a query to fetch all data from the events table
    cursor.execute('SELECT * FROM events')

    # Fetch all rows from the executed query
    rows = cursor.fetchall()

    # Display the data
    print("ID | Date       | Description")
    print("-- | ---------- | -----------------------------------------------")
    for row in rows:
        print(f"{row[0]:<2} | {row[1]:<10} | {row[2]}")

    # Close the database connection
    conn.close()

# Call the function to display the table
display_table()


ID | Date       | Description
-- | ---------- | -----------------------------------------------
1  | June 2023  | Spring 2024 Schedule Available at My Sac State
2  | March 2024 | Fall 2024 Schedule Available at My Sac State
3  | June 2024  | Spring 2025 Schedule Available at My Sac State


**Web Scraping from** -- https://catalog.csus.edu/courses-a-z/

In [81]:
import requests
from bs4 import BeautifulSoup

# Function to fetch hyperlinks from a given URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all anchor tags with href attributes
        links = soup.find_all('a', href=True)

        # Extract and normalize the href attributes
        hyperlinks = []
        for link in links:
            href = link['href']
            # Normalize relative URLs to absolute URLs if needed
            if not href.startswith('http'):
                href = requests.compat.urljoin(url, href)
            hyperlinks.append(href)

        return hyperlinks
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Function to save hyperlinks to a .txt file
def save_hyperlinks_to_txt(filename, hyperlinks):
    with open(filename, 'w', encoding='utf-8') as file:
        for link in hyperlinks:
            file.write(link + '\n')

# URL to fetch hyperlinks from
url = 'https://catalog.csus.edu/courses-a-z/'

# Fetch hyperlinks
hyperlinks = fetch_hyperlinks(url)

# Save hyperlinks to a .txt file
txt_filename = '/content/Courseslinks.txt'  # Change the path if needed
save_hyperlinks_to_txt(txt_filename, hyperlinks)

print(f"Hyperlinks saved to {txt_filename}")


Hyperlinks saved to /content/Courseslinks.txt


In [82]:
import requests
from bs4 import BeautifulSoup

# Function to fetch hyperlinks from a given URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all anchor tags with href attributes
        links = soup.find_all('a', href=True)

        # Extract and normalize the href attributes
        hyperlinks = []
        for link in links:
            href = link['href']
            # Normalize relative URLs to absolute URLs if needed
            if not href.startswith('http'):
                href = requests.compat.urljoin(url, href)
            hyperlinks.append(href)

        return hyperlinks
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Function to fetch text content from a URL
def fetch_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()  # Extract text content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to save data to a .txt file
def save_data_to_txt(filename, data):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)

# Function to fetch and save text data from all hyperlinks on a page
def fetch_and_save_text_from_hyperlinks(main_url, txt_filename):
    # Fetch and parse the main page
    hyperlinks = fetch_hyperlinks(main_url)

    all_text_data = []

    for hyperlink in hyperlinks:
        print(f"Fetching text from: {hyperlink}")
        text_data = fetch_text_from_url(hyperlink)
        if text_data:
            all_text_data.append(f"URL: {hyperlink}\n{text_data}\n{'='*80}\n")

    # Save all fetched text data to a .txt file
    save_data_to_txt(txt_filename, '\n'.join(all_text_data))
    print(f"Data saved to {txt_filename}")

# Example usage
main_url = 'https://catalog.csus.edu/courses-a-z/'  # Replace with the actual URL
txt_filename = '/content/fetched_data.txt'
fetch_and_save_text_from_hyperlinks(main_url, txt_filename)


Fetching text from: https://catalog.csus.edu/courses-a-z/#content
Fetching text from: https://catalog.csus.edu/azindex/
Fetching text from: https://catalog.csus.edu/
Fetching text from: https://catalog.csus.edu/
Fetching text from: http://www.csus.edu/
Fetching text from: https://catalog.csus.edu/academic-programs/
Fetching text from: https://catalog.csus.edu/courses-a-z/
Fetching text from: https://sacramentostate.policystat.com/
Fetching text from: https://catalog.csus.edu/colleges/academic-affairs/general-education/
Fetching text from: https://catalog.csus.edu/archives/
Fetching text from: https://catalog.csus.edu/azindex/
Fetching text from: https://catalog.csus.edu/
Fetching text from: https://catalog.csus.edu/presidents-welcome/
Fetching text from: https://catalog.csus.edu/california-promise/
Fetching text from: https://catalog.csus.edu/csu-system/
Fetching text from: https://catalog.csus.edu/csu-system/campuses-california-state-university/
Fetching text from: https://catalog.csu

In [2]:
import sqlite3
import requests
from bs4 import BeautifulSoup

# Function to fetch hyperlinks from a given URL
def fetch_hyperlinks(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all anchor tags with href attributes
        links = soup.find_all('a', href=True)

        # Extract and normalize the href attributes
        hyperlinks = []
        for link in links:
            href = link['href']
            # Normalize relative URLs to absolute URLs if needed
            if not href.startswith('http'):
                href = requests.compat.urljoin(url, href)
            hyperlinks.append(href)

        return hyperlinks
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Function to fetch text content from a URL
def fetch_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()  # Extract text content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to parse and extract course details from text
def parse_course_details(text):
    lines = text.splitlines()
    courses = []

    for line in lines:
        if 'Units' in line:
            parts = line.split('.')
            if len(parts) >= 3:
                course_code = parts[0].strip()
                course_title = parts[1].strip()
                # Check if there are enough elements before accessing them
                units_part = parts[2].strip().split()
                units = units_part[0] if units_part else "N/A"  # Default to "N/A" if units not found

                terms_offered = next((l.strip() for l in lines if 'Typically Offered' in l), None)

                courses.append((course_code, course_title, units, terms_offered))
            else:
                print(f"Skipping line due to unexpected format: {line}")

    return courses

# Function to store course data in SQLite database
def store_courses_in_db(courses, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create the courses table if it doesn't exist
    cursor.execute('''CREATE TABLE IF NOT EXISTS courses (
                      id INTEGER PRIMARY KEY AUTOINCREMENT,
                      course_code TEXT,
                      course_title TEXT,
                      units TEXT,
                      terms_offered TEXT)''')

    # Insert course data into the table
    cursor.executemany('''INSERT INTO courses (course_code, course_title, units, terms_offered)
                          VALUES (?, ?, ?, ?)''', courses)

    conn.commit()
    conn.close()

# Main function to fetch, parse, and store course data
def main():
    main_url = 'https://catalog.csus.edu/courses-a-z/'  # Replace with the actual URL
    db_path = 'courses.db'  # Path to the SQLite database

    hyperlinks = fetch_hyperlinks(main_url)

    all_courses = []

    for hyperlink in hyperlinks:
        print(f"Fetching text from: {hyperlink}")
        text_data = fetch_text_from_url(hyperlink)
        if text_data:
            courses = parse_course_details(text_data)
            all_courses.extend(courses)

    store_courses_in_db(all_courses, db_path)
    print(f"Data stored in {db_path}")

# Run the main function
main()


Fetching text from: https://catalog.csus.edu/courses-a-z/#content
Fetching text from: https://catalog.csus.edu/azindex/
Fetching text from: https://catalog.csus.edu/
Fetching text from: https://catalog.csus.edu/
Fetching text from: http://www.csus.edu/
Fetching text from: https://catalog.csus.edu/academic-programs/
Fetching text from: https://catalog.csus.edu/courses-a-z/
Fetching text from: https://sacramentostate.policystat.com/
Fetching text from: https://catalog.csus.edu/colleges/academic-affairs/general-education/
Skipping line due to unexpected format: On This PageObjectivesCourse RequirementsArea A: Basic Subjects (9 units)Area B: The Physical Universe and Its Life Forms (12 Units)Area C: The Arts and Humanities (12 Units)Area D: The Individual and Society (9 Units) Area E: Understanding Personal Development (3 Units)Area F: Ethnic Studies (3 Units)Additional Graduation Requirements Second Semester Composition Requirement Foreign Language Graduation Requirement American Institut

In [3]:
import sqlite3

def display_table(database_path, table_name):
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    # Fetch data from the table
    cursor.execute(f"SELECT * FROM {table_name}")
    rows = cursor.fetchall()

    # Display the data
    for row in rows:
        print(row)

    conn.close()

# Display the courses table
display_table('courses.db', 'courses')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(9213, 'RPTA 284', 'Hospitality Administration', '3', 'Term Typically Offered: Spring, Summer')
(9214, 'RPTA 295', 'Practicum', '3', 'Term Typically Offered: Spring, Summer')
(9215, 'RPTA 299', 'Individual Study', '1', 'Term Typically Offered: Spring, Summer')
(9216, 'RPTA 500A', 'Culminating Experience: Thesis', '1', 'Term Typically Offered: Spring, Summer')
(9217, 'RPTA 500B', 'Culminating Experience: Project', '1', 'Term Typically Offered: Spring, Summer')
(9218, 'RPTA 500C', 'Culminating Experience: Comprehensive Exam', '1', 'Term Typically Offered: Spring, Summer')
(9219, 'AERO 99', 'Special Problems', '1', 'Term Typically Offered: Fall, Spring')
(9220, 'AERO 135A', 'Leading People and Effective Communication I', '3', 'Term Typically Offered: Fall, Spring')
(9221, 'AERO 135B', 'Leading People and Effective Communication II', '3', 'Term Typically Offered: Fall, Spring')
(9222, 'AERO 145A', 'National Security and Prepa