In [2]:
# !pip install selenium pandas webdriver-manager requests beautifulsoup4

### Data Scraping

In [4]:
import requests
import pandas as pd
import logging
from bs4 import BeautifulSoup
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

# Emit INFO-and-above records as "HH:MM:SS - message".
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%H:%M:%S', level=logging.INFO)

# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

# Maps the per-day flag keys looked up on a "meetingTime" object to the day
# names written to the output. Insertion order (Monday -> Sunday) is the order
# in which days are listed.
# Weekend keys are included so Saturday/Sunday meetings are not silently
# dropped. NOTE(review): assumes the weekend flags are named 'saturday' and
# 'sunday' like the weekday ones - confirm against a live response. Lookups
# use .get(), so a missing key is harmless.
DAY_MAPPING = {
    'monday': 'Monday',
    'tuesday': 'Tuesday',
    'wednesday': 'Wednesday',
    'thursday': 'Thursday',
    'friday': 'Friday',
    'saturday': 'Saturday',
    'sunday': 'Sunday',
}

def get_cookies(base_url, timeout=30):
    """Fetch session cookies by POSTing the term code to ``/term/search``.

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict of cookie name -> value taken from the response, or ``None``
        if the request could not be completed or returned a non-OK status.
    """
    print("\n=== Step 1: Getting Session Cookies ===")
    url = f"{base_url}/term/search"
    headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}
    body = {"term": "202530"}

    logger.info("Attempting to get cookies...")
    try:
        # Without a timeout a stalled connection would hang the script forever.
        response = requests.post(url, headers=headers, data=body, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failures are reported the same way as a bad status so
        # the caller's existing "no cookies" branch handles both.
        logger.error(f"Failed to get cookies: {e}")
        return None

    if not response.ok:
        logger.error(f"Failed to get cookies. Status code: {response.status_code}")
        return None

    logger.info("Successfully got cookies")
    return response.cookies.get_dict()

def get_faculty_info(base_url, cookies, crn, timeout=30):
    """Get instructor and meeting-time information for one course section.

    Calls ``/searchResults/getFacultyMeetingTimes`` and summarises the FIRST
    entry of the response's ``fmt`` list; any further entries are ignored.

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        cookies: Dict of session cookies sent in the ``Cookie`` header.
        crn: Course reference number to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with keys ``faculty_name``, ``begin_time``, ``end_time`` and
        ``days`` (comma-separated day names). Every failure path (network
        error, non-OK status, no meeting entries) returns the same keys with
        empty-string values, so callers can always index the result.
    """
    empty_info = {"faculty_name": "", "begin_time": "", "end_time": "", "days": ""}

    logger.info(f"Getting faculty info for course {crn}")
    url = f"{base_url}/searchResults/getFacultyMeetingTimes"
    headers = {"Cookie": "; ".join([f"{key}={value}" for key, value in cookies.items()])}
    params = {"term": "202530", "courseReferenceNumber": crn}

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if not response.ok:
            # Previously this path fell through and returned None, which made
            # the caller fail when indexing the result.
            logger.error(f"Failed to get faculty info for {crn}. Status code: {response.status_code}")
            return empty_info

        # "fmt" may be missing, null or an empty list; none of those is an error.
        entries = response.json().get("fmt") or []
        if not entries:
            return empty_info

        data = entries[0]
        faculty = data.get("faculty") or []
        meeting_time = data.get("meetingTime") or {}

        # Collect display names for every day whose flag is truthy.
        days = [
            full_name
            for day, full_name in DAY_MAPPING.items()
            if meeting_time.get(day)
        ]

        return {
            "faculty_name": faculty[0].get("displayName", "") if faculty else "",
            "begin_time": meeting_time.get("beginTime", ""),
            "end_time": meeting_time.get("endTime", ""),
            "days": ", ".join(days)  # Join with commas for readability
        }
    except Exception as e:
        # Best-effort scrape: one bad course must not abort the whole run.
        logger.error(f"Error getting faculty info for {crn}: {str(e)}")
        return empty_info

def process_course_batch(base_url, cookies, courses):
    """Process one batch of courses, skipping any course that fails.

    The courses inside a batch are handled one after another; this function
    does not start any threads itself.

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        cookies: Dict of session cookies forwarded to ``process_course``.
        courses: Iterable of course dicts from the course search.

    Returns:
        A list with one record (as returned by ``process_course``) per course
        that was processed without raising.
    """
    results = []
    for course in courses:
        try:
            results.append(process_course(base_url, cookies, course))
        except Exception as e:
            # Name the failing course so it can be retried or investigated.
            crn = course.get('courseReferenceNumber') if isinstance(course, dict) else None
            logger.error(f"Error processing course {crn}: {str(e)}")
    return results

def split_into_batches(data, batch_size=5):
    """Lazily yield consecutive slices of *data*, each at most *batch_size* long.

    The final slice is shorter when ``len(data)`` is not a multiple of
    *batch_size*. *data* must support ``len()`` and slicing.
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in offsets)

def get_course_description(base_url, cookies, crn, timeout=30):
    """Get the description text for one course section.

    Calls ``/searchResults/getCourseDescription`` and extracts the text of the
    ``<section aria-labelledby="courseDescription">`` element.

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        cookies: Dict of session cookies sent in the ``Cookie`` header.
        crn: Course reference number to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The description with all runs of whitespace collapsed to single
        spaces, or ``""`` if the request fails or the section is not present.
    """
    logger.info(f"Getting description for course {crn}")
    url = f"{base_url}/searchResults/getCourseDescription"
    headers = {"Cookie": "; ".join([f"{key}={value}" for key, value in cookies.items()])}
    params = {"term": "202530", "courseReferenceNumber": crn}

    try:
        # A timeout keeps one stalled request from blocking its worker forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if not response.ok:
            logger.error(f"Failed to get description for {crn}. Status code: {response.status_code}")
            return ""

        soup = BeautifulSoup(response.text, "html.parser")
        description = soup.find("section", {"aria-labelledby": "courseDescription"})
        if description is not None:
            # split()/join normalises newlines and indentation from the HTML.
            return " ".join(description.get_text().split())
    except Exception as e:
        # Best-effort scrape: a failed description must not abort the course.
        logger.error(f"Error getting description for {crn}: {str(e)}")
    return ""

def get_prerequisites(base_url, cookies, crn, timeout=30):
    """Get the prerequisites for one course section.

    Calls ``/searchResults/getSectionPrerequisites`` and reads the body rows of
    the ``<table class="basePreqTable">`` element. From each row it takes cell
    0 (and/or connector), cell 4 (subject) and cell 5 (course number).

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        cookies: Dict of session cookies sent in the ``Cookie`` header.
        crn: Course reference number to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``str()`` of a list of dicts with keys ``and_or``, ``subject`` and
        ``course_number``; ``"[]"`` when there are none or the request fails.
    """
    logger.info(f"Getting prerequisites for course {crn}")
    url = f"{base_url}/searchResults/getSectionPrerequisites"
    headers = {"Cookie": "; ".join([f"{key}={value}" for key, value in cookies.items()])}
    params = {"term": "202530", "courseReferenceNumber": crn}

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if not response.ok:
            logger.error(f"Failed to get prerequisites for {crn}. Status code: {response.status_code}")
            return "[]"

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", class_="basePreqTable")
        tbody = table.find("tbody") if table is not None else None
        rows = tbody.find_all("tr") if tbody is not None else []

        prereqs = []
        for row in rows:
            cells = row.find_all("td")
            if len(cells) < 6:
                # A short row used to raise IndexError and discard every
                # prerequisite already collected for this course.
                continue
            prereqs.append({
                "and_or": cells[0].text.strip(),
                "subject": cells[4].text.strip(),
                "course_number": cells[5].text.strip()
            })
        return str(prereqs)
    except Exception as e:
        # Best-effort scrape: a failed lookup must not abort the course.
        logger.error(f"Error getting prerequisites for {crn}: {str(e)}")
    return "[]"

def process_course(base_url, cookies, course):
    """Build the flat output record for one course.

    Combines fields already present in the search result *course* with the
    faculty/meeting, description and prerequisite lookups for its CRN.
    The returned dict's key order is the column order of the output table.
    """
    crn = course.get('courseReferenceNumber')
    logger.info(f"\nProcessing course: {crn}")

    # Three detail lookups per course, issued one after another.
    meeting = get_faculty_info(base_url, cookies, crn)
    summary = get_course_description(base_url, cookies, crn)
    prereq_text = get_prerequisites(base_url, cookies, crn)

    # Assemble the record field by field, in output-column order.
    record = {'CRN': crn}
    record['Campus Description'] = course.get('campusDescription')
    record['Course Title'] = course.get('courseTitle')
    record['Subject Course'] = course.get('subjectCourse')
    record['Faculty Name'] = meeting['faculty_name']
    record['Course Description'] = summary
    record['Term'] = 'Spring 2025'
    record['Begin Time'] = meeting['begin_time']
    record['End Time'] = meeting['end_time']
    record['Days'] = meeting['days']
    record['Prerequisites'] = prereq_text

    logger.info(f"Completed processing course: {crn}")
    return record

def get_courses(base_url, cookies, timeout=60):
    """Fetch the CS course list and enrich every course using a thread pool.

    The list comes from ``/searchResults/searchResults``. Courses are split
    into batches of 5 and up to 5 batches are processed concurrently.

    Args:
        base_url: Root URL of the registration service (no trailing slash).
        cookies: Dict of session cookies sent in the ``Cookie`` header.
        timeout: Seconds to wait for the course-list request.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed course,
        in the order the server listed them. It is empty if the list could not
        be fetched or contained no courses.
    """
    print("\n=== Step 2: Getting Course List ===")
    url = f"{base_url}/searchResults/searchResults"
    headers = {"Cookie": "; ".join([f"{key}={value}" for key, value in cookies.items()])}
    params = {
        "txt_subject": "CS",
        "txt_term": "202530",
        "pageOffset": 0,
        "pageMaxSize": 100000000
    }

    logger.info("Fetching course list...")
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Failed to fetch course list: {e}")
        return pd.DataFrame()

    if not response.ok:
        logger.error("Failed to fetch course list")
        return pd.DataFrame()

    try:
        payload = response.json()
    except ValueError as e:
        logger.error(f"Failed to fetch course list: response was not JSON ({e})")
        return pd.DataFrame()

    # "data" may be present but null; treat that the same as no courses.
    courses = payload.get('data') or []
    total_courses = len(courses)
    logger.info(f"Found {total_courses} courses to process")

    # NOTE(review): assumes the payload carries a 'totalCount' of all matching
    # rows. If the server caps the page size, this makes the truncation
    # visible; if the key is absent the check is a no-op.
    reported_total = payload.get('totalCount')
    if isinstance(reported_total, int) and reported_total > total_courses:
        logger.warning(
            f"Server reports {reported_total} courses but returned only "
            f"{total_courses}; results may be truncated"
        )

    print("\n=== Step 3: Processing Courses in Parallel ===")
    processed_count = 0

    # Process courses in parallel batches
    with ThreadPoolExecutor(max_workers=5) as executor:
        batch_processor = partial(process_course_batch, base_url, cookies)
        futures = [
            executor.submit(batch_processor, batch)
            for batch in split_into_batches(courses, batch_size=5)
        ]

        # Report progress as batches finish, in whatever order that happens.
        for future in as_completed(futures):
            try:
                processed_count += len(future.result())
                logger.info(f"Progress: {processed_count}/{total_courses} courses completed ({(processed_count/total_courses)*100:.1f}%)")
            except Exception as e:
                logger.error(f"Error processing batch: {str(e)}")

    # Gather rows in submission order so the output is reproducible run to run.
    all_course_data = []
    for future in futures:
        if future.exception() is None:
            all_course_data.extend(future.result())

    return pd.DataFrame(all_course_data)

def main():
    """Run the full scrape: get cookies, collect courses, save them as CSV.

    Writes ``./Data/courses.csv`` (creating the ``Data`` directory if needed)
    and prints a short sample of the collected rows. Returns ``None``; exits
    early if no session cookies could be obtained.
    """
    import os

    print("\n=== Starting Course Data Collection ===")
    base_url = "https://nubanner.neu.edu/StudentRegistrationSsb/ssb"

    # Get cookies
    cookies = get_cookies(base_url)
    if not cookies:
        print("Failed to get cookies. Exiting.")
        return

    # Process all courses with parallel processing
    df = get_courses(base_url, cookies)

    if not df.empty:
        print("\n=== Step 4: Saving Data ===")
        output_path = './Data/courses.csv'
        # to_csv does not create missing directories; without this a fresh
        # checkout would lose the whole scrape to an OSError at the last step.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_csv(output_path, index=False)
        print(f"\nSuccessfully saved data for {len(df)} courses to courses.csv")

        # Display sample of the data
        print("\nSample of collected data:")
        print(df[['CRN', 'Course Title', 'Faculty Name', 'Days']].head())
    else:
        print("\nNo course data was collected")

    print("\n=== Course Data Collection Complete ===")

# Run the scraper when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()

23:19:11 - Attempting to get cookies...
23:19:11 - Successfully got cookies
23:19:11 - Fetching course list...



=== Starting Course Data Collection ===

=== Step 1: Getting Session Cookies ===

=== Step 2: Getting Course List ===


23:19:16 - Found 500 courses to process
23:19:16 - 
Processing course: 34154
23:19:16 - 
Processing course: 30306
23:19:16 - Getting faculty info for course 34154
23:19:16 - 
Processing course: 32831
23:19:16 - 
Processing course: 34157
23:19:16 - 
Processing course: 32215
23:19:16 - Getting faculty info for course 30306
23:19:16 - Getting faculty info for course 32831
23:19:16 - Getting faculty info for course 34157
23:19:16 - Getting faculty info for course 32215
23:19:16 - Getting description for course 34154



=== Step 3: Processing Courses in Parallel ===


23:19:16 - Getting description for course 32831
23:19:16 - Getting description for course 30306
23:19:16 - Getting description for course 34157
23:19:16 - Getting description for course 32215
23:19:16 - Getting prerequisites for course 34154
23:19:16 - Getting prerequisites for course 34157
23:19:16 - Getting prerequisites for course 32831
23:19:16 - Getting prerequisites for course 30306
23:19:16 - Getting prerequisites for course 32215
23:19:16 - Completed processing course: 34154
23:19:16 - 
Processing course: 32828
23:19:16 - Getting faculty info for course 32828
23:19:16 - Completed processing course: 30306
23:19:16 - Completed processing course: 34157
23:19:16 - Completed processing course: 32215
23:19:16 - Completed processing course: 32831
23:19:16 - 
Processing course: 30305
23:19:16 - 
Processing course: 35407
23:19:16 - Getting faculty info for course 35407
23:19:16 - 
Processing course: 33264
23:19:16 - Getting faculty info for course 30305
23:19:16 - 
Processing course: 35


=== Step 4: Saving Data ===

Successfully saved data for 500 courses to courses.csv

Sample of collected data:
     CRN                           Course Title       Faculty Name    Days
0  34154  Computer Science and Its Applications   Lieberherr, Karl        
1  32828                        Lab for CS 1100   Lieberherr, Karl  Friday
2  32829                        Lab for CS 1100   Lieberherr, Karl  Friday
3  32830                        Lab for CS 1100   Lieberherr, Karl  Friday
4  34155                     First Year Seminar  Wassinger, Claire        

=== Course Data Collection Complete ===
