# Course Data Analysis

This notebook analyzes course catalog and class section data from Vanderbilt University.

## Purpose
- Load and examine course catalog data
- Analyze course distributions by subject and school
- Study course attributes and requirements
- Generate insights about course offerings

## Data Sources
- Course catalog CSV: `../data/processed/vanderbilt_courses.csv`
- Class section CSV: `../data/processed/vanderbilt_class_data.csv`

## Outputs
- Course distribution statistics
- Subject analysis
- Prerequisite and corequisite patterns
- Visualizations of course data

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Display options: set both the column-count limit and the output width to None
for _option_name in ('display.max_columns', 'display.width'):
    pd.set_option(_option_name, None)

# Plotting style: apply matplotlib's 'default' style first, then the seaborn
# 'husl' palette (same order as before, so the palette is set after the style)
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Load course data
def load_csv_or_none(path, record_label, missing_message):
    """Read a CSV into a DataFrame, or return None when it cannot be loaded.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        record_label: Noun phrase used in the success message, e.g.
            ``"course records"``.
        missing_message: Text printed when ``path`` does not exist.

    Returns:
        The loaded DataFrame, or ``None`` if the file is missing, empty, or
        cannot be parsed. Problems are reported with ``print`` instead of
        being raised, so cells that guard on ``is not None`` are skipped.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(missing_message)
        return None
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # A zero-byte or malformed file is treated like a missing one:
        # report it and continue with None rather than aborting the run.
        print(f"Could not parse {path}: {exc}")
        return None

    print(f"Loaded {len(frame)} {record_label}")
    return frame


courses_df = load_csv_or_none(
    '../data/processed/vanderbilt_courses.csv',
    "course records",
    "Course catalog file not found.",
)
classes_df = load_csv_or_none(
    '../data/processed/vanderbilt_class_data.csv',
    "class section records",
    "Class section file not found.",
)

## Course Catalog Analysis

In [None]:
if courses_df is not None:
    # Headline counts for the catalog, followed by a preview of the raw rows.
    print("Course Catalog Overview:")
    print("Total courses:", len(courses_df))
    print("Unique subjects:", courses_df['subject'].nunique())
    print("Unique schools:", courses_df['school_code'].nunique())
    print()
    print("First few records:")
    display(courses_df.head())

In [None]:
if courses_df is not None:
    # Count courses per subject and keep the 20 most frequent subjects.
    subject_counts = courses_df['subject'].value_counts().head(20)

    # Draw on an explicit Axes rather than the implicit pyplot "current" one.
    fig, ax = plt.subplots(figsize=(12, 8))
    subject_counts.plot.bar(ax=ax, rot=45)
    ax.set(
        title='Top 20 Subjects by Number of Courses',
        xlabel='Subject Code',
        ylabel='Number of Courses',
    )
    fig.tight_layout()
    plt.show()

In [None]:
if courses_df is not None:
    # Number of catalog entries per school code.
    school_counts = courses_df['school_code'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    school_counts.plot.pie(ax=ax, autopct='%1.1f%%')
    ax.set_title('Course Distribution by School')
    ax.set_ylabel('')  # blank out the y-axis label
    plt.show()

## Class Section Analysis

In [None]:
if classes_df is not None:
    # Headline counts for the class sections, then the per-component tally.
    print("Class Sections Overview:")
    print("Total class sections:", len(classes_df))
    print("Unique subjects:", classes_df['subject'].nunique())
    component_codes = classes_df['component_code']
    print("Unique components:", component_codes.nunique())
    print()
    print("Component distribution:")
    print(component_codes.value_counts())

In [None]:
if classes_df is not None:
    # Histogram of the 'units_earned' column, 20 bins, black bar outlines.
    fig, ax = plt.subplots(figsize=(10, 6))
    classes_df['units_earned'].hist(ax=ax, bins=20, edgecolor='black')
    ax.set_title('Distribution of Course Units')
    ax.set_xlabel('Units Earned')
    ax.set_ylabel('Number of Classes')
    ax.grid(True, alpha=0.3)
    plt.show()

## Requirements Analysis

In [None]:
if courses_df is not None:
    # Report how many catalog rows have a non-null 'prerequisites' or
    # 'corequisites' value, with each count as a percentage of all rows.
    total_courses = len(courses_df)

    # Analyze prerequisites
    prereq_courses = courses_df[courses_df['prerequisites'].notna()]
    # Guard the denominator: a header-only CSV loads as an empty frame, and
    # dividing by zero rows would raise ZeroDivisionError.
    prereq_pct = len(prereq_courses) / total_courses * 100 if total_courses else 0.0
    print(f"Courses with prerequisites: {len(prereq_courses)} ({prereq_pct:.1f}%)")

    # Analyze corequisites
    coreq_courses = courses_df[courses_df['corequisites'].notna()]
    coreq_pct = len(coreq_courses) / total_courses * 100 if total_courses else 0.0
    print(f"Courses with corequisites: {len(coreq_courses)} ({coreq_pct:.1f}%)")

In [None]:
if courses_df is not None:
    # Tally courses by their 'term_offered' value, print the table, then chart it.
    term_analysis = courses_df['term_offered'].value_counts()
    print("Course offerings by term:")
    print(term_analysis)

    fig, ax = plt.subplots(figsize=(8, 6))
    term_analysis.plot.bar(ax=ax, rot=45)
    ax.set(
        title='Course Offerings by Term',
        xlabel='Terms Offered',
        ylabel='Number of Courses',
    )
    fig.tight_layout()
    plt.show()