In [1]:
from scripts.data_extraction import *
from scripts.categories import *

from typing import *

import pandas as pd
import re as regex
import numpy as np

In [2]:
# Output folders, one per pipeline stage. Each ends with a path separator.
RAW_FOLDER = './data/raw/'
PREPROCESSED_FOLDER = './data/preprocessed/'
FINALIZED_FOLDER = './data/finalized/'

# Path to the html file to extract from (fill in before running).
file_path = ''
# XPath of the table inside that file; generated by Microsoft Edge and
# usually works.
table_xpath = '/html/body/main/div/section/form/div[4]/div[2]/table'

# Name of the saved file, reused for every stage's output (fill in before running).
data_file_name = ''

In [None]:
def save(df, folder, name):
    """Write ``df`` to ``name`` inside ``folder`` as CSV, without the index.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        folder: Destination directory, with or without a trailing separator.
        name: File name (including extension) to create inside ``folder``.
    """
    import os  # local import: the notebook's import cell does not load os

    # os.path.join adds a separator only when `folder` lacks one, so
    # './data/raw' and './data/raw/' resolve to the same file instead of the
    # former silently producing './data/rawNAME'.
    df.to_csv(os.path.join(folder, name), index=False)

## Quick note on data

#### STEM
- BILD
- CHEM
- CSE
- ECE
- ENG/BENG/CENG
- MATH
- Subgroups:
  - Proof based: Courses that are mostly based on proof and/or abstract concepts. Usually LLMs cannot handle these courses.
  - Calculation based: Courses that are mostly based on calculations and/or non-abstract concepts. Usually LLMs can handle these courses.
- LLMs are used to tell these two subgroups apart: an LLM is asked to classify whether a course is proof based, based on its catalog entry. Humans then review the result to check for false positives. However, there might be false negatives (i.e. a proof based course might be classified as calculation based, but a calculation based course will never be classified as proof based).

#### Arts
- LT
- MUS
- PHIL
- VIS
- Subgroups:
  - Writing based: Courses that focus mostly on written content
  - Non-writing based: Courses that focus mostly on non-written content
  - LT and PHIL are classified as writing based
  - VIS and MUS are classified as non-writing based

## Extract raw data from html files

In [None]:
# Parse the html file; read_html / interpret_header / interpret_table are
# provided by the star imports at the top (presumably scripts.data_extraction).
tree = read_html(file_path)

# The Edge-generated XPath only "usually works", so report a miss with the
# offending XPath instead of a bare "list index out of range".
matches = tree.xpath(table_xpath)
if not matches:
    raise IndexError(
        f'No table found at XPath {table_xpath!r} in {file_path!r}; '
        'update table_xpath for this page.'
    )
table = matches[0]

rows = table.xpath('.//tr')
if not rows:
    raise IndexError(f'Table at XPath {table_xpath!r} contains no <tr> rows.')

# The first row is passed as the header; the remaining rows are the records.
header = interpret_header(rows[0])
data = interpret_table(rows[1:], header)

# Save the extraction as-is before any preprocessing touches it.
save(pd.DataFrame(data), RAW_FOLDER, data_file_name)

## Preprocess data
- Remove courses from summer sessions
- Remove courses containing missing values
- Convert strings to numbers (if needed)
- Reformat grades (only keep grade point average)

In [None]:
# final_process is called for its effect on `data`; its return value is not
# used (presumably it converts and reformats the extracted values in place).
final_process(data)

# Build the frame and drop every row with a missing value in one step: this
# removes courses from summer sessions and/or courses with missing values.
data = pd.DataFrame(data).dropna()

save(data, PREPROCESSED_FOLDER, data_file_name)

## Finalize data
- Tag if the course is in pre-GPT era or post-GPT era
  - Column isPreGPT, dtype=bool
- Tag the category of data
  - Column isSTEM, dtype=bool
  - Column isAbstract, dtype=bool (STEM subgroup)
  - Column isWritten, dtype=bool (Arts subgroup)
- Tag if the course is upper division
  - Column isUD, dtype=bool

#### Pre/Post-GPT tagging

In [None]:
def extract_year(term):
    """Return the first run of digits in ``term`` as an int.

    Args:
        term: Term label containing digits (e.g. ``'FA21'`` yields ``21``).

    Returns:
        The parsed integer, or ``np.nan`` after emitting a ``RuntimeWarning``
        when ``term`` contains no digits.
    """
    import warnings  # local import: the notebook's import cell does not load it

    year = regex.search(r'\d+', term)
    if year:
        return int(year[0])

    # Warn rather than raise: raising made the NaN fallback below unreachable
    # and aborted the whole `.apply`, contradicting the message itself.
    warnings.warn(f'Improper term: {term}, setting value to nan', RuntimeWarning)
    return np.nan

# A term is tagged pre-GPT when the number extracted from it is below 22.
# The comparison is chained so the column never holds the raw numbers.
data['isPreGPT'] = data['Term'].apply(extract_year).lt(22)

#### Category tagging

In [None]:
# STEM tagging
# LT, PHIL, VIS and MUS are the Arts departments listed in the notes above, so
# a course is STEM when none of those codes appear in its name. The previous
# expression returned True for exactly these Arts codes, inverting the flag.
# NOTE(review): assumes the data only holds the STEM and Arts departments
# listed in the notes; confirm before adding other departments.
data['isSTEM'] = data['Course'].apply(
    lambda course: not any(
        code in course for code in ('LT', 'PHIL', 'VIS', 'MUS')
    )
)

# for STEM courses
# note STEM_ABSTRACT stores course number; the membership test already
# yields a bool, so no explicit True/False branch is needed.
data['isAbstract'] = data['Course'].apply(
    lambda course: course in STEM_ABSTRACT
)

# for Arts courses
# A course counts as written when its name contains LT or PHIL.
data['isWritten'] = data['Course'].apply(
    lambda course: any(code in course for code in ('LT', 'PHIL'))
)

#### Upper Division tagging

In [None]:
def extract_course_number(course):
    """Return the first run of digits in ``course`` as an int.

    Trailing letters are ignored, e.g. ``'MATH 20A'`` yields ``20``.

    Args:
        course: Course name containing the course number.

    Returns:
        The parsed integer, or ``np.nan`` after emitting a ``RuntimeWarning``
        when ``course`` contains no digits.
    """
    import warnings  # local import: the notebook's import cell does not load it

    course_number = regex.search(r'\d+', course)
    if course_number:
        return int(course_number[0])

    # Warn rather than raise: raising made the NaN fallback below unreachable
    # and aborted the whole `.apply`, contradicting the message itself.
    warnings.warn(
        f'Improper course name: {course}, setting value to nan',
        RuntimeWarning,
    )
    return np.nan

# A course is tagged upper division when its extracted number is 100 or above.
# The comparison is chained so the column never holds the raw numbers.
data['isUD'] = data['Course'].apply(extract_course_number).ge(100)

#### Saving

In [None]:
# Write the fully tagged frame to the finalized folder under the same file name.
save(df=data, folder=FINALIZED_FOLDER, name=data_file_name)