In [6]:
from pathlib import Path
import pandas as pd

# Directory holding the raw per-term GPA exports (relative to the working directory).
raw_dir = Path('gpa-raw')

# Collect every CSV in that directory and order the paths so the load order is deterministic.
csv_paths = list(raw_dir.glob('*.csv'))
csv_paths.sort()

def read_csv_with_debug(csv_path):
    """Read a CSV with pandas, printing byte-level context if decoding fails.

    Parameters
    ----------
    csv_path : str or pathlib.Path
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    UnicodeDecodeError
        Re-raised unchanged after the diagnostic output has been printed, so
        the caller still sees the original failure.
    """
    try:
        return pd.read_csv(csv_path)
    except UnicodeDecodeError as exc:
        # Locate the problematic byte to help fix the source CSV
        csv_path = Path(csv_path)
        raw = csv_path.read_bytes()

        # The offsets carried by the parser's exception describe the buffer it
        # was decoding at the time and are not guaranteed to be positions in
        # the file. Decoding the whole file once yields absolute offsets that
        # can safely be used to slice ``raw``.
        try:
            raw.decode(exc.encoding)
            bad_start, bad_end = exc.start, exc.end  # fall back to the parser's view
        except UnicodeDecodeError as file_exc:
            bad_start, bad_end = file_exc.start, file_exc.end

        # Show 40 bytes of context on either side, clamped to the file bounds.
        start = max(bad_start - 40, 0)
        end = min(bad_end + 40, len(raw))
        snippet = raw[start:end]
        print(f'UnicodeDecodeError in {csv_path.name} at bytes {bad_start}:{bad_end}')
        print('Raw bytes near error:', snippet)
        # latin-1 maps every byte to a character, so this view never fails.
        print('Raw bytes as latin-1:', snippet.decode('latin-1', errors='replace'))
        raise

# Load every CSV found in raw_dir and stack them into a single frame, tagging
# each row with the file it came from so it stays traceable to its source.
if not csv_paths:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f'No CSV files found in {raw_dir.resolve()}')

dfs = []
for csv_path in csv_paths:
    df = read_csv_with_debug(csv_path)
    df['source_file'] = csv_path.name
    dfs.append(df)

# concat takes the union of all columns and fills the gaps with NaN.
# NOTE(review): the recorded output shows columns such as 'Subject', 'Course'
# and 'Section' next to 'Course Subject', 'Course Number' and 'Course Section',
# which suggests some files use a different header layout; confirm, and
# consider normalising the headers before concatenating.
gpa_raw = pd.concat(dfs, ignore_index=True)
gpa_raw


Unnamed: 0,CRN,Course Subject,Course Number,Course Title,Course Section,Sched Type,Term,Primary Instructor,A+,A,...,% 4.0's,source_file,Subject,Course,A Range,B Range,C Range,D Range,Course.1,Section
0,41758.0,AAS,100.0,Intro Asian American Studies,AD1,DIS,120108.0,"Winkelmann, Marie T",1.0,12.0,...,38%,fa2010.csv,,,,,,,,
1,47102.0,AAS,100.0,Intro Asian American Studies,AD3,DIS,120108.0,"Kwon, Yaejoon",2.0,6.0,...,25%,fa2010.csv,,,,,,,,
2,51248.0,AAS,100.0,Intro Asian American Studies,AD4,DIS,120108.0,"Kwon, Yaejoon",4.0,4.0,...,28%,fa2010.csv,,,,,,,,
3,51249.0,AAS,100.0,Intro Asian American Studies,AD5,DIS,120108.0,"Arnaldo, Constancio",0.0,9.0,...,26%,fa2010.csv,,,,,,,,
4,51932.0,AAS,100.0,Intro Asian American Studies,AD6,DIS,120108.0,"Arnaldo, Constancio",0.0,13.0,...,37%,fa2010.csv,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122578,10060.0,PSYC,204.0,Intro to Brain and Cognition,A,ONL,120250.0,"Dolcos, Sanda M",5.0,15.0,...,,wi2024_2025.csv,,,,,,,,
122579,10106.0,REL,110.0,World Religions,AL1,ONL,120250.0,"Ebel, Jonathan H",0.0,31.0,...,,wi2024_2025.csv,,,,,,,,
122580,10039.0,SPAN,130.0,Intermediate Spanish,A,ONL,120250.0,"Abbott, Annie R",10.0,9.0,...,,wi2024_2025.csv,,,,,,,,
122581,10102.0,SPAN,130.0,Intermediate Spanish,B,ONL,120250.0,"Hughes, Brady A",6.0,4.0,...,,wi2024_2025.csv,,,,,,,,


In [7]:
# Sanity check: after trimming whitespace and dropping a single dangling
# trailing comma, every Primary Instructor value should contain one comma.
instructor_series = (
    gpa_raw['Primary Instructor']
    .astype(str)
    .str.strip()
    .str.replace(r',$', '', regex=True)
)
comma_counts = instructor_series.str.count(',')

# Distinct raw values that break the rule. Missing names are reported here as
# well (the recorded output of this cell is array([nan])).
has_bad_comma_count = comma_counts.ne(1)
invalid_instructors = gpa_raw.loc[has_bad_comma_count, 'Primary Instructor'].unique()
invalid_instructors


array([nan], dtype=object)

In [18]:
# Collapse section-level rows into one row per (course, instructor), taking the
# unweighted mean of the per-section 'Average Grade' values.
required_cols = ['Course Subject', 'Course Number', 'Primary Instructor']
gpa_filtered = gpa_raw.dropna(subset=required_cols).copy()


def _clean_text(column_name):
    """Return a gpa_filtered column cast to str with outer whitespace removed."""
    return gpa_filtered[column_name].astype(str).str.strip()


course_subject = _clean_text('Course Subject')
# Course numbers show up as floats in the raw frame (e.g. 100.0), so the
# trailing '.0' is stripped from their string form.
course_number = _clean_text('Course Number').str.replace(r'\.0$', '', regex=True)
course_name = _clean_text('Course Title')

# Instructor names look like "Last, First": drop one dangling trailing comma,
# then split on the first comma only so the remainder stays in the first name.
instructor_clean = _clean_text('Primary Instructor').str.replace(r',$', '', regex=True)
instructor_split = instructor_clean.str.split(',', n=1, expand=True)
instructor_last, instructor_first = (
    instructor_split[part].str.strip() for part in (0, 1)
)

# Attach the cleaned key columns and coerce the grade column to numeric;
# values that cannot be parsed as numbers become NaN.
gpa_filtered = gpa_filtered.assign(
    course_subject=course_subject,
    course_number=course_number,
    course_name=course_name,
    instructor_last=instructor_last,
    instructor_first=instructor_first,
    **{'Average Grade': pd.to_numeric(gpa_filtered['Average Grade'], errors='coerce')},
)

group_keys = [
    'course_subject',
    'course_number',
    'course_name',
    'instructor_last',
    'instructor_first',
]
master_df = (
    gpa_filtered
    .groupby(group_keys, as_index=False)['Average Grade']
    .mean()
    .rename(columns={'Average Grade': 'avg_gpa'})
)

master_df


Unnamed: 0,course_subject,course_number,course_name,instructor_last,instructor_first,avg_gpa
0,AAS,100,Intro Asian American Studies,Arai,Sayuri,3.640000
1,AAS,100,Intro Asian American Studies,Arnaldo,Constancio,3.597500
2,AAS,100,Intro Asian American Studies,Carter,Regina S,3.745000
3,AAS,100,Intro Asian American Studies,Charity,Hannah L,
4,AAS,100,Intro Asian American Studies,Davis,Thomas E,3.697143
...,...,...,...,...,...,...
33248,ZULU,403,Intermediate Zulu I,Hlongwa,Tholani S,
33249,ZULU,404,Intermediate Zulu II,Hlongwa,Tholani S,
33250,ZULU,404,Intermediate Zulu II,Mkhatshwa,Telamisile P,
33251,ZULU,406,Advanced Zulu II,Hlongwa,Tholani S,


In [20]:
# Keep only course/instructor pairs with a strictly positive mean GPA; rows
# whose avg_gpa is NaN fail the comparison and are dropped as well.
has_positive_gpa = master_df['avg_gpa'].gt(0)
filtered_master_df = master_df.loc[has_positive_gpa]

filtered_master_df

Unnamed: 0,course_subject,course_number,course_name,instructor_last,instructor_first,avg_gpa
0,AAS,100,Intro Asian American Studies,Arai,Sayuri,3.640000
1,AAS,100,Intro Asian American Studies,Arnaldo,Constancio,3.597500
2,AAS,100,Intro Asian American Studies,Carter,Regina S,3.745000
4,AAS,100,Intro Asian American Studies,Davis,Thomas E,3.697143
5,AAS,100,Intro Asian American Studies,Geng,Zhe,3.750000
...,...,...,...,...,...,...
33218,VM,655,SA Medicine and Surgery III,Ridgway,Marcella D,2.760000
33219,VM,656,LA Medicine and Surgery III,Garrett,Edgar F,2.875000
33239,YDSH,220,Jewish Storytelling,Harris,Rachel S,3.560000
33242,YDSH,320,Lit Responses to the Holocaust,Elliott,Jeffrey P,3.920000


In [23]:
# Write the filtered table to a CSV in the working directory, without the index column.
csv_export_name = 'filtered_master_df_01122026.csv'
filtered_master_df.to_csv(csv_export_name, index=False)

In [None]:
# Export filtered_master_df to JSON keyed by "<subject> <number>" (e.g. "AAS 100").
# Each value is a list of {'last', 'first', 'gpa'} records, one per row of the
# frame, in the frame's row order.
import json
from pathlib import Path

# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable output directory so the notebook runs on other machines.
output_path = Path('/Users/aneeshkalla/Desktop/scheduler/gpa_stuff/gpa.json')

# NOTE(review): master_df is also grouped by course_name, so the same
# instructor may appear more than once under one course key if the title
# varies between rows; confirm whether that is intended.
export_cols = ['course_subject', 'course_number', 'instructor_last', 'instructor_first', 'avg_gpa']

grouped = {}
# itertuples on just the needed columns avoids building a Series per row.
for subject, number, last, first, gpa in filtered_master_df[export_cols].itertuples(index=False, name=None):
    grouped.setdefault(f'{subject} {number}', []).append(
        {
            'last': last,
            'first': first,
            'gpa': gpa,
        }
    )

# Create the target directory if needed so the write cannot fail on a missing folder.
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(grouped, indent=2))
output_path


PosixPath('/Users/aneeshkalla/Desktop/scheduler/gpa_stuff/gpa.json')