In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import gzip

def trim_all_columns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)

In [2]:
df_gpa = pd.read_csv("https://github.com/wadefagen/datasets/raw/master/gpa/uiuc-gpa-dataset.csv").rename(columns={"Year":"year","Term":"term","Subject":"subject","Number":"number","Primary Instructor":"instructor"})

df_gpa["instructor"] = df_gpa['instructor'].str.extract(r'(\w+, \w)')

df_gpa['total_students'] = df_gpa['A+'] + df_gpa['A'] + df_gpa['A-'] + df_gpa['B'] + df_gpa['B+'] + df_gpa['B-'] + df_gpa['C+'] + df_gpa['C'] + df_gpa['C-'] + df_gpa['D+'] + df_gpa['D'] + df_gpa['D-'] + df_gpa['F']

df_gpa['gpa'] = (df_gpa['A+'] * 4 + df_gpa['A'] * 4 + df_gpa['A-'] * 3.67 + df_gpa['B'] * 3 + df_gpa['B+'] * 3.33 + df_gpa['B-'] * 2.67 + df_gpa['C+'] * 2.33 + df_gpa['C'] * 2 + df_gpa['C-'] * 1.67 + df_gpa['D+'] * 1.33 + df_gpa['D'] + df_gpa['D-'] * 0.67) / df_gpa['total_students']

df_gpa = df_gpa.groupby(["year", "term", "subject", "number", "instructor"], as_index=False).agg({"gpa": "mean", "total_students": "sum", "A+": "sum", "A": "sum", "A-": "sum", "B+": "sum", "B": "sum", "B-": "sum", "C+": "sum", "C": "sum", "C-": "sum", "D+": "sum", "D": "sum", "D-": "sum", "F": "sum"})

df_gpa["course"] = df_gpa["subject"] + " " + df_gpa["number"].astype(str)

df_gpa

Unnamed: 0,year,term,subject,number,instructor,gpa,total_students,A+,A,A-,...,B,B-,C+,C,C-,D+,D,D-,F,course
0,2010,Fall,AAS,100,"Arnaldo, C",3.463613,69,0,22,21,...,8,3,1,2,1,0,0,1,0,AAS 100
1,2010,Fall,AAS,100,"Kwon, Y",3.358982,61,6,10,14,...,5,2,3,1,0,0,0,1,1,AAS 100
2,2010,Fall,AAS,100,"Manalansan, M",3.980294,34,21,12,0,...,0,0,0,0,0,0,0,0,0,AAS 100
3,2010,Fall,AAS,100,"Winkelmann, M",3.422059,34,1,12,11,...,0,0,1,2,0,0,1,0,1,AAS 100
4,2010,Fall,AAS,120,"Lee, A",3.127315,65,8,11,3,...,13,5,2,5,1,0,4,0,0,AAS 120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39791,2020,Summer,TAM,212,"Chang, W",2.808077,52,5,7,7,...,6,6,0,5,2,3,1,0,4,TAM 212
39792,2020,Summer,TAM,251,"Kim, S",3.417083,48,5,14,8,...,10,2,1,3,1,0,0,0,0,TAM 251
39793,2020,Summer,TAM,335,"Ramlawi, N",3.097500,24,2,4,4,...,3,3,1,1,0,0,1,0,1,TAM 335
39794,2020,Summer,THEA,101,"Morrissette, J",3.586707,59,35,4,4,...,2,5,2,1,0,1,0,0,1,THEA 101


In [4]:
terms = [
    "../raw/{}-{}.csv".format(year, term)
    for year in range(df_gpa["year"].min(), df_gpa["year"].max() + 2) 
      for term in ["Winter", "Spring", "Summer", "Fall"] 
        if Path("../raw/{}-{}.csv".format(year, term)).is_file()
]

df_catalogs = pd.concat([pd.read_csv(term) for term in terms], ignore_index=True)
df_catalogs["term"] = pd.Categorical(df_catalogs["term"], ["Fall","Summer","Spring","Winter"], ordered=True)
df_catalogs.sort_values(by=["year", "term", "subject", "number", "crn", "meeting"], ascending=[False, True, True, True, True, True], ignore_index=True, inplace=True)
df_catalogs["course"] = df_catalogs["subject"] + " " + df_catalogs["number"].astype(str)

# Fix typos in descriptions
df_catalogs.loc[df_catalogs["course"]=="HIST 574", "description"]="Immerses students in major works of recent American religious history. Written from multiple disciplinary perspectives and wrestling with the knotty problems in which religion has been interwoven, these books will give the student a solid foundation in American religious history. 4 graduate hours. No professional credit."
df_catalogs.loc[df_catalogs["course"]=="ASST 104", "description"] = "Same as REL 104. See REL 104."
df_catalogs.loc[df_catalogs["course"]=="EPOL 551", "description"] = "Same as EOL 570. See EOL 570."
df_catalogs = trim_all_columns(df_catalogs)

df_catalogs

Unnamed: 0,year,term,college,subject,subject_name,number,name,description,credit_hours,gen_ed,...,meeting,type,type_name,start_time,end_time,days,room,building,instructor,course
0,2021,Spring,KV,AAS,Asian American Studies,100,Intro Asian American Studies,Interdisciplinary introduction to the basic co...,3 hours.,1US,...,0,OLC,Online Lecture,01:00 PM,01:50 PM,MW,,,"Tabares, L",AAS 100
1,2021,Spring,KV,AAS,Asian American Studies,100,Intro Asian American Studies,Interdisciplinary introduction to the basic co...,3 hours.,1SS,...,0,OLC,Online Lecture,01:00 PM,01:50 PM,MW,,,"Tabares, L",AAS 100
2,2021,Spring,KV,AAS,Asian American Studies,100,Intro Asian American Studies,Interdisciplinary introduction to the basic co...,3 hours.,1US,...,0,OD,Online Discussion,09:00 AM,09:50 AM,F,,,"Boonsripaisal, S",AAS 100
3,2021,Spring,KV,AAS,Asian American Studies,100,Intro Asian American Studies,Interdisciplinary introduction to the basic co...,3 hours.,1SS,...,0,OD,Online Discussion,09:00 AM,09:50 AM,F,,,"Boonsripaisal, S",AAS 100
4,2021,Spring,KV,AAS,Asian American Studies,100,Intro Asian American Studies,Interdisciplinary introduction to the basic co...,3 hours.,1US,...,0,OD,Online Discussion,10:00 AM,10:50 AM,F,,,"Boonsripaisal, S",AAS 100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414301,2010,Spring,KV,YDSH,Germanic Languages and Literatures,420,Jewish Life-Writing,Jewish life-writing from the late 18th century...,3 OR 4 hours.,,...,0,LCD,Lecture-Discussion,02:00 PM,04:50 PM,M,313,Davenport Hall,"Harris, R",YDSH 420
414302,2010,Spring,KV,ZULU,Linguistics,404,Intermediate Zulu II,Continuation of ZULU 403; emphasis on increasi...,4 hours.,,...,0,LCD,Lecture-Discussion,10:00 AM,10:50 AM,MTWR,243,Armory,"Bokamba, E",ZULU 404
414303,2010,Spring,KV,ZULU,Linguistics,404,Intermediate Zulu II,Continuation of ZULU 403; emphasis on increasi...,4 hours.,,...,0,LCD,Lecture-Discussion,10:00 AM,10:50 AM,MTWR,243,Armory,"Hlongwa, T",ZULU 404
414304,2010,Spring,KV,ZULU,Linguistics,406,Advanced Zulu II,Continuation of Zulu 405 with increased emphas...,3 hours.,,...,0,LCD,Lecture-Discussion,11:00 AM,12:20 PM,TR,57,Everitt Laboratory,"Bokamba, E",ZULU 406


In [21]:
df_subjects = df_catalogs[["subject", "subject_name"]].drop_duplicates(ignore_index=True).dropna().rename(columns={"subject": "ID", "subject_name": "name"})
df_subjects.drop(233, axis=0, inplace=True)
df_subjects[df_subjects["ID"]=="MACS"]
df_subjects.to_csv("../neo4j/nodes/subject_nodes.csv", index=False)
df_subjects

Unnamed: 0,ID,name
0,AAS,Asian American Studies
1,ABE,Agricultural and Biological Engineering
2,ACCY,Accountancy
3,ACE,Agricultural and Consumer Economics
4,ACES,"Agricultural, Consumer and Environmental Sciences"
...,...,...
238,GWS,Gender and Women's Studies Program
239,HEBR,"Religion, Program for the Study of"
240,LGLA,Linguistics
241,VB,Veterinary Biosciences
