In [1]:
# Modifications:
#   20190530 (JTW): remove SESSION from term_id

import numpy as np
import pandas as pd
from datetime import date, datetime
from pathlib import Path


In [2]:
# Folder holding the two section-related files, and the file paths inside it.
output_path = Path(
    r"\\psc-data\E\Applications\Starfish\Files\workingfiles\sections"
)
sfn_output = output_path.joinpath("sections.txt")
tfn_output = output_path.joinpath("teaching.txt")

# Folder and file of the course catalog that is read back in further down.
catalog_path = Path(r"\\psc-data\E\Applications\Starfish\Files\workingfiles\course_catalog")
catalog_fn = catalog_path.joinpath("course_catalog.txt")


In [3]:
# local connection information
# `local_db` is a project-local module that is not part of this notebook; the
# object returned by local_db.connection() is what pd.read_sql_query uses
# below to run the SECTIONS query.
# NOTE(review): this import sits outside the notebook's top import cell.
import local_db
connection = local_db.connection()


In [4]:
# Earliest ACADEMIC_YEAR to pull from SECTIONS. Kept as a string because it is
# interpolated into the SQL text inside quotes ("ACADEMIC_YEAR >= '<year>'");
# it is also the fallback value when ACADEMIC_YEAR cannot be parsed as a number.
sections_begin_year = "2011"


In [5]:
# Query text: every SECTIONS row from the begin year onward, restricted to the
# FALL/SPRING/SUMMER terms and the listed sessions. Adjacent literals are
# joined by the parser, so the resulting string is one single statement.
sql_str = (
    "SELECT * FROM SECTIONS WHERE "
    f"ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sections = pd.read_sql_query(sql_str, connection)

# Only these SECTIONS columns are carried into the working frame `df`.
section_columns = [
    "EVENT_ID",
    "EVENT_SUB_TYPE",
    "EVENT_MED_NAME",
    "SECTION",
    "CREDITS",
    "MAX_PARTICIPANT",
    "ACADEMIC_YEAR",
    "ACADEMIC_TERM",
    "ACADEMIC_SESSION",
    "START_DATE",
    "END_DATE",
    "CIP_CODE",
    "REVISION_DATE",
    "REVISION_TIME",
]
df = df_sections[section_columns]

# Row/column count right after the pull, as a quick sanity check.
print(df.shape)

(6733, 14)


In [6]:
# Remove rows whose EVENT_ID contains "REG" or "STDY" (case-insensitive),
# applying one pattern after the other.
for excluded_pattern in ("REG", "STDY"):
    df = df[~(df["EVENT_ID"].str.contains(excluded_pattern, case=False))]

# Source column -> name used for that column from here on.
renamed_columns = {
    "EVENT_MED_NAME": "course_section_name",
    "CREDITS": "credit_hours",
    "MAX_PARTICIPANT": "maximum_enrollment_count",
    "START_DATE": "start_dt",
    "END_DATE": "end_dt",
    "CIP_CODE": "course_cip_code",
}
df = df.rename(columns=renamed_columns)

# Shape after filtering; the column count is unchanged by the rename.
print(df.shape)

(6503, 14)


In [7]:
def crs_id(c):
    """Build the course id for one row.

    The id is EVENT_ID with every space removed. When EVENT_SUB_TYPE is
    exactly "LAB" or "SI", the upper-cased sub type is appended; any other
    sub type leaves the id as the bare, space-free EVENT_ID.

    `c` is any mapping-like row that supports c["EVENT_ID"] and
    c["EVENT_SUB_TYPE"].
    """
    compact_event_id = str(c["EVENT_ID"]).replace(" ", "")
    sub_type = c["EVENT_SUB_TYPE"]
    if (sub_type == "LAB") or (sub_type == "SI"):
        return compact_event_id + str(sub_type).upper()
    return compact_event_id
# Evaluate crs_id once per row (axis=1) and store the result as course_id;
# course_id is the key used for the catalog merge later in the notebook.
df.loc[:, "course_id"] = df.apply(crs_id, axis=1)


In [8]:
# Pieces of the section key, in order; they are joined with "." below.
# ACADEMIC_TERM is title-cased (e.g. "SPRING" -> "Spring") before joining.
section_key_parts = [
    df["EVENT_ID"],
    df["EVENT_SUB_TYPE"],
    df["ACADEMIC_YEAR"],
    df["ACADEMIC_TERM"].str.title(),
    df["SECTION"],
]
course_section_key = section_key_parts[0]
for key_part in section_key_parts[1:]:
    course_section_key = course_section_key + "." + key_part
df.loc[:, "course_section_id"] = course_section_key

# integration_id is an exact copy of course_section_id.
df.loc[:, "integration_id"] = df.loc[:, "course_section_id"]


In [9]:
def term_id(c):
    """Return the term id for one row: "<ACADEMIC_YEAR>.<Term>".

    ACADEMIC_TERM is title-cased (e.g. "SPRING" -> "Spring"). ACADEMIC_SESSION
    is not part of the id (see the 20190530 entry in the modification log at
    the top of the notebook).
    """
    term_label = str(c["ACADEMIC_TERM"]).title()
    return c["ACADEMIC_YEAR"] + "." + term_label
# Evaluate term_id once per row (axis=1) and store the result in the
# term_id column.
df.loc[:, "term_id"] = df.apply(term_id, axis=1)


In [10]:
# temporarily use academic year as catalog year
# AY is ACADEMIC_YEAR as an integer. Values that cannot be parsed become NaN
# (errors="coerce") and are replaced with the begin year. The fill value is
# converted to int first so the Series stays numeric instead of being upcast
# to object by a string fill before the final int64 cast.
df["AY"] = (
    pd.to_numeric(df["ACADEMIC_YEAR"], errors="coerce")
    .fillna(int(sections_begin_year))
    .astype(np.int64)
)
def cat_yr(c):
    """Return the provisional catalog year for one row.

    FALL rows keep their AY value; every other term gets AY - 1.
    """
    if c["ACADEMIC_TERM"] == "FALL":
        return c["AY"]
    return c["AY"] - 1
# Evaluate cat_yr once per row (axis=1); catalog_year feeds the
# course_integration_id built further down.
df.loc[:, "catalog_year"] = df.apply(cat_yr, axis=1)


In [11]:
def crs_sect_delv(c):
    """Return the delivery code for one row from the SECTION prefix.

    The first two characters of str(SECTION) select the code:
    "HY" -> "03", "ON" -> "02", anything else -> "01".
    NOTE(review): presumably HY = hybrid and ON = online; the meaning of the
    codes is not defined in this notebook - confirm against the feed spec.
    """
    codes_by_prefix = {"HY": "03", "ON": "02"}
    section_prefix = str(c["SECTION"])[:2]
    return codes_by_prefix.get(section_prefix, "01")
# Evaluate crs_sect_delv once per row (axis=1) and store the two-character
# code string in course_section_delivery.
df.loc[:, "course_section_delivery"] = df.apply(crs_sect_delv, axis=1)


In [12]:
def crs_integ_id(c):
    """Return the course integration id for one row.

    Format is "<EVENT_ID>.<EVENT_SUB_TYPE>.<catalog_year>"; when
    EVENT_SUB_TYPE is the empty string the middle part is left out, giving
    "<EVENT_ID>.<catalog_year>". EVENT_ID keeps its spaces here.
    """
    year_suffix = str(c["catalog_year"])
    if c["EVENT_SUB_TYPE"] == "":
        return c["EVENT_ID"] + "." + year_suffix
    return c["EVENT_ID"] + "." + c["EVENT_SUB_TYPE"] + "." + year_suffix
# Evaluate crs_integ_id once per row (axis=1); the resulting id is compared
# against the catalog's integration ids after the catalog merge.
df.loc[:, "course_integration_id"] = df.apply(crs_integ_id, axis=1)


In [13]:
# Load course_catalog.txt and keep only the join key plus the catalog's
# integration id, renamed to cat_integ_id so it does not collide with the
# integration_id column already present on the sections frame.
dfcat = pd.read_csv(catalog_fn)[["course_id", "integration_id"]].rename(
    columns={"integration_id": "cat_integ_id"}
)

# Left join: every section row is kept, repeated once per catalog row that
# shares its course_id.
df = df.merge(dfcat, how="left", on=["course_id"])


In [14]:
# keep catalog_year before course year
# String comparison: keep only catalog rows whose integration id is not
# greater than the section's own course_integration_id. Rows with no catalog
# match (missing cat_integ_id) do not satisfy the comparison and drop out.
df = df.loc[df["course_integration_id"] >= df["cat_integ_id"]]

# One row per section. course_integration_id is identical on every candidate
# row of a given course_section_id, so on its own it cannot order the catalog
# candidates; cat_integ_id is added as the final sort key so keep="last"
# deterministically selects the greatest remaining catalog id instead of
# whichever row happened to come last in the catalog file.
df = df.sort_values(
    ["course_section_id", "course_integration_id", "cat_integ_id"],
    ascending=[True, True, True],
).drop_duplicates(["course_section_id"], keep="last")


In [15]:
# Spot check: display the rows kept for EVENT_ID "COM 198".
df[df["EVENT_ID"] == "COM 198"]

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
9503,COM 198,LEC,InteractiveStorytell,ON01,1.0,20,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 07:51:19.793,COM198,COM 198.LEC.2020.Spring.ON01,COM 198.LEC.2020.Spring.ON01,2020.Spring,2020,2019,2,COM 198.LEC.2019,COM 198.LEC.2019
9504,COM 198,LEC,InteractiveStorytell,ON02,0.0,10,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 08:49:14.143,COM198,COM 198.LEC.2020.Spring.ON02,COM 198.LEC.2020.Spring.ON02,2020.Spring,2020,2019,2,COM 198.LEC.2019,COM 198.LEC.2019


In [17]:
# Spot check: display the rows kept for EVENT_ID "CUL 198".
df[df["EVENT_ID"] == "CUL 198"]

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
9512,CUL 198,LEC,30 Ways to Presv Fd,ON01,1.0,20,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 10:30:21.183,CUL198,CUL 198.LEC.2020.Spring.ON01,CUL 198.LEC.2020.Spring.ON01,2020.Spring,2020,2019,2,CUL 198.LEC.2019,CUL 198.LEC.2019
9513,CUL 198,LEC,30 Ways to Presv Fd,ON02,0.0,10,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 13:20:16.043,CUL198,CUL 198.LEC.2020.Spring.ON02,CUL 198.LEC.2020.Spring.ON02,2020.Spring,2020,2019,2,CUL 198.LEC.2019,CUL 198.LEC.2019


In [18]:
# Spot check: display the rows kept for EVENT_ID "HUM 198".
df[df["EVENT_ID"] == "HUM 198"]

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
9656,HUM 198,LEC,Movies & Psy of Cris,ON01,1.0,20,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 10:30:21.233,HUM198,HUM 198.LEC.2020.Spring.ON01,HUM 198.LEC.2020.Spring.ON01,2020.Spring,2020,2019,2,HUM 198.LEC.2019,HUM 198.LEC.2019
9657,HUM 198,LEC,Movies & Psy of Cris,ON02,0.0,10,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 08:38:17.323,HUM198,HUM 198.LEC.2020.Spring.ON02,HUM 198.LEC.2020.Spring.ON02,2020.Spring,2020,2019,2,HUM 198.LEC.2019,HUM 198.LEC.2019


In [19]:
# Spot check: display the rows kept for EVENT_ID "HOS 199".
df[df["EVENT_ID"] == "HOS 199"]

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
9632,HOS 199,LEC,ArtOfTable Etiquette,ON01,1.0,20,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 08:43:51.167,HOS199,HOS 199.LEC.2020.Spring.ON01,HOS 199.LEC.2020.Spring.ON01,2020.Spring,2020,2019,2,HOS 199.LEC.2019,HOS 199.LEC.2019
9633,HOS 199,LEC,ArtOfTable Etiquette,ON02,0.0,10,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 08:17:40.487,HOS199,HOS 199.LEC.2020.Spring.ON02,HOS 199.LEC.2020.Spring.ON02,2020.Spring,2020,2019,2,HOS 199.LEC.2019,HOS 199.LEC.2019


In [20]:
# Spot check: display the rows kept for EVENT_ID "SCI 198".
df[df["EVENT_ID"] == "SCI 198"]

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
9759,SCI 198,LEC,Sci & Techn Today,ON01,1.0,20,2020,SPRING,MAIN,2020-03-30,...,1900-01-01 09:53:35.773,SCI198,SCI 198.LEC.2020.Spring.ON01,SCI 198.LEC.2020.Spring.ON01,2020.Spring,2020,2019,2,SCI 198.LEC.2019,SCI 198.LEC.2018
