In [1]:
import numpy as np
import pandas as pd
from datetime import date, datetime


In [2]:
# local connection information
# `local_db` is a project-local module (not shown in this notebook); the object
# returned by `connection()` is what the queries below pass to
# `pd.read_sql_query` as the database connection.
import local_db

connection = local_db.connection()


In [3]:
# Earliest ACADEMIC_YEAR to extract. Kept as a string because it is interpolated
# into the SQL text as a quoted literal (ACADEMIC_YEAR >= '<year>'); it is also
# reused as the fallback year when ACADEMIC_YEAR cannot be parsed as a number.
sections_begin_year = "2011"


In [4]:
# Pull every SECTIONS row from `sections_begin_year` onward, restricted to the
# regular terms and the listed sessions, excluding the 'ADV' sub type.
# The fragments rely on implicit literal concatenation; each keeps its trailing
# space so the clauses stay separated in the final statement.
sql_str = (
    "SELECT * FROM SECTIONS WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sections = pd.read_sql_query(sql=sql_str, con=connection)


In [5]:
# Columns of SECTIONS that the remaining cells read or export.
section_columns = [
    "EVENT_ID",
    "EVENT_SUB_TYPE",
    "EVENT_MED_NAME",
    "SECTION",
    "CREDITS",
    "MAX_PARTICIPANT",
    "ACADEMIC_YEAR",
    "ACADEMIC_TERM",
    "ACADEMIC_SESSION",
    "START_DATE",
    "END_DATE",
    "CIP_CODE",
    "REVISION_DATE",
    "REVISION_TIME",
]
df = df_sections[section_columns]

# Show which term and session values actually came back from the query.
term_values = df["ACADEMIC_TERM"].unique()
print("ACADEMIC_TERM: ", term_values)

session_values = df["ACADEMIC_SESSION"].unique()
print("ACADEMIC_SESSION: ", session_values)


In [6]:
# Sanity check: (rows, columns) of the selected section columns, then a preview.
print(df.shape)
df.head()


(6150, 14)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,EVENT_MED_NAME,SECTION,CREDITS,MAX_PARTICIPANT,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,START_DATE,END_DATE,CIP_CODE,REVISION_DATE,REVISION_TIME
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,2011-12-16,,2013-08-19,1900-01-01 12:26:41.477
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,2011-12-16,,2012-05-07,1900-01-01 13:59:40.743
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,2011-12-16,,2011-12-16,1900-01-01 17:22:31.393
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,2011-12-16,,2011-09-28,1900-01-01 13:25:25.420
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,2011-12-16,,2012-01-19,1900-01-01 12:47:48.237


In [7]:
# Discard rows whose EVENT_ID contains "REG" or "STDY" (case-insensitive).
# One alternation pattern replaces the two successive filters; a row is removed
# when either substring is present, which is what the two passes did together.
unwanted_event = df["EVENT_ID"].str.contains("REG|STDY", case=False)
df = df[~unwanted_event]


In [8]:
# Row count after dropping the REG / STDY events.
print(df.shape)


(5920, 14)


In [9]:
# Source column -> name used in the exported files. Columns not listed here
# keep their original upper-case names.
export_names = {
    "EVENT_MED_NAME": "course_section_name",
    "CREDITS": "credit_hours",
    "MAX_PARTICIPANT": "maximum_enrollment_count",
    "START_DATE": "start_dt",
    "END_DATE": "end_dt",
    "CIP_CODE": "course_cip_code",
}
df = df.rename(columns=export_names)


In [10]:
def crs_id(c):
    """Build the course id for one row.

    The id is EVENT_ID with all spaces removed. For the sub types "LAB" and
    "SI" the upper-cased sub type is appended (e.g. "BIO 102" + "LAB" ->
    "BIO102LAB"); every other sub type yields the bare id.

    `c` is any mapping-like row exposing "EVENT_ID" and "EVENT_SUB_TYPE".

    NOTE(review): a saved output further down shows "BIO102lab" (lower-case
    suffix) - confirm which casing course_catalog.txt uses for `course_id`.
    """
    compact_event_id = str(c["EVENT_ID"]).replace(" ", "")
    sub_type = c["EVENT_SUB_TYPE"]
    if sub_type in ("LAB", "SI"):
        return compact_event_id + str(sub_type).upper()
    return compact_event_id
# Evaluate `crs_id` once per row (axis=1) and store the result as `course_id`.
df.loc[:, "course_id"] = df.apply(crs_id, axis=1)


In [11]:
# course_section_id = EVENT_ID.EVENT_SUB_TYPE.ACADEMIC_YEAR.Term.SECTION
# (term is title-cased, e.g. "FALL" -> "Fall"). All pieces are concatenated as
# strings, so every column involved must already hold strings.
section_id_parts = [
    df["EVENT_ID"],
    df["EVENT_SUB_TYPE"],
    df["ACADEMIC_YEAR"],
    df["ACADEMIC_TERM"].str.title(),
    df["SECTION"],
]
section_id = section_id_parts[0]
for next_part in section_id_parts[1:]:
    section_id = section_id + "." + next_part
df.loc[:, "course_section_id"] = section_id

# The exported integration id is the same value as the section id.
df.loc[:, "integration_id"] = df.loc[:, "course_section_id"]


In [12]:
def term_id(c):
    """Build the term id for one row.

    Rows in the "MAIN" session get "<ACADEMIC_YEAR>.<Term>"; any other session
    gets "<ACADEMIC_YEAR>.<Term>.<ACADEMIC_SESSION>". The term is title-cased.
    ACADEMIC_YEAR and ACADEMIC_SESSION are concatenated directly, so they must
    be strings.
    """
    year_and_term = c["ACADEMIC_YEAR"] + "." + str(c["ACADEMIC_TERM"]).title()
    if c["ACADEMIC_SESSION"] == "MAIN":
        return year_and_term
    return year_and_term + "." + c["ACADEMIC_SESSION"]
# Row-wise evaluation; note the new column shares its name with the `term_id`
# callable defined just above.
df.loc[:, "term_id"] = df.apply(term_id, axis=1)


In [13]:
# Integer academic year. Values of ACADEMIC_YEAR that cannot be parsed become
# NaN (errors="coerce") and are replaced by the configured begin year.
# The fallback is converted to int first: `sections_begin_year` is a string,
# and filling a float Series with a string would upcast it to object dtype
# before the final cast to int64.
df["AY"] = (
    pd.to_numeric(df["ACADEMIC_YEAR"], errors="coerce")
    .fillna(int(sections_begin_year))
    .astype(np.int64)
)
def cat_yr(c):
    """Return the catalog year for one row.

    FALL rows use the academic year ("AY") as is; every other term uses AY - 1.
    """
    if c["ACADEMIC_TERM"] == "FALL":
        return c["AY"]
    return c["AY"] - 1
# Row-wise evaluation of `cat_yr`; the result is stored as `catalog_year`.
df.loc[:, "catalog_year"] = df.apply(cat_yr, axis=1)


In [14]:
def crs_sect_delv(c):
    """Return the delivery code for one row, based on the SECTION prefix.

    "HY..." -> "03", "ON..." -> "02", anything else -> "01".
    (Presumably hybrid / online / in-person - confirm against the target spec.)
    """
    prefix_to_code = {"HY": "03", "ON": "02"}
    section_prefix = str(c["SECTION"])[:2]
    return prefix_to_code.get(section_prefix, "01")
# Row-wise evaluation of `crs_sect_delv`; values are the strings "01"/"02"/"03".
df.loc[:, "course_section_delivery"] = df.apply(crs_sect_delv, axis=1)


In [15]:
# Sanity check after adding the derived id / term / year / delivery columns.
print(df.shape)
df.head()


(5920, 21)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,course_cip_code,REVISION_DATE,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,,2013-08-19,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,,2012-05-07,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,,2011-12-16,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,...,,2011-09-28,1900-01-01 13:25:25.420,ACC201,ACC 201.LEC.2011.Fall.01,ACC 201.LEC.2011.Fall.01,2011.Fall,2011,2011,1
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,...,,2012-01-19,1900-01-01 12:47:48.237,ACC301,ACC 301.LEC.2011.Fall.01,ACC 301.LEC.2011.Fall.01,2011.Fall,2011,2011,1


In [None]:
# Ad-hoc inspection: all rows whose SECTION starts with "HY" (the prefix that
# maps to delivery code "03"). Displays every matching row, not just a preview.
df[(df["SECTION"].str[:2] == "HY")]


In [16]:
def crs_integ_id(c):
    """Build the course integration id for one row.

    "<EVENT_ID>.<EVENT_SUB_TYPE>.<catalog_year>", or "<EVENT_ID>.<catalog_year>"
    when EVENT_SUB_TYPE is the empty string. EVENT_ID and EVENT_SUB_TYPE are
    concatenated directly and therefore must be strings.
    """
    year_suffix = "." + str(c["catalog_year"])
    if c["EVENT_SUB_TYPE"] == "":
        return c["EVENT_ID"] + year_suffix
    return c["EVENT_ID"] + "." + c["EVENT_SUB_TYPE"] + year_suffix
# Row-wise evaluation of `crs_integ_id`; this provisional value is later
# compared against, and then replaced by, the catalog's own integration id.
df.loc[:, "course_integration_id"] = df.apply(crs_integ_id, axis=1)


In [17]:
# Sanity check after adding `course_integration_id`.
print(df.shape)
df.head()


(5920, 22)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_DATE,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,2013-08-19,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 101.LEC.2011
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,2012-05-07,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC 101.LEC.2011
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,2011-12-16,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC 101.LEC.2011
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,...,2011-09-28,1900-01-01 13:25:25.420,ACC201,ACC 201.LEC.2011.Fall.01,ACC 201.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 201.LEC.2011
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,...,2012-01-19,1900-01-01 12:47:48.237,ACC301,ACC 301.LEC.2011.Fall.01,ACC 301.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 301.LEC.2011


In [15]:
# Load the previously generated course catalog; its integration ids are used
# below to pick the catalog year that applies to each section.
catalog_file = "../course_catalog/course_catalog.txt"
dfcat = pd.read_csv(catalog_file)
print(dfcat.shape)


(652, 8)


In [None]:
# Preview of the catalog file as loaded.
dfcat.head()


In [16]:
# Keep only the join key and the catalog's integration id, renaming the latter
# so it does not clash with the sections frame's own `integration_id` column.
# Not re-runnable as is: after the rename, "integration_id" is gone from dfcat.
catalog_keys = dfcat.loc[:, ["course_id", "integration_id"]]
dfcat = catalog_keys.rename(columns={"integration_id": "cat_integ_id"})


In [None]:
# Sanity check of the trimmed catalog frame (course_id, cat_integ_id).
print(dfcat.shape)
dfcat.head()


In [17]:
# Attach every catalog entry for the section's course. A course with several
# catalog entries yields several rows per section; sections with no catalog
# entry keep a missing cat_integ_id (left join).
df = df.merge(dfcat, how="left", on="course_id")


In [None]:
# Sanity check after the catalog merge (row count grows when a course has
# several catalog entries).
print(df.shape)
df.head()


In [18]:
df = df.sort_values(by=["integration_id", "course_integration_id"])

# Keep only catalog entries that are not newer than the section's own course
# year, i.e. cat_integ_id <= course_integration_id.
# NOTE(review): this compares the two ids as strings; it presumably relies on
# both sharing the same "<EVENT_ID>.<SUB_TYPE>." prefix so that only the
# trailing year decides - confirm against course_catalog.txt.
# NOTE(review): rows without a catalog match (missing cat_integ_id) appear to be
# dropped here as well - verify that this is intended.
catalog_not_newer = df["course_integration_id"] >= df["cat_integ_id"]
df = df.loc[catalog_not_newer]


In [None]:
# Inspect the rows that survived the catalog-year filter.
print(df.shape)
df.head(40)


In [None]:
# Order by section, then by its provisional course integration id (ascending,
# the default), so duplicated sections sit next to each other when inspected.
section_sort_keys = ["course_section_id", "course_integration_id"]
df = df.sort_values(by=section_sort_keys)


In [None]:
# Inspect the frame in section order.
print(df.shape)
df.head(40)


In [None]:
# Rows whose integration_id already appeared earlier in the frame, i.e. sections
# that matched more than one catalog entry.
df[df.duplicated(["integration_id"])]


In [19]:
print(df.shape)
# Reduce to one row per section, keeping the newest eligible catalog entry.
# `course_integration_id` is derived from EVENT_ID, EVENT_SUB_TYPE and the
# catalog year (itself derived from year and term), all of which are fixed by
# `course_section_id`, so it cannot order a section's duplicates. The rows only
# differ in `cat_integ_id`, which is therefore added as the final sort key so
# that keep="last" selects the highest catalog id instead of whatever order the
# catalog file happened to be in.
df = df.sort_values(
    ["course_section_id", "course_integration_id", "cat_integ_id"],
    ascending=[True, True, True],
).drop_duplicates(["course_section_id"], keep="last")
print(df.shape)


(4916, 23)
(4415, 23)


In [None]:
# Inspect the de-duplicated frame (one row per course_section_id).
print(df.shape)
df.head(40)


In [20]:
# Replace the provisional course_integration_id with the id of the catalog
# entry selected for the section.
df.loc[:, "course_integration_id"] = df.loc[:, "cat_integ_id"]


In [None]:
# Inspect the frame after switching to the catalog's integration ids.
print(df.shape)
df.head(40)


In [21]:
# save for teaching.txt below
# Independent copy taken before `df` is trimmed to the export columns; the
# teaching.txt section still needs the original key columns (ACADEMIC_YEAR,
# ACADEMIC_TERM, ACADEMIC_SESSION, EVENT_ID, EVENT_SUB_TYPE, SECTION) to join
# against SECTIONPER.
dfs = df.copy()


In [22]:
# Preview of the snapshot kept for the teaching.txt section.
dfs.head()


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
414,ACC 101,LEC,Financial Accounting,1,3.0,37,2011,SPRING,MAIN,2011-01-24,...,1900-01-01 16:19:39.223,ACC101,ACC 101.LEC.2011.Spring.01,ACC 101.LEC.2011.Spring.01,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010
415,ACC 101,LEC,Financial Accounting,2,3.0,40,2011,SPRING,MAIN,2011-01-24,...,1900-01-01 11:55:12.630,ACC101,ACC 101.LEC.2011.Spring.02,ACC 101.LEC.2011.Spring.02,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010


In [23]:
# Columns written to the sections file, in output order.
sections_export_columns = [
    "integration_id",
    "course_section_name",
    "course_section_id",
    "start_dt",
    "end_dt",
    "term_id",
    "course_integration_id",
    "course_section_delivery",
    "maximum_enrollment_count",
    "credit_hours",
]
df = df[sections_export_columns].sort_values("integration_id")


In [None]:
# Final check of the frame that is about to be written to the sections file.
print(df.shape)
df.head()


In [24]:
# Write the sections export to the working directory as
# <YYYYMMDD>_sections.txt (comma-separated, without the index column).
today = date.today().strftime("%Y%m%d")
fn_output = f"{today}_sections.txt"
df.to_csv(path_or_buf=fn_output, index=False)


## teaching.txt


In [25]:
# Same filters as the SECTIONS query, applied to SECTIONPER (the table the
# PERSON_CODE_ID per section is read from).
# The fragments rely on implicit literal concatenation; each keeps its trailing
# space so the clauses stay separated in the final statement.
sql_str = (
    "SELECT * FROM SECTIONPER WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sectionper = pd.read_sql_query(sql=sql_str, con=connection)


In [26]:
# Keep the six columns that identify a section plus the person assigned to it.
sectionper_columns = [
    "ACADEMIC_YEAR",
    "ACADEMIC_TERM",
    "ACADEMIC_SESSION",
    "EVENT_ID",
    "EVENT_SUB_TYPE",
    "SECTION",
    "PERSON_CODE_ID",
]
df_sectionper = df_sectionper.loc[:, sectionper_columns]


In [27]:
# Compare sizes before joining: person assignments vs. de-duplicated sections.
print(df_sectionper.shape)
print(dfs.shape)


(4940, 7)
(4415, 23)


In [31]:
# Columns that together identify a section in both frames.
section_key_columns = [
    "ACADEMIC_YEAR",
    "ACADEMIC_TERM",
    "ACADEMIC_SESSION",
    "EVENT_ID",
    "EVENT_SUB_TYPE",
    "SECTION",
]
# Left join: every section is kept; a section with several people assigned
# yields one row per person, one with none gets a missing PERSON_CODE_ID.
dft = dfs.merge(df_sectionper, how="left", on=section_key_columns)


In [32]:
# Sanity check of the section/person join.
print(dft.shape)
dft.head()


(4620, 24)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id,PERSON_CODE_ID
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
3,ACC 101,LEC,Financial Accounting,1,3.0,37,2011,SPRING,MAIN,2011-01-24,...,ACC101,ACC 101.LEC.2011.Spring.01,ACC 101.LEC.2011.Spring.01,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
4,ACC 101,LEC,Financial Accounting,2,3.0,40,2011,SPRING,MAIN,2011-01-24,...,ACC101,ACC 101.LEC.2011.Spring.02,ACC 101.LEC.2011.Spring.02,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065


In [33]:
# Sections for which the left join found no person in SECTIONPER.
dft[dft["PERSON_CODE_ID"].isnull()].head()


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id,PERSON_CODE_ID
55,BAK 150,COMB,Found In Baking A3,1,4.0,0,2012,FALL,MAIN,2012-08-29,...,BAK150,BAK 150.COMB.2012.Fall.01,BAK 150.COMB.2012.Fall.01,2012.Fall,2012,2012,1,BAK 150.COMB.2010,BAK 150.COMB.2010,
164,BAK 270,COMB,InternatBakPastryA3,3,4.0,0,2012,FALL,MAIN,2012-08-29,...,BAK270,BAK 270.COMB.2012.Fall.03,BAK 270.COMB.2012.Fall.03,2012.Fall,2012,2012,1,BAK 270.COMB.2011,BAK 270.COMB.2011,
182,BAK 275,COMB,Confect&DecorWkB3,3,4.0,0,2012,FALL,MAIN,2012-10-17,...,BAK275,BAK 275.COMB.2012.Fall.03,BAK 275.COMB.2012.Fall.03,2012.Fall,2012,2012,1,BAK 275.COMB.2010,BAK 275.COMB.2010,
317,BIO 101,LEC,Biology I,2,4.0,0,2012,FALL,MAIN,2012-08-29,...,BIO101,BIO 101.LEC.2012.Fall.02,BIO 101.LEC.2012.Fall.02,2012.Fall,2012,2012,1,BIO 101.LEC.2011,BIO 101.LEC.2011,
339,BIO 102,LAB,Biology II,1,0.0,0,2013,SPRING,MAIN,2013-01-23,...,BIO102lab,BIO 102.LAB.2013.Spring.01,BIO 102.LAB.2013.Spring.01,2013.Spring,2013,2012,1,BIO 102.LAB.2010,BIO 102.LAB.2010,


In [34]:
# Drop the sections that have no person assigned; the shape is printed before
# and after so the number of removed rows is visible.
print(dft.shape)
has_person = dft["PERSON_CODE_ID"].notnull()
dft = dft[has_person]
print(dft.shape)


(4620, 24)
(4509, 24)


In [35]:
# Reduce to the section/person pair and switch to the export column names.
teaching_names = {
    "course_section_id": "course_section_integration_id",
    "PERSON_CODE_ID": "user_integration_id",
}
section_person = dft.loc[:, ["course_section_id", "PERSON_CODE_ID"]]
dft = section_person.rename(columns=teaching_names)


In [36]:
# Sanity check of the renamed two-column teaching frame.
print(dft.shape)
dft.head()


(4509, 2)


Unnamed: 0,course_section_integration_id,user_integration_id
0,ACC 101.LEC.2011.Fall.01,P000000065
1,ACC 101.LEC.2011.Fall.02,P000000065
2,ACC 101.LEC.2011.Fall.03,P000000065
3,ACC 101.LEC.2011.Spring.01,P000000065
4,ACC 101.LEC.2011.Spring.02,P000000065


In [37]:
# Every exported row gets the same role and availability flag (both strings).
dft = dft.assign(user_role="INSTRUCTOR", available_ind="1")


In [38]:
# Sanity check after adding the constant user_role / available_ind columns.
print(dft.shape)
dft.head()


(4509, 4)


Unnamed: 0,course_section_integration_id,user_integration_id,user_role,available_ind
0,ACC 101.LEC.2011.Fall.01,P000000065,INSTRUCTOR,1
1,ACC 101.LEC.2011.Fall.02,P000000065,INSTRUCTOR,1
2,ACC 101.LEC.2011.Fall.03,P000000065,INSTRUCTOR,1
3,ACC 101.LEC.2011.Spring.01,P000000065,INSTRUCTOR,1
4,ACC 101.LEC.2011.Spring.02,P000000065,INSTRUCTOR,1


In [39]:
# Stable output order: by section, then by person.
dft = dft.sort_values(
    by=["course_section_integration_id", "user_integration_id"]
)


In [40]:
# Write the teaching export to the working directory as
# <YYYYMMDD>_teaching.txt (comma-separated, without the index column).
today = date.today().strftime("%Y%m%d")
fn_output = f"{today}_teaching.txt"
dft.to_csv(path_or_buf=fn_output, index=False)
