In [30]:
import numpy as np
import pandas as pd
import os
from datetime import date, datetime
from sqlalchemy import create_engine


In [31]:
# Local connection information.
# Credentials are read from the DB_USER / DB_PASS environment variables so
# they never appear in the notebook itself.
db_user = os.environ.get('DB_USER')
db_pass = os.environ.get('DB_PASS')
if db_user is None or db_pass is None:
    # Without this check the literal text 'None' would be interpolated into
    # the URL and the problem would only surface later as a login failure.
    raise RuntimeError(
        'DB_USER and DB_PASS environment variables must both be set')

# NOTE(review): the values are placed in the URL verbatim, so characters such
# as '@', ':' or '/' must already be URL-escaped in the environment variables.
engine = create_engine(f'mssql+pyodbc://{db_user}:{db_pass}' +
                       '@PSC-SQLProd/Campus6?' +
                       'driver=ODBC+Driver+13+for+SQL+Server')
connection = engine.connect()


In [32]:
# Earliest ACADEMIC_YEAR to extract. Kept as a string because it is
# interpolated into the quoted SQL filters and also used as a fill value
# when the academic year is converted to a number.
sections_begin_year = '2011'


In [33]:
# Fetch SECTIONS rows from the first year of interest onward, excluding the
# 'ADV' sub type and restricted to the listed terms and sessions.
sql_str = (
    "SELECT * FROM SECTIONS WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sections = pd.read_sql_query(sql=sql_str, con=connection)


In [34]:
# Narrow the SECTIONS extract to the columns of interest.
df = df_sections[[
    'EVENT_ID',
    'EVENT_SUB_TYPE',
    'EVENT_MED_NAME',
    'SECTION',
    'CREDITS',
    'MAX_PARTICIPANT',
    'ACADEMIC_YEAR',
    'ACADEMIC_TERM',
    'ACADEMIC_SESSION',
    'START_DATE',
    'END_DATE',
    'CIP_CODE',
    'REVISION_DATE',
    'REVISION_TIME',
]]

# Show which term and session codes actually came back from the query.
print('ACADEMIC_TERM: ', pd.unique(df['ACADEMIC_TERM']))
print('ACADEMIC_SESSION: ', pd.unique(df['ACADEMIC_SESSION']))

In [None]:
# Sanity check: dimensions of the column-trimmed frame and its first rows.
print(df.shape)
df.head()

In [35]:
# Discard every section whose EVENT_ID contains 'REG' or 'STDY'
# (case-insensitive, anywhere in the id).
df = df[~df['EVENT_ID'].str.contains('REG|STDY', case=False, regex=True)]


In [36]:
# Row/column count after the REG and STDY events were removed.
print(df.shape)


(4855, 14)


In [37]:
# Map the source column names onto the names used in the sections output.
df = df.rename(columns=dict(
    EVENT_MED_NAME='course_section_name',
    CREDITS='credit_hours',
    MAX_PARTICIPANT='maximum_enrollment_count',
    START_DATE='start_dt',
    END_DATE='end_dt',
    CIP_CODE='course_cip_code',
))


In [38]:
def crs_id(c):
    """Return the course id for one section row.

    Spaces are stripped from EVENT_ID (e.g. 'ACC 101' -> 'ACC101'); LAB and
    SI sections additionally get the lower-cased sub type appended.
    """
    compact_id = str(c['EVENT_ID']).replace(' ', '')
    if c['EVENT_SUB_TYPE'] in ('LAB', 'SI'):
        return compact_id + str(c['EVENT_SUB_TYPE']).lower()
    return compact_id


df.loc[:, 'course_id'] = df.apply(crs_id, axis=1)


In [39]:
# Section key of the form EVENT_ID.SUB_TYPE.YEAR.Term.SECTION,
# e.g. 'ACC 101.LEC.2011.Fall.01'.
df.loc[:, 'course_section_id'] = df['EVENT_ID'].str.cat(
    [
        df['EVENT_SUB_TYPE'],
        df['ACADEMIC_YEAR'],
        df['ACADEMIC_TERM'].str.title(),
        df['SECTION'],
    ],
    sep='.',
)
# The integration id carries the same value as the section id.
df.loc[:, 'integration_id'] = df['course_section_id']


In [40]:
def term_id(c):
    """Return the term id for one section row.

    MAIN sessions yield 'YEAR.Term'; every other session appends its code,
    yielding 'YEAR.Term.SESSION'.
    """
    year_term = c['ACADEMIC_YEAR'] + '.' + str(c['ACADEMIC_TERM']).title()
    if c['ACADEMIC_SESSION'] == 'MAIN':
        return year_term
    return year_term + '.' + c['ACADEMIC_SESSION']


df.loc[:, 'term_id'] = df.apply(term_id, axis=1)


In [41]:
# Integer academic year; values that cannot be parsed fall back to the first
# year of the extract.
df['AY'] = (
    pd.to_numeric(df['ACADEMIC_YEAR'], errors='coerce')
    .fillna(sections_begin_year)
    .astype(np.int64)
)


def cat_yr(c):
    """Return the catalog year for a row: AY for FALL, otherwise AY - 1."""
    if c['ACADEMIC_TERM'] == 'FALL':
        return c['AY']
    return c['AY'] - 1


df.loc[:, 'catalog_year'] = df.apply(cat_yr, axis=1)


In [42]:
def crs_sect_delv(c):
    """Return the delivery code derived from the first two SECTION characters.

    'HY' -> '03', 'ON' -> '02', anything else -> '01'.
    """
    code_by_prefix = {'HY': '03', 'ON': '02'}
    return code_by_prefix.get(str(c['SECTION'])[:2], '01')


df.loc[:, 'course_section_delivery'] = df.apply(crs_sect_delv, axis=1)


In [None]:
# Inspect the frame now that the id, term, catalog-year and delivery columns
# have been added.
print(df.shape)
df.head()

In [None]:
# Spot check: rows whose SECTION begins with 'HY'.
df.loc[df['SECTION'].str.slice(stop=2).eq('HY')]

In [43]:
def crs_integ_id(c):
    """Return 'EVENT_ID[.SUB_TYPE].catalog_year' for one section row.

    The sub type segment is omitted when EVENT_SUB_TYPE is an empty string.
    """
    year_suffix = '.' + str(c['catalog_year'])
    if c['EVENT_SUB_TYPE'] == '':
        return c['EVENT_ID'] + year_suffix
    return c['EVENT_ID'] + '.' + c['EVENT_SUB_TYPE'] + year_suffix


df.loc[:, 'course_integration_id'] = df.apply(crs_integ_id, axis=1)


In [None]:
# Inspect the frame after course_integration_id was derived.
print(df.shape)
df.head()

In [44]:
# Read course_catalog.txt (parsed with read_csv defaults) so that each
# section can be matched to the correct catalog year.
dfcat = pd.read_csv('../course_catalog/course_catalog.txt')
print(dfcat.shape)

(652, 8)


In [None]:
# Preview the catalog rows exactly as loaded.
dfcat.head()

In [45]:
# Only the join key and the catalog's integration id are needed; the latter
# is renamed so it does not collide with the sections' own integration_id.
dfcat = dfcat[['course_id', 'integration_id']].rename(
    columns={'integration_id': 'cat_integ_id'}
)

In [None]:
# Inspect the catalog frame after it was reduced to course_id / cat_integ_id.
print(dfcat.shape)
dfcat.head()

In [46]:
# Left join on course_id: every section row is kept and is repeated once for
# each catalog row that shares its course_id.
df = df.merge(dfcat, how='left', on='course_id')

In [None]:
# Inspect the frame after the catalog merge added cat_integ_id.
print(df.shape)
df.head()

In [47]:
df = df.sort_values(by=['integration_id', 'course_integration_id'])

# Keep catalog_year before course year: retain a row only when the catalog id
# is less than or equal to the section's course_integration_id.
# NOTE(review): this is a plain string comparison, and rows with no catalog
# match (missing cat_integ_id) presumably fail it and are dropped -- confirm
# that both are intended.
df = df[df['cat_integ_id'] <= df['course_integration_id']]

In [None]:
# Inspect the first 40 rows left after the catalog-year filter.
print(df.shape)
df.head(40)

In [None]:
# Order the rows by section id, then by course_integration_id (both
# ascending) so the repeats of one section sit next to each other.
df = (df.sort_values(['course_section_id', 'course_integration_id'],
                     ascending=[True, True]))
# df[df.duplicated(['integration_id'])]


In [None]:
# Inspect the first 40 rows in section order.
print(df.shape)
df.head(40)

In [None]:
# Rows whose integration_id already appeared earlier in the frame; the first
# occurrence of each id is not listed.
df.loc[df.duplicated(subset=['integration_id'])]

In [48]:
# Collapse the merged frame to one row per course_section_id.
# The catalog merge repeats a section once per matching catalog row, and
# course_integration_id is the same on every one of those repeats, so it
# cannot order them. cat_integ_id is therefore added as the final sort key:
# keep='last' then always selects the greatest remaining cat_integ_id rather
# than whichever catalog row happened to come last in the input file.
print(df.shape)
df = (df.sort_values(['course_section_id', 'course_integration_id',
                      'cat_integ_id'],
                     ascending=[True, True, True])
      .drop_duplicates(['course_section_id'], keep='last')
      )
print(df.shape)

(4915, 23)
(4414, 23)


In [None]:
# Inspect the first 40 rows after duplicates per course_section_id were
# dropped.
print(df.shape)
df.head(40)

In [49]:
# Replace the section-derived course_integration_id with the catalog id that
# was matched to the row.
df.loc[:, 'course_integration_id'] = df.loc[:, 'cat_integ_id']

In [None]:
# Inspect the first 40 rows now that course_integration_id holds the catalog
# id.
print(df.shape)
df.head(40)

In [50]:
# Snapshot the full frame (all columns) for the teaching.txt section below,
# before df is cut down to the sections output columns.
dfs = df.copy()

In [51]:
# Preview the saved copy that the teaching.txt merge will use.
dfs.head()

Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010
414,ACC 101,LEC,Financial Accounting,1,3.0,37,2011,SPRING,MAIN,2011-01-24,...,1900-01-01 16:19:39.223,ACC101,ACC 101.LEC.2011.Spring.01,ACC 101.LEC.2011.Spring.01,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010
415,ACC 101,LEC,Financial Accounting,2,3.0,40,2011,SPRING,MAIN,2011-01-24,...,1900-01-01 11:55:12.630,ACC101,ACC 101.LEC.2011.Spring.02,ACC 101.LEC.2011.Spring.02,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010


In [52]:
# Reduce the frame to the sections output columns, in output order, and sort
# the rows by integration_id.
df = df.loc[:, [
    'integration_id',
    'course_section_name',
    'course_section_id',
    'start_dt',
    'end_dt',
    'term_id',
    'course_integration_id',
    'course_section_delivery',
    'maximum_enrollment_count',
    'credit_hours',
]].sort_values(by='integration_id')


In [None]:
# Final check of the sections frame before it is written out.
print(df.shape)
df.head()

In [22]:
# Write the sections frame to '<YYYYMMDD>_sections.txt' in the working
# directory, without the index column.
today = f'{datetime.now():%Y%m%d}'
fn_output = f'{today}_sections.txt'
df.to_csv(path_or_buf=fn_output, index=False)


## teaching.txt

In [53]:
# Fetch SECTIONPER rows using the same year, sub type, term and session
# filters that were applied to SECTIONS.
sql_str = (
    "SELECT * FROM SECTIONPER WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sectionper = pd.read_sql_query(sql=sql_str, con=connection)


In [54]:
# Keep only the six columns used as merge keys against the sections frame,
# plus PERSON_CODE_ID.
df_sectionper = df_sectionper[['ACADEMIC_YEAR', 'ACADEMIC_TERM', 'ACADEMIC_SESSION',
                               'EVENT_ID', 'EVENT_SUB_TYPE', 'SECTION',
                               'PERSON_CODE_ID',
                               ]]

In [56]:
# Compare the sizes of the two merge inputs: SECTIONPER rows vs. the saved
# sections frame.
print(df_sectionper.shape)
print(dfs.shape)

(4938, 7)
(4414, 23)


In [57]:
# Attach PERSON_CODE_ID to the saved sections frame. The left join keeps
# every section and repeats it once per matching SECTIONPER row.
# NOTE(review): a section with no SECTIONPER match would carry a missing
# PERSON_CODE_ID into the teaching output -- confirm whether that is wanted.
dft = dfs.merge(
    df_sectionper,
    how='left',
    on=[
        'ACADEMIC_YEAR',
        'ACADEMIC_TERM',
        'ACADEMIC_SESSION',
        'EVENT_ID',
        'EVENT_SUB_TYPE',
        'SECTION',
    ],
)


In [58]:
# Inspect the merged frame; the row count can exceed the sections count
# because a section is repeated for each matching SECTIONPER row.
print(dft.shape)
dft.head()


(4619, 24)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id,cat_integ_id,PERSON_CODE_ID
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
3,ACC 101,LEC,Financial Accounting,1,3.0,37,2011,SPRING,MAIN,2011-01-24,...,ACC101,ACC 101.LEC.2011.Spring.01,ACC 101.LEC.2011.Spring.01,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065
4,ACC 101,LEC,Financial Accounting,2,3.0,40,2011,SPRING,MAIN,2011-01-24,...,ACC101,ACC 101.LEC.2011.Spring.02,ACC 101.LEC.2011.Spring.02,2011.Spring,2011,2010,1,ACC 101.LEC.2010,ACC 101.LEC.2010,P000000065


In [60]:
# Reduce to the section/person pair and give both columns their teaching
# output names.
dft = dft[['course_section_id', 'PERSON_CODE_ID']].rename(
    columns={
        'course_section_id': 'course_section_integration_id',
        'PERSON_CODE_ID': 'user_integration_id',
    }
)


In [61]:
# Inspect the two-column teaching frame after the rename.
print(dft.shape)
dft.head()


(4619, 2)


Unnamed: 0,course_section_integration_id,user_integration_id
0,ACC 101.LEC.2011.Fall.01,P000000065
1,ACC 101.LEC.2011.Fall.02,P000000065
2,ACC 101.LEC.2011.Fall.03,P000000065
3,ACC 101.LEC.2011.Spring.01,P000000065
4,ACC 101.LEC.2011.Spring.02,P000000065


In [64]:
# Every row gets the same role and availability flag (both stored as text).
dft['user_role'] = 'INSTRUCTOR'
dft['available_ind'] = '1'

In [65]:
# Inspect the teaching frame with the constant role/availability columns.
print(dft.shape)
dft.head()


(4619, 4)


Unnamed: 0,course_section_integration_id,user_integration_id,user_role,available_ind
0,ACC 101.LEC.2011.Fall.01,P000000065,INSTRUCTOR,1
1,ACC 101.LEC.2011.Fall.02,P000000065,INSTRUCTOR,1
2,ACC 101.LEC.2011.Fall.03,P000000065,INSTRUCTOR,1
3,ACC 101.LEC.2011.Spring.01,P000000065,INSTRUCTOR,1
4,ACC 101.LEC.2011.Spring.02,P000000065,INSTRUCTOR,1


In [66]:
# Order the output by section, then by person within each section.
dft = dft.sort_values(
    by=['course_section_integration_id', 'user_integration_id']
)


In [67]:
# Write the teaching frame to '<YYYYMMDD>_teaching.txt' in the working
# directory, without the index column.
today = f'{datetime.now():%Y%m%d}'
fn_output = f'{today}_teaching.txt'
dft.to_csv(path_or_buf=fn_output, index=False)
