In [1]:
import os
from datetime import date, datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


In [2]:
# Local connection information.
# Credentials come from the environment so they are never stored in the
# notebook itself.
db_user = os.environ.get('DB_USER')
db_pass = os.environ.get('DB_PASS')
if db_user is None or db_pass is None:
    # Without this guard the URL below would contain the literal text 'None'
    # and the problem would only show up later as a failed connection.
    raise RuntimeError(
        'DB_USER and DB_PASS environment variables must both be set')

# URL-encode both credentials: characters such as '@', ':' or '/' would
# otherwise be parsed as part of the URL structure.
# NOTE(review): assumes the environment holds the raw (not already
# URL-encoded) credentials -- confirm.
engine = create_engine(
    f'mssql+pyodbc://{quote_plus(db_user)}:{quote_plus(db_pass)}'
    '@PSC-SQLProd/Campus6?'
    'driver=ODBC+Driver+13+for+SQL+Server'
)
connection = engine.connect()


In [3]:
# First ACADEMIC_YEAR to pull from SECTIONS. Kept as a string because it is
# interpolated into the SQL text; it is also reused later as the fallback
# value when ACADEMIC_YEAR cannot be parsed as a number.
sections_begin_year = '2011'


In [4]:
# Read SECTIONS rows from sections_begin_year onward, skipping the 'ADV'
# sub type and keeping only the listed terms and sessions.
sql_str = (
    "SELECT * FROM SECTIONS WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sections = pd.read_sql_query(sql=sql_str, con=connection)


In [5]:
# Narrow the query result to the columns listed below, then show which
# term and session values actually came back.
df = df_sections.loc[:, [
    'EVENT_ID', 'EVENT_SUB_TYPE', 'EVENT_MED_NAME',
    'SECTION', 'CREDITS', 'MAX_PARTICIPANT',
    'ACADEMIC_YEAR', 'ACADEMIC_TERM', 'ACADEMIC_SESSION',
    'START_DATE', 'END_DATE', 'CIP_CODE',
    'REVISION_DATE', 'REVISION_TIME',
]]

print('ACADEMIC_TERM: ', pd.unique(df['ACADEMIC_TERM']))
print('ACADEMIC_SESSION: ', pd.unique(df['ACADEMIC_SESSION']))

In [6]:
# Sanity check: dimensions and first rows of the column subset.
print(df.shape)
df.head()

(5049, 14)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,EVENT_MED_NAME,SECTION,CREDITS,MAX_PARTICIPANT,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,START_DATE,END_DATE,CIP_CODE,REVISION_DATE,REVISION_TIME
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,2011-12-16,,2013-08-19,1900-01-01 12:26:41.477
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,2011-12-16,,2012-05-07,1900-01-01 13:59:40.743
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,2011-12-16,,2011-12-16,1900-01-01 17:22:31.393
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,2011-12-16,,2011-09-28,1900-01-01 13:25:25.420
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,2011-12-16,,2012-01-19,1900-01-01 12:47:48.237


In [7]:
# Drop rows whose EVENT_ID contains 'REG' or 'STDY' (case-insensitive).
# A single alternation pattern replaces the two sequential passes, and
# na=False treats a missing EVENT_ID as "no match" so the ~ negation
# cannot fail on NaN entries in the mask.
df = df[~df['EVENT_ID'].str.contains('REG|STDY', case=False, na=False)]


In [8]:
# Row count after removing the REG / STDY events.
print(df.shape)


(4855, 14)


In [9]:
# Rename the source columns to the names used in the output file.
df = df.rename(
    {
        'EVENT_MED_NAME': 'course_section_name',
        'CREDITS': 'credit_hours',
        'MAX_PARTICIPANT': 'maximum_enrollment_count',
        'START_DATE': 'start_dt',
        'END_DATE': 'end_dt',
        'CIP_CODE': 'course_cip_code',
    },
    axis='columns',
)


In [10]:
# course_id is EVENT_ID with every space removed (e.g. 'ACC 101' -> 'ACC101').
df.loc[:, 'course_id'] = df['EVENT_ID'].str.replace(' ', '')


In [11]:
# Section identifier: EVENT_ID, sub type, year, title-cased term and section
# joined with dots. EVENT_ID is used as-is here (spaces are not removed).
df.loc[:, 'course_section_id'] = (
    df['EVENT_ID'] + '.' + df['EVENT_SUB_TYPE'] + '.' + df['ACADEMIC_YEAR']
    + '.' + df['ACADEMIC_TERM'].str.title() + '.' + df['SECTION']
)
# integration_id is an exact copy of the section identifier.
df.loc[:, 'integration_id'] = df['course_section_id']


In [12]:
def term_id(c):
    """Build the term identifier for one section row.

    MAIN-session rows yield '<ACADEMIC_YEAR>.<Term>' with the term
    title-cased; for any other session the ACADEMIC_SESSION value is
    appended as a third dot-separated component.
    """
    is_main = c['ACADEMIC_SESSION'] == 'MAIN'
    year_term = c['ACADEMIC_YEAR'] + '.' + str(c['ACADEMIC_TERM']).title()
    if is_main:
        return year_term
    return year_term + '.' + c['ACADEMIC_SESSION']


df.loc[:, 'term_id'] = df.apply(term_id, axis=1)


In [13]:
# Numeric academic year. Values that cannot be parsed fall back to the first
# year requested from SECTIONS. The fallback is converted to int so that a
# str is never mixed into the coerced numeric column before the int64 cast.
df['AY'] = (pd.to_numeric(df['ACADEMIC_YEAR'], errors='coerce')
              .fillna(int(sections_begin_year)).astype(np.int64))


def cat_yr(c):
    """Return the catalog year for one section row.

    FALL rows use the academic year itself; every other term uses the
    preceding year.
    """
    if c['ACADEMIC_TERM'] == 'FALL':
        return c['AY']
    return c['AY'] - 1


df.loc[:, 'catalog_year'] = df.apply(cat_yr, axis=1)


In [14]:
def crs_sect_delv(c):
    """Map one section row to a delivery code from its SECTION prefix.

    The first two characters of SECTION decide the code:
    'HY' -> '03', 'ON' -> '02', anything else -> '01'.
    NOTE(review): presumably hybrid / online / in-person -- confirm.
    """
    prefix = str(c['SECTION'])[:2]
    return {'HY': '03', 'ON': '02'}.get(prefix, '01')


df.loc[:, 'course_section_delivery'] = df.apply(crs_sect_delv, axis=1)


In [15]:
# Inspect the frame after adding the id, term, year and delivery columns.
print(df.shape)
df.head()

(4855, 21)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,course_cip_code,REVISION_DATE,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,,2013-08-19,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,,2012-05-07,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,,2011-12-16,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,...,,2011-09-28,1900-01-01 13:25:25.420,ACC201,ACC 201.LEC.2011.Fall.01,ACC 201.LEC.2011.Fall.01,2011.Fall,2011,2011,1
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,...,,2012-01-19,1900-01-01 12:47:48.237,ACC301,ACC 301.LEC.2011.Fall.01,ACC 301.LEC.2011.Fall.01,2011.Fall,2011,2011,1


In [None]:
# Exploratory check: show the rows whose SECTION starts with 'HY'.
df.loc[df['SECTION'].str[:2] == 'HY']

In [16]:
def crs_integ_id(c):
    """Build the course integration id for one section row.

    The id is '<course_id>.<EVENT_SUB_TYPE>.<catalog_year>'; when
    EVENT_SUB_TYPE is the empty string the middle component is omitted.
    """
    year = str(c['catalog_year'])
    if c['EVENT_SUB_TYPE'] == '':
        return c['course_id'] + '.' + year
    return c['course_id'] + '.' + c['EVENT_SUB_TYPE'] + '.' + year


df.loc[:, 'course_integration_id'] = df.apply(crs_integ_id, axis=1)


In [18]:
# Inspect the frame after adding course_integration_id.
print(df.shape)
df.head()

(4855, 22)


Unnamed: 0,EVENT_ID,EVENT_SUB_TYPE,course_section_name,SECTION,credit_hours,maximum_enrollment_count,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,start_dt,...,REVISION_DATE,REVISION_TIME,course_id,course_section_id,integration_id,term_id,AY,catalog_year,course_section_delivery,course_integration_id
0,ACC 101,LEC,Financial Accounting,1,3.0,35,2011,FALL,MAIN,2011-08-31,...,2013-08-19,1900-01-01 12:26:41.477,ACC101,ACC 101.LEC.2011.Fall.01,ACC 101.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC101.LEC.2011
1,ACC 101,LEC,Financial Accounting,2,3.0,36,2011,FALL,MAIN,2011-08-31,...,2012-05-07,1900-01-01 13:59:40.743,ACC101,ACC 101.LEC.2011.Fall.02,ACC 101.LEC.2011.Fall.02,2011.Fall,2011,2011,1,ACC101.LEC.2011
2,ACC 101,LEC,Financial Accounting,3,3.0,30,2011,FALL,MAIN,2011-08-31,...,2011-12-16,1900-01-01 17:22:31.393,ACC101,ACC 101.LEC.2011.Fall.03,ACC 101.LEC.2011.Fall.03,2011.Fall,2011,2011,1,ACC101.LEC.2011
3,ACC 201,LEC,Small Bus Acct,1,3.0,0,2011,FALL,MAIN,2011-08-31,...,2011-09-28,1900-01-01 13:25:25.420,ACC201,ACC 201.LEC.2011.Fall.01,ACC 201.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC201.LEC.2011
4,ACC 301,LEC,Small Business Acc,1,3.0,25,2011,FALL,MAIN,2011-08-31,...,2012-01-19,1900-01-01 12:47:48.237,ACC301,ACC 301.LEC.2011.Fall.01,ACC 301.LEC.2011.Fall.01,2011.Fall,2011,2011,1,ACC301.LEC.2011


In [17]:
# Read course_catalog.txt to find the correct catalog year.
# The path is relative to the notebook's working directory.
dfcat = pd.read_csv('../course_catalog/course_catalog.txt')
print(dfcat.shape)

(653, 8)


In [19]:
# Peek at the first rows of the course catalog.
dfcat.head()

Unnamed: 0,integration_id,course_id,course_name,default_credit_hours,Level,description,status,catalog_year
0,ACC101.LEC.2010,ACC101,Financial Accounting,3.0,1,Students utilize the rules of debits/credits i...,ACTIVE,2010
1,ACC102.LEC.2010,ACC102,Managerial Accounting,3.0,1,Study of the principles of financial accountin...,ACTIVE,2010
2,ACC301.LEC.2011,ACC301,Small Business Accounting,3.0,3,This course will familiarize students with acc...,ACTIVE,2011
3,AR100.2011,AR100,Analytical Reasoning Foundational,3.0,1,,ACTIVE,2011
4,AR101.2011,AR101,Analytical Reasoning Reinforcing,3.0,1,,ACTIVE,2011


In [20]:
# Keep only the join key and the catalog's integration id, renaming the
# latter so it does not collide with the sections frame's integration_id.
dfcat = (
    dfcat.loc[:, ['course_id', 'integration_id']]
    .rename(columns={'integration_id': 'cat_integ_id'})
)

In [21]:
# Confirm only course_id and the renamed cat_integ_id remain.
print(dfcat.shape)
dfcat.head()

(653, 2)


Unnamed: 0,course_id,cat_integ_id
0,ACC101,ACC101.LEC.2010
1,ACC102,ACC102.LEC.2010
2,ACC301,ACC301.LEC.2011
3,AR100,AR100.2011
4,AR101,AR101.2011


In [None]:
# Attach every catalog integration id that shares the section's course_id.
# A left join keeps sections with no catalog match (cat_integ_id is NaN).
df = df.merge(dfcat, how='left', on='course_id')

In [None]:
# Shape and sample after the left merge with the catalog.
print(df.shape)
df.head()

In [None]:
# Order by section id, then by the computed course integration id
# (ascending on both keys, which is the default).
df = df.sort_values(by=['integration_id', 'course_integration_id'])

# Keep only catalog entries whose id does not sort after the section's own
# course integration id (plain string comparison).
# NOTE(review): rows with a missing cat_integ_id presumably compare False
# and are dropped here -- confirm that is intended.
df = df[df['cat_integ_id'] <= df['course_integration_id']]

In [None]:
# Shape and sample after filtering on the catalog integration id.
print(df.shape)
df.head()

In [None]:
# Re-sort by section, then list the rows that repeat an integration_id
# (the first occurrence of each id is not flagged).
df = df.sort_values(by=['course_section_id', 'course_integration_id'])
df.loc[df.duplicated(subset='integration_id')]


In [None]:
# Collapse to one row per course_section_id, keeping the last row in sort
# order; the shape is printed before and after to show how many rows go.
print(df.shape)
df = (
    df.sort_values(by=['course_section_id', 'course_integration_id'])
    .drop_duplicates(subset='course_section_id', keep='last')
)
print(df.shape)

In [None]:
# Final output layout: select the export columns in order, then sort the
# rows by integration_id.
df = df[[
    'integration_id', 'course_section_name', 'course_section_id',
    'start_dt', 'end_dt', 'term_id', 'course_integration_id',
    'course_section_delivery', 'maximum_enrollment_count',
    'credit_hours',
]].sort_values(by='integration_id')


In [None]:
# Last look at the export frame before it is written to disk.
print(df.shape)
df.head()

In [None]:
# Write the result as CSV into the working directory; the file name is
# prefixed with today's date as YYYYMMDD and the index is not written.
today = date.today().strftime('%Y%m%d')
fn_output = today + '_sections.txt'
df.to_csv(path_or_buf=fn_output, index=False)
