In [1]:
import os
from datetime import date, datetime
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


In [2]:
# Local connection information.  Credentials come from the environment;
# indexing os.environ (rather than .get) fails immediately with the missing
# variable's name instead of building a URL around the string 'None'.
db_user = os.environ['DB_USER']
db_pass = os.environ['DB_PASS']
# Percent-encode both values so characters such as '@', ':' or '/' in a
# password cannot be mistaken for URL delimiters.
engine = create_engine(
    f"mssql+pyodbc://{quote(db_user, safe='')}:{quote(db_pass, safe='')}"
    '@PSC-SQLProd/Campus6?'
    'driver=ODBC+Driver+13+for+SQL+Server')
connection = engine.connect()


In [3]:
# Earliest ACADEMIC_YEAR to extract.  Kept as a string because it is
# interpolated into the SQL text that queries SECTIONS.
sections_begin_year = '2011'


In [4]:
# Pull every section from the starting year onward, skipping the 'ADV'
# sub-type and keeping only the listed terms and sessions.
sql_str = (
    "SELECT * FROM SECTIONS WHERE "
    "EVENT_SUB_TYPE NOT IN ('ADV') "
    f"AND ACADEMIC_YEAR >= '{sections_begin_year}' "
    "AND ACADEMIC_TERM IN ('FALL', 'SPRING', 'SUMMER') "
    "AND ACADEMIC_SESSION IN ('MAIN', 'CULN', 'EXT', 'FNRR', 'HEOP',"
    " 'SLAB', 'BLOCK A', 'BLOCK AB', 'BLOCK B') "
)
df_sections = pd.read_sql_query(sql=sql_str, con=connection)


In [5]:
# Columns carried forward from the raw SECTIONS extract.
keep_cols = [
    'EVENT_ID', 'EVENT_SUB_TYPE', 'EVENT_MED_NAME',
    'SECTION', 'CREDITS', 'MAX_PARTICIPANT',
    'ACADEMIC_YEAR', 'ACADEMIC_TERM', 'ACADEMIC_SESSION',
    'START_DATE', 'END_DATE', 'CIP_CODE',
    'REVISION_DATE', 'REVISION_TIME',
]
df = df_sections[keep_cols]

# Show which terms and sessions actually came back from the query.
for col in ('ACADEMIC_TERM', 'ACADEMIC_SESSION'):
    print(f'{col}: ', df[col].unique())

In [None]:
# Sanity check: dimensions and first rows of the selected columns.
print(df.shape)
df.head()

In [6]:
# Drop rows whose EVENT_ID contains 'REG' or 'STDY', ignoring letter case.
for token in ('REG', 'STDY'):
    df = df[~df['EVENT_ID'].str.contains(token, case=False)]


In [7]:
# Dimensions after removing the 'REG' and 'STDY' events.
print(df.shape)


(4855, 14)


In [8]:
# Source column name -> export column name.
column_map = {
    'EVENT_MED_NAME': 'course_section_name',
    'CREDITS': 'credit_hours',
    'MAX_PARTICIPANT': 'maximum_enrollment_count',
    'START_DATE': 'start_dt',
    'END_DATE': 'end_dt',
    'CIP_CODE': 'course_cip_code',
}
df = df.rename(columns=column_map)


In [9]:
def crs_id(c):
    """Course id for one row.

    EVENT_ID with spaces removed; 'LAB' and 'SI' rows additionally get the
    lower-cased sub-type appended.
    """
    base = str(c['EVENT_ID']).replace(' ', '')
    if c['EVENT_SUB_TYPE'] in ('LAB', 'SI'):
        return base + str(c['EVENT_SUB_TYPE']).lower()
    return base


df.loc[:, 'course_id'] = df.apply(crs_id, axis=1)


In [10]:
# Section key: EVENT_ID.EVENT_SUB_TYPE.ACADEMIC_YEAR.Term.SECTION, with the
# term in title case.
id_parts = [
    df['EVENT_ID'],
    df['EVENT_SUB_TYPE'],
    df['ACADEMIC_YEAR'],
    df['ACADEMIC_TERM'].str.title(),
    df['SECTION'],
]
section_key = id_parts[0]
for part in id_parts[1:]:
    section_key = section_key + '.' + part
df.loc[:, 'course_section_id'] = section_key

# The integration id is the same value as the section id.
df.loc[:, 'integration_id'] = df['course_section_id']


In [11]:
def term_id(c):
    """Term id for one row.

    'MAIN' sessions give YEAR.Term; every other session gives
    YEAR.Term.SESSION.  The term is title-cased.
    """
    year_term = c['ACADEMIC_YEAR'] + '.' + str(c['ACADEMIC_TERM']).title()
    if c['ACADEMIC_SESSION'] == 'MAIN':
        return year_term
    return year_term + '.' + c['ACADEMIC_SESSION']


df.loc[:, 'term_id'] = df.apply(term_id, axis=1)


In [12]:
# Numeric academic year; values that fail to parse fall back to the first
# year of the extract.  The fallback is cast to int so fillna() does not mix
# a string into the numeric column ahead of the int64 cast.
df['AY'] = (pd.to_numeric(df['ACADEMIC_YEAR'], errors='coerce')
              .fillna(int(sections_begin_year)).astype(np.int64))


def cat_yr(c):
    """Catalog year for one row: AY for 'FALL' terms, AY - 1 otherwise."""
    return c['AY'] if c['ACADEMIC_TERM'] == 'FALL' else c['AY'] - 1


df.loc[:, 'catalog_year'] = df.apply(cat_yr, axis=1)


In [13]:
def crs_sect_delv(c):
    """Delivery code from the first two characters of SECTION.

    'HY' -> '03', 'ON' -> '02', anything else -> '01'.
    """
    prefix = str(c['SECTION'])[:2]
    return {'HY': '03', 'ON': '02'}.get(prefix, '01')


df.loc[:, 'course_section_delivery'] = df.apply(crs_sect_delv, axis=1)


In [None]:
# Inspect the frame after adding course_section_delivery.
print(df.shape)
df.head()

In [None]:
# Spot-check the rows whose SECTION starts with 'HY' (delivery code '03').
df[(df['SECTION'].str[:2]=='HY')]

In [14]:
def crs_integ_id(c):
    """Course integration id for one row.

    EVENT_ID.catalog_year when EVENT_SUB_TYPE is the empty string, otherwise
    EVENT_ID.EVENT_SUB_TYPE.catalog_year.
    """
    pieces = [c['EVENT_ID']]
    if c['EVENT_SUB_TYPE'] != '':
        pieces.append(c['EVENT_SUB_TYPE'])
    pieces.append(str(c['catalog_year']))
    return '.'.join(pieces)


df.loc[:, 'course_integration_id'] = df.apply(crs_integ_id, axis=1)


In [None]:
# Inspect the frame after adding course_integration_id.
print(df.shape)
df.head()

In [15]:
# Load the course catalog export; it supplies the catalog integration ids
# that sections are matched against to find the correct catalog year.
catalog_path = '../course_catalog/course_catalog.txt'
dfcat = pd.read_csv(catalog_path)
print(dfcat.shape)

(652, 8)


In [None]:
# Preview the catalog as loaded, before any columns are dropped.
dfcat.head()

In [16]:
# Keep the join key and the catalog's integration id, renamed so it does not
# collide with the section frame's own integration_id column.
dfcat = (dfcat.loc[:, ['course_id', 'integration_id']]
         .rename(columns={'integration_id': 'cat_integ_id'}))

In [None]:
# Confirm the catalog frame now holds only course_id and cat_integ_id.
print(dfcat.shape)
dfcat.head()

In [17]:
# Left join: every section row is kept and paired with each catalog row that
# shares its course_id.
df = df.merge(dfcat, how='left', on=['course_id'])

In [None]:
# Dimensions after the left merge; the row count can grow if a course_id
# matches more than one catalog row.
print(df.shape)
df.head()

In [18]:
df = df.sort_values(by=['integration_id', 'course_integration_id'])

# Keep only pairings whose catalog id does not sort after the section's own
# course_integration_id (plain string comparison of the two id columns).
not_after_course = df['course_integration_id'] >= df['cat_integ_id']
df = df.loc[not_after_course]

In [None]:
# Inspect the rows that survived the catalog comparison.
print(df.shape)
df.head(40)

In [None]:
# Order rows by section id, then by course integration id (both ascending).
sort_keys = ['course_section_id', 'course_integration_id']
df = df.sort_values(by=sort_keys)


In [None]:
# Inspect the re-sorted frame.
print(df.shape)
df.head(40)

In [None]:
# List rows whose integration_id repeats an earlier row.
df[df.duplicated(['integration_id'])]

In [20]:
print(df.shape)
# One row per section.  course_integration_id is built from the section's own
# fields, so it is identical on every duplicate of a course_section_id and
# cannot rank them.  Order by the catalog id instead so keep='last' retains
# the highest-sorting catalog match; na_position='first' ensures a missing
# catalog id is never the row that is kept when a real match exists.
df = (df.sort_values(['course_section_id', 'cat_integ_id'],
                     ascending=[True, True], na_position='first')
      .drop_duplicates(['course_section_id'], keep='last')
      )
print(df.shape)

(4414, 23)
(4414, 23)


In [None]:
# Inspect the de-duplicated frame.
print(df.shape)
df.head(40)

In [21]:
# Replace the section-derived id with the matched catalog's integration id.
df.loc[:, 'course_integration_id'] = df['cat_integ_id']

In [None]:
# Inspect the frame after copying cat_integ_id into course_integration_id.
print(df.shape)
df.head(40)

In [22]:
# Export schema: the order here is the column order of the output file.
export_cols = [
    'integration_id', 'course_section_name', 'course_section_id',
    'start_dt', 'end_dt', 'term_id', 'course_integration_id',
    'course_section_delivery', 'maximum_enrollment_count',
    'credit_hours',
]
df = df.loc[:, export_cols].sort_values(by=['integration_id'])


In [23]:
# Final dimensions and preview of the frame that will be exported.
print(df.shape)
df.head()

(4414, 10)


Unnamed: 0,integration_id,course_section_name,course_section_id,start_dt,end_dt,term_id,course_integration_id,course_section_delivery,maximum_enrollment_count,credit_hours
0,ACC 101.LEC.2011.Fall.01,Financial Accounting,ACC 101.LEC.2011.Fall.01,2011-08-31,2011-12-16,2011.Fall,ACC 101.LEC.2010,1,35,3.0
1,ACC 101.LEC.2011.Fall.02,Financial Accounting,ACC 101.LEC.2011.Fall.02,2011-08-31,2011-12-16,2011.Fall,ACC 101.LEC.2010,1,36,3.0
2,ACC 101.LEC.2011.Fall.03,Financial Accounting,ACC 101.LEC.2011.Fall.03,2011-08-31,2011-12-16,2011.Fall,ACC 101.LEC.2010,1,30,3.0
414,ACC 101.LEC.2011.Spring.01,Financial Accounting,ACC 101.LEC.2011.Spring.01,2011-01-24,2011-05-11,2011.Spring,ACC 101.LEC.2010,1,37,3.0
415,ACC 101.LEC.2011.Spring.02,Financial Accounting,ACC 101.LEC.2011.Spring.02,2011-01-24,2011-05-11,2011.Spring,ACC 101.LEC.2010,1,40,3.0


In [24]:
# Write the result to <YYYYMMDD>_sections.txt in the current working
# directory, comma-separated and without the index column.
today = date.today().strftime('%Y%m%d')
fn_output = today + '_sections.txt'
df.to_csv(fn_output, index=False)
