In [10]:
import numpy as np
import pandas as pd
from datetime import date


In [11]:
# Open the database connection through the project-local `local_db` module.
# `connection` is the handle later passed to pd.read_sql_query.
# NOTE(review): no visible cell closes this connection — confirm whether
# local_db manages its lifetime or an explicit close is needed at the end.
import local_db
connection = local_db.connection()


In [12]:
# utility functions
import util


In [14]:
# Capture the run date once; today_str (YYYYMMDD) is the prefix used for the
# output file name at the end of the notebook.
today = date.today()
today_str = f'{today:%Y%m%d}'


In [20]:
# Load every primary academic record (PRIMARY_FLAG = 'Y') that carries credit.
# The statement is built from adjacent string literals, which Python joins at
# compile time, so no `+` or line-continuation characters are needed.
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, ACADEMIC_SESSION, "
    "CREDITS, PRIMARY_FLAG, CLASS_LEVEL "
    "FROM ACADEMIC WHERE "
    "PRIMARY_FLAG = 'Y' "
    "AND CREDITS > 0 "
)
df_aca = pd.read_sql_query(sql=sql_str, con=connection)

# Fix the column order explicitly rather than relying on the driver's order.
df_aca = df_aca.loc[:, [
    'PEOPLE_CODE_ID',
    'ACADEMIC_YEAR',
    'ACADEMIC_TERM',
    'ACADEMIC_SESSION',
    'CREDITS',
    'PRIMARY_FLAG',
    'CLASS_LEVEL',
]]

print(df_aca.shape)

(81405, 7)


In [32]:
# keep records for active students
# `util.apply_active` is defined outside this notebook; presumably it filters
# the frame down to currently active students — verify against util.py.
# In the recorded run the row count drops from 81405 to 9830 here.
df = util.apply_active(in_df=df_aca)

print(df.shape)

(9830, 7)


In [33]:
# Reduce to one row per student: the record from their most recent term.
# Terms are ranked within a year as SPRING < SUMMER < FALL and combined with
# the year into one sortable key: term_seq = ACADEMIC_YEAR * 100 + seq.

# Remove helper columns left over from an earlier run of this cell; otherwise
# the merge below would emit suffixed duplicates (term_x, seq_y) and the
# df['seq'] lookup would fail when the cell is re-executed.
df = df.drop(columns=['term', 'seq', 'term_seq'], errors='ignore')

# .copy() makes the filtered frame independent of its parent, so the column
# assignment below does not trigger SettingWithCopyWarning.
df = df[df['ACADEMIC_YEAR'].notna()].copy()
print(df.shape)
print(df.head())
df['ACADEMIC_YEAR'] = pd.to_numeric(df['ACADEMIC_YEAR'], errors='coerce')

# Lookup table giving each term its position within the academic year.
df_seq = pd.DataFrame([{'term': 'SPRING', 'seq': 1},
                       {'term': 'SUMMER', 'seq': 2},
                       {'term': 'FALL', 'seq': 3}])
df = pd.merge(df, df_seq, left_on='ACADEMIC_TERM', right_on='term', how='left')
df['term_seq'] = df['ACADEMIC_YEAR'] * 100 + df['seq']

# term_seq is NaN when the year could not be parsed or the term is missing
# from df_seq. Such rows cannot be ranked, and a student whose rows are all
# NaN would make idxmax() yield NaN (or raise), breaking the .loc lookup,
# so they are excluded from the ranking and reported instead.
unranked = df['term_seq'].isna()
if unranked.any():
    print(f'{unranked.sum()} rows ignored: unparseable year or unknown term')

# idxmax() returns the row label of each student's highest term_seq; the
# merge above produced a fresh RangeIndex, so the labels address df directly.
latest_idx = df[~unranked].groupby('PEOPLE_CODE_ID')['term_seq'].idxmax()
df = df.loc[latest_idx]

print(df.shape)
print(df.head())


(9830, 7)
  PEOPLE_CODE_ID ACADEMIC_YEAR ACADEMIC_TERM ACADEMIC_SESSION  CREDITS  \
0     P000000006          2000        SPRING                       3.0   
1     P000000006          2000        SPRING             MAIN      3.0   
2     P000000006          2011        SUMMER                       3.0   
3     P000000006          2011        SUMMER             CULN      3.0   
4     P000000006          2018        SPRING                       3.0   

  PRIMARY_FLAG CLASS_LEVEL  
0            Y        FRES  
1            Y        FRES  
2            Y        FRES  
3            Y        FRES  
4            Y        FRES  
(1199, 10)
   PEOPLE_CODE_ID  ACADEMIC_YEAR ACADEMIC_TERM ACADEMIC_SESSION  CREDITS  \
4      P000000006         2018.0        SPRING                       3.0   
23     P000003107         2017.0          FALL                      15.0   
39     P000021089         2017.0          FALL                       3.0   
47     P000023459         2018.0        SPRING          

In [34]:
# Sanity check: list the distinct term_seq keys (year * 100 + term position)
# among the rows kept as each student's latest term.
df['term_seq'].unique()

array([201801., 201703., 201802., 201701., 201803., 201702.])

In [None]:
# Rename the source columns.
# The query selects the credit column as CREDITS (plural). rename() silently
# skips labels that are not present, so the key has to match exactly or the
# `credits` column is never produced.
# EVENT_MED_NAME is not selected by the query in this notebook, so that entry
# is a no-op here; it is kept in case the query is extended.
df = df.rename(columns={
    'PEOPLE_CODE_ID': 'student_integration_id',
    'CREDITS': 'credits',
    'EVENT_MED_NAME': 'course_title',
    'ACADEMIC_YEAR': 'term_year',
    'ACADEMIC_TERM': 'term_season',
})


In [None]:
# Keep only the columns intended for export.
# NOTE(review): none of the visible cells creates 'prereq_group_identifier'
# (the query, merge and rename produce no such column), so this selection
# raises KeyError unless the column is added elsewhere — confirm the intended
# source column before running.
df = df.loc[:, ['student_integration_id', 'prereq_group_identifier',
               ]]


In [None]:
# Order the rows by the identifying pair, then collapse repeats of that pair
# down to a single row (the last one in sorted order).
dedupe_keys = ['student_integration_id', 'prereq_group_identifier']
df = df.sort_values(dedupe_keys).drop_duplicates(subset=dedupe_keys, keep='last')


In [None]:
# Write the result as comma-separated text, without the index column, to a
# file whose name starts with the run date (YYYYMMDD).
fn_output = today_str + '_student_prereq_groups.txt'
df.to_csv(path_or_buf=fn_output, index=False)
