In [1]:
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version
# Report the versions of the libraries imported above.
print(f'IPython - {ipython_version}')
print(f'Pandas - {pandas_version}')
print(f'Bokeh - {bokeh_version}')

IPython - 6.1.0
Pandas - 0.22.0
Bokeh - 0.12.14


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date

<H2>Stage Ranking codes</H2>

In [3]:
# Load the stage-ranking lookup (VWSTAGERANKING.csv), then discard the
# fields that are not used.
unused_stgrnk_fields = ['code_table', 'MEDIUM_DESC',
                        'Converted/Confirmed/Accepted/Require SepDate']
stgrnk = pd.read_csv('VWSTAGERANKING.csv')
stgrnk = stgrnk.drop(unused_stgrnk_fields, axis=1)

In [None]:
# Inspect the lookup table: (rows, columns) followed by per-column dtypes.
print('stgrnk', stgrnk.shape)
print('stgrnk\n', stgrnk.dtypes)

In [None]:
# List the column names that remain after the drop.
print(stgrnk.columns)

In [None]:
# Display the whole stage-ranking lookup table.
stgrnk

<H2>Stage History data</H2>

In [4]:
# Load the stage-history extract (STAGEHISTORY.csv).
# Identifier / term fields are forced to str, FIELD_ID to int64, and the
# three date/time fields are parsed as datetimes.
stg_hist_dtype = dict(PEOPLE_CODE_ID=str, ACADEMIC_YEAR=str,
                      ACADEMIC_TERM=str, ACADEMIC_SESSION=str,
                      FIELD_ID=np.int64)
date_cols = ['FIELD_DATE', 'REVISION_DATE', 'REVISION_TIME']
# Only the typed fields plus the date fields are read from the file.
stg_hist_cols = list(stg_hist_dtype) + date_cols
stg_hist = pd.read_csv('STAGEHISTORY.csv',
                       usecols=stg_hist_cols,
                       dtype=stg_hist_dtype,
                       parse_dates=date_cols)

In [None]:
# Inspect the loaded stage history: (rows, columns) and per-column dtypes.
print('stg_hist', stg_hist.shape)
print('stg_hist')
print(stg_hist.dtypes)

In [None]:
# Build the 'year_term' label: ACADEMIC_YEAR, a dot, then the title-cased
# ACADEMIC_TERM.
term_title = stg_hist['ACADEMIC_TERM'].str.title()
stg_hist['year_term'] = stg_hist['ACADEMIC_YEAR'] + '.' + term_title

In [None]:
# Combining REVISION_DATE and REVISION_TIME into a single 'Revision'
# datetime is currently disabled; the code is kept here for reference.
#stg_hist['Revision'] = stg_hist.apply(lambda r: pd.datetime.combine(
#                                      r['REVISION_DATE'].date(),
#                                      r['REVISION_TIME'].time()), 1)
# FIELD_DATE is copied into 'create_date', the date column used below.
stg_hist['create_date'] = stg_hist['FIELD_DATE']

In [None]:
# Convert ACADEMIC_YEAR to numeric and keep only the numeric-valued records.
# The conversion is done once; values that cannot be parsed become NaN
# (errors='coerce') and are removed by the mask below.
academic_year_num = pd.to_numeric(stg_hist['ACADEMIC_YEAR'],
                                  errors='coerce', downcast='integer')
stg_hist['ACADEMIC_YEAR'] = academic_year_num
# .copy() gives the following cells an independent frame to modify, so
# adding/dropping columns does not act on a slice of the original.
stg_hist = stg_hist.loc[academic_year_num.notnull()].copy()

In [None]:
# Remove the raw date/time fields; 'create_date' (a copy of FIELD_DATE made
# earlier) is the date column used below.
raw_date_fields = ['FIELD_DATE', 'REVISION_DATE', 'REVISION_TIME']
stg_hist = stg_hist.drop(raw_date_fields, axis=1)

In [None]:
# calculate new fields
# Week_Number: week of the year of each record's create_date.
stg_hist['Week_Number'] = stg_hist['create_date'].dt.week

# ISO week containing 1 September of each record's ACADEMIC_YEAR.  It is
# computed once per record here, rather than three times per record inside
# a row-wise DataFrame.apply.
sept1_week = stg_hist['ACADEMIC_YEAR'].map(
    lambda year: date(int(year), 9, 1).isocalendar()[1])

# Admissions_Week: number of weeks after the 1 September week.  Records
# whose week is not later than that week wrap around by adding 53.
# NOTE(review): 53 is a fixed wrap length, and a record in the same week as
# 1 September gets 53 rather than 0 -- confirm both are intended.
week_offset = stg_hist['Week_Number'] - sept1_week
stg_hist['Admissions_Week'] = week_offset.where(week_offset > 0,
                                                week_offset + 53)

In [None]:
# Attach the stage-ranking attributes to every stage-history record
# (left join: history rows without a matching ranking id are kept).
stage_data = stg_hist.merge(stgrnk, how='left',
                            left_on='FIELD_ID',
                            right_on='STAGERANKING_ID')

In [None]:
# Inspect the merged frame: shape, per-column dtypes, then the first rows.
print('stage_data', stage_data.shape)
print('stage_data')
print(stage_data.dtypes)
stage_data.head()

In [None]:
# field_value codes to keep, and the columns carried forward.
ad_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR',
                  'DENY', 'DPAC', 'TRDP', 'TRPD', 'TRNS', 'WAIT']
ad_keep_cols = ['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                'field_value', 'status']
# Row filter and column selection in one step.
has_kept_code = stage_data['field_value'].isin(ad_keep_values)
sd1 = stage_data.loc[has_kept_code, ad_keep_cols]

In [None]:
# Inspect the filtered stage data: shape, dtypes, then the first rows.
print('sd1', sd1.shape)
print('sd1')
print(sd1.dtypes)
sd1.head()

In [None]:
# Summary of sd1 (index, columns, non-null counts, memory usage).
sd1.info()

In [None]:
# Mapping from field_value code to the admission status it is reported as.
admission_status = {
    '300': 'Applied',
    'ACC': 'Accepted',
    'ACXL': 'Canceled',
    'CANC': 'Canceled',
    'DEF': 'Canceled',
    'DEFR': 'Canceled',
    'DENY': 'Canceled',
    'DPAC': 'Deposited',
    'TRDP': 'Deposited',
    'TRPD': 'Deposited',
    'TRNS': 'Accepted',
    'WAIT': 'Accepted',
}
# The same mapping as a two-column frame, so it can be merged on field_value.
adm_stat = pd.DataFrame(
    {'field_value': list(admission_status.keys()),
     'admission_status': list(admission_status.values())},
    columns=['field_value', 'admission_status'])

In [None]:
# Show the code -> admission status lookup table.
print(adm_stat)

In [None]:
# Translate field_value into admission_status, keep one row per
# person / term / week / status, and index by (term, person, status).
sd1_dedupe_keys = ['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                   'admission_status']
sd1 = sd1.merge(adm_stat, how='left', on='field_value')
sd1 = sd1.drop(['field_value', 'status'], axis=1)
sd1 = sd1.drop_duplicates(sd1_dedupe_keys)
# set_index replaces the old row index, so it does not need to be reset
# and dropped first.
sd1 = sd1.set_index(['year_term', 'PEOPLE_CODE_ID', 'admission_status'])

In [None]:
# Inspect sd1 after the status merge: shape, dtypes, then the first rows.
print('sd1', sd1.shape)
print('sd1')
print(sd1.dtypes)
sd1.head()

In [None]:
# Summary of the re-indexed sd1 frame.
sd1.info()

<H2>Academic Data</H2>

In [None]:
# Load the academic extract (ACADEMIC.csv).
# The fields listed in academic_dtype are forced to str; the five
# date/time fields are parsed as datetimes.
academic_dtype = dict(PEOPLE_CODE_ID=str, ACADEMIC_YEAR=str,
                      ACADEMIC_TERM=str, ACADEMIC_SESSION=str,
                      APPLICATION_FLAG=str, APP_STATUS=str)
date_cols = ['APPLICATION_DATE', 'APP_STATUS_DATE', 'APP_DECISION_DATE',
             'REVISION_DATE', 'REVISION_TIME']
# Fields read from the file.
academic_cols = [
    'PEOPLE_CODE_ID',
    'ACADEMIC_YEAR', 'ACADEMIC_TERM', 'ACADEMIC_SESSION',
    'POPULATION', 'INQUIRY_FLAG',
    'APPLICATION_FLAG', 'APPLICATION_DATE',
    'APP_STATUS', 'APP_STATUS_DATE',
    'APP_DECISION', 'APP_DECISION_DATE',
    'REVISION_DATE', 'REVISION_TIME',
]
academic = pd.read_csv('ACADEMIC.csv',
                       usecols=academic_cols,
                       dtype=academic_dtype,
                       parse_dates=date_cols)

In [None]:
# Inspect the loaded academic data: shape, dtypes, then the info() summary.
print('academic', academic.shape)
print('academic')
print(academic.dtypes)
academic.info()

In [None]:
# Record count per POPULATION value (sorted by value), then the total of
# those counts.
print(academic['POPULATION'].value_counts().sort_index())
print(academic['POPULATION'].value_counts().sum())

In [None]:
# Keep academic records that are outside the 'ADVSTU' / 'NOND' populations
# and are flagged 'Y' as an inquiry or as an application.
in_excluded_population = academic['POPULATION'].isin(['ADVSTU', 'NOND'])
is_inquiry = academic['INQUIRY_FLAG'] == 'Y'
is_application = academic['APPLICATION_FLAG'] == 'Y'
app_data = academic.loc[~in_excluded_population &
                        (is_inquiry | is_application)]

print('app_data', app_data.shape)
print('app_data')
print(app_data.dtypes)
In [None]:
# Summary of the filtered inquiry/application records.
app_data.info()

In [None]:
# Columns kept for each academic-derived event frame built below.
academic_keep_fields = [
    'PEOPLE_CODE_ID',
    'ACADEMIC_YEAR',
    'ACADEMIC_TERM',
    'ACADEMIC_SESSION',
    'field_name',
    'field_value',
    'create_date',
]

In [None]:
# Application-status events: records with an APP_STATUS, reshaped so that
# the status becomes 'field_value' and its date becomes 'create_date'.
has_app_status = app_data['APP_STATUS'].notnull()
applied = (app_data.loc[has_app_status]
           .rename(columns={'APP_STATUS': 'field_value',
                            'APP_STATUS_DATE': 'create_date'})
           .assign(field_name='Application Status')
           )
# Only events that have a date are kept, restricted to the shared columns.
applied = applied.loc[applied['create_date'].notnull(), academic_keep_fields]

print('applied', applied.shape)
print('applied')
print(applied.dtypes)

In [None]:
# Summary of the application-status events.
applied.info()

In [None]:
# Application-decision events: records with an APP_DECISION, reshaped so
# that the decision becomes 'field_value' and its date becomes 'create_date'.
has_app_decision = app_data['APP_DECISION'].notnull()
accepted = (app_data.loc[has_app_decision]
            .rename(columns={'APP_DECISION': 'field_value',
                             'APP_DECISION_DATE': 'create_date'})
            .assign(field_name='Application Decision')
            )
# Only events that have a date are kept, restricted to the shared columns.
accepted = accepted.loc[accepted['create_date'].notnull(),
                        academic_keep_fields]


print('accepted', accepted.shape)
print('accepted')
print(accepted.dtypes)

In [None]:
# Summary of the application-decision events.
accepted.info()

In [None]:
# Stack Academic Applied and Academic Accepted only; the variant that also
# includes Stage History is commented out here.
# NOTE(review): the following cell rebinds adm_df to the three-frame stack,
# so on a top-to-bottom run this result is overwritten -- confirm which
# variant is intended.
#adm_df = stage_data.append(applied).append(accepted)
adm_df = applied.append(accepted)

# Shape and dtypes of the stacked frame.
print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
# Stack Stage History, Academic Applied and Academic Accepted into one
# event frame.  pd.concat stacks all three in a single call and keeps each
# frame's original row labels, as the chained DataFrame.append did;
# DataFrame.append no longer exists in current pandas releases.
adm_df = pd.concat([stage_data, applied, accepted])

# Shape and dtypes of the stacked frame.
print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
# First rows of the stacked event frame.
adm_df.head()

In [None]:
# Summary of the stacked event frame.
adm_df.info()

In [None]:
def revision(df):
    """Combine a record's revision date and revision time into one datetime.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame row)
        Must provide 'REVISION_DATE' and 'REVISION_TIME' entries whose
        values expose ``.date()`` and ``.time()`` respectively.

    Returns
    -------
    datetime.datetime
        The date part of REVISION_DATE joined with the time part of
        REVISION_TIME.
    """
    # Uses the standard-library datetime class imported at the top of the
    # notebook; the pd.datetime alias is not available in current pandas.
    return datetime.combine(df['REVISION_DATE'].date(),
                            df['REVISION_TIME'].time())

In [None]:
# new columns
def _year_label(year):
    """Return the academic year as a whole-number string, e.g. '2014'.

    The stacked frame holds ACADEMIC_YEAR as numbers for the stage-history
    rows and as strings for the academic rows; adding '.' to a number raises
    TypeError.  Values that cannot be read as a number (including missing
    values) are returned unchanged.
    """
    try:
        return str(int(float(year)))
    except (TypeError, ValueError, OverflowError):
        return year


# 'year_term' label: normalised year, a dot, then the title-cased term.
adm_df['year_term'] = (adm_df['ACADEMIC_YEAR'].map(_year_label) + '.' +
                       adm_df['ACADEMIC_TERM'].str.title())

# The combined 'Revision' datetime is currently disabled.
#adm_df['Revision'] = adm_df.apply(revision,1)
# Week of the year of each event's create_date.
adm_df['Week_Number'] = adm_df['create_date'].dt.week

# Keep only FALL / SPRING terms of the MAIN session.
adm_df = (adm_df.loc[(adm_df['ACADEMIC_TERM'].isin(['FALL', 'SPRING'])) &
                     (adm_df['ACADEMIC_SESSION'] == 'MAIN')]
         )

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
# Summary of adm_df after the term/session filter.
adm_df.info()

In [None]:
# Spot check: up to 30 rows for one PEOPLE_CODE_ID in the 2014.Fall term.
# NOTE(review): hard-coded person identifier -- confirm it is acceptable to
# keep in a shared notebook.
adm_df[(adm_df['year_term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
def adm_week_number(r):
    """Return the admissions week for one record.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must provide 'Week_Number' (week of the year of the event) and
        'ACADEMIC_YEAR' (anything ``int()`` accepts).

    Returns
    -------
    number
        ``Week_Number`` minus the ISO week containing 1 September of the
        academic year when that difference is positive; otherwise the
        difference plus 53.
    """
    # ISO week of 1 September, computed once per call.
    sept1_week = date(int(r['ACADEMIC_YEAR']), 9, 1).isocalendar()[1]
    if r['Week_Number'] > sept1_week:
        return r['Week_Number'] - sept1_week
    # NOTE(review): a record in the same week as 1 September lands here and
    # yields 53 -- confirm that is intended.
    return 53 + r['Week_Number'] - sept1_week

In [None]:
# Keep only records whose ACADEMIC_YEAR can be read as a number.
year_is_numeric = pd.to_numeric(adm_df['ACADEMIC_YEAR'],
                                errors='coerce').notnull()
# .copy() so the new column below is added to an independent frame rather
# than to a slice of the previous adm_df.
adm_df = adm_df.loc[year_is_numeric].copy()

# Weeks since the 1 September week of the academic year, computed per row.
adm_df['Admissions_Week'] = adm_df.apply(adm_week_number, axis=1)

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
# First rows of adm_df with the Admissions_Week column added.
adm_df.head()

In [None]:
# Column names currently present in adm_df.
adm_df.columns

In [None]:
# Spot check: up to 30 rows for one PEOPLE_CODE_ID in the 2014.Fall term.
# The variant below, filtering on ACADEMIC_YEAR, is disabled.
# NOTE(review): hard-coded person identifier -- confirm it is acceptable to
# keep in a shared notebook.
#adm_df.loc[(adm_df['ACADEMIC_YEAR']==2014 & (adm_df['PEOPLE_CODE_ID']=='P000026232') )]
adm_df[(adm_df['year_term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)


In [None]:
# Record count per field_value (sorted by value), then the total of those
# counts.
print(adm_df['field_value'].value_counts().sort_index())
print(adm_df['field_value'].value_counts().sum())

In [None]:
# field_value codes to keep, and the columns carried forward.
adm_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR',
                   'DENY', 'DPAC', 'TRDP', 'TRPD', 'TRNS', 'WAIT']
adm_keep_cols = ['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                 'field_value']
# Row filter and column selection in one step.
has_kept_value = adm_df['field_value'].isin(adm_keep_values)
adm_df = adm_df.loc[has_kept_value, adm_keep_cols]

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
# First rows of the reduced adm_df.
adm_df.head()

In [None]:
# Spot check: up to 30 rows for one PEOPLE_CODE_ID in the 2014.Fall term.
# NOTE(review): hard-coded person identifier -- confirm it is acceptable to
# keep in a shared notebook.
adm_df[(adm_df['year_term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
# Mapping from field_value code to the admission status it is reported as.
admission_status = {
    '300': 'Applied',
    'ACC': 'Accepted',
    'ACXL': 'Canceled',
    'CANC': 'Canceled',
    'DEF': 'Canceled',
    'DEFR': 'Canceled',
    'DENY': 'Canceled',
    'DPAC': 'Deposited',
    'TRDP': 'Deposited',
    'TRPD': 'Deposited',
    'TRNS': 'Accepted',
    'WAIT': 'Accepted',
}
# The same mapping as a two-column frame, so it can be merged on field_value.
adm_stat = pd.DataFrame(
    {'field_value': list(admission_status.keys()),
     'admission_status': list(admission_status.values())},
    columns=['field_value', 'admission_status'])

# Translate codes to statuses, then keep one row per
# person / term / week / status.
adm_df1 = adm_df.merge(adm_stat, how='left', on='field_value')
adm_df1 = adm_df1.drop('field_value', axis=1)
adm_df1 = adm_df1.drop_duplicates(['PEOPLE_CODE_ID', 'year_term',
                                   'Admissions_Week', 'admission_status'])

print('adm_df1', adm_df1.shape)
print('adm_df1')
print(adm_df1.dtypes)

In [None]:
# Spot check: up to 30 'Deposited' rows for the 2014.Fall term.
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['admission_status']=='Deposited')].head(30)

In [None]:
# First rows of the status-level frame.
adm_df1.head()

In [None]:
# Spot check: up to 30 rows for one PEOPLE_CODE_ID in the 2014.Fall term.
# NOTE(review): hard-coded person identifier -- confirm it is acceptable to
# keep in a shared notebook.
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
# For the two Fall terms of interest, keep the earliest Admissions_Week at
# which each person reached each admission status, then spread the statuses
# into columns.
fall_terms = ['2014.Fall', '2015.Fall']
status_index = ['year_term', 'PEOPLE_CODE_ID', 'admission_status']

adm_df2 = adm_df1.loc[adm_df1['year_term'].isin(fall_terms)]
# Sorting by week first makes keep='first' retain the lowest week.
adm_df2 = adm_df2.sort_values(['year_term', 'PEOPLE_CODE_ID',
                               'Admissions_Week'])
adm_df2 = adm_df2.drop_duplicates(status_index, keep='first')
# set_index replaces the old row index, so it does not need to be reset
# and dropped first; unstack moves admission_status into the columns.
adm_df2 = adm_df2.set_index(status_index).unstack(level=-1)

print('adm_df2', adm_df2.shape)
print('adm_df2\n', adm_df2.dtypes)

In [None]:
# First rows of the per-person, status-by-column table.
adm_df2.head()