In [1]:
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version
print(f'IPython - {ipython_version}')
print(f'Pandas - {pandas_version}')
print(f'Bokeh - {bokeh_version}')

IPython - 6.1.0
Pandas - 0.22.0
Bokeh - 0.12.14


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date

<H2>Stage Ranking codes</H2>

In [3]:
# read VWSTAGERANKING.csv and drop unused fields
stgrnk = pd.read_csv('VWSTAGERANKING.csv')
stgrnk.drop(['code_table', 'MEDIUM_DESC',
             'Converted/Confirmed/Accepted/Require SepDate'],
            inplace=True, axis=1)

In [None]:
print('stgrnk', stgrnk.shape)

In [None]:
print('stgrnk\n', stgrnk.dtypes)

In [None]:
print(stgrnk.columns)

In [None]:
stgrnk

<H2>Stage History data</H2>

In [4]:
# read STAGEHISTORY.csv
stg_hist_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str,
                  'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str,
                  'FIELD_ID': np.int64}
date_cols = ['FIELD_DATE']
stg_hist = pd.read_csv('STAGEHISTORY.csv', dtype=stg_hist_dtype,
                       parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID', 'ACADEMIC_YEAR',
                                'ACADEMIC_TERM', 'ACADEMIC_SESSION',
                                'FIELD_ID', 'FIELD_DATE'])

In [None]:
print('stg_hist', stg_hist.shape)

In [None]:
print('stg_hist')
print(stg_hist.dtypes)

In [5]:
stg_hist['create_date'] = stg_hist['FIELD_DATE']

In [6]:
stage_data = pd.merge(stg_hist, stgrnk, left_on=['FIELD_ID'],
                      right_on=['STAGERANKING_ID'], how='left')

In [7]:
keep_fields = ['PEOPLE_CODE_ID', 'ACADEMIC_YEAR', 'ACADEMIC_TERM',
               'ACADEMIC_SESSION', 'field_name', 'field_value', 'create_date']
stage_data = stage_data.loc[~stage_data['create_date'].isnull(), keep_fields]

In [None]:
print('stage_data', stage_data.shape)

In [None]:
print('stage_data')
print(stage_data.dtypes)
stage_data.head()

<H2>Academic Data</H2>

In [8]:
academic_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str,
                  'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str,
                  'APPLICATION_FLAG': str, 'APP_STATUS': str}
date_cols = ['APPLICATION_DATE', 'APP_STATUS_DATE', 'APP_DECISION_DATE']
academic = pd.read_csv('ACADEMIC.csv', dtype=academic_dtype,
                       parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID',
                                'ACADEMIC_YEAR', 'ACADEMIC_TERM',
                                'ACADEMIC_SESSION', 'POPULATION',
                                'INQUIRY_FLAG',
                                'APPLICATION_FLAG', 'APPLICATION_DATE',
                                'APP_STATUS', 'APP_STATUS_DATE',
                                'APP_DECISION', 'APP_DECISION_DATE'])

In [None]:
print('academic', academic.shape)
print('academic')
print(academic.dtypes)
academic.info()

In [None]:
print(academic['POPULATION'].value_counts().sort_index())
print(academic['POPULATION'].value_counts().sum())

In [9]:
app_data = (academic.loc[~(academic['POPULATION'].isin(['ADVSTU', 'NOND'])) &
                         ((academic['INQUIRY_FLAG'] == 'Y') |
                          (academic['APPLICATION_FLAG'] == 'Y'))]
            )

print('app_data', app_data.shape)
print('app_data')
print(app_data.dtypes)

app_data (42151, 12)
app_data
PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
APPLICATION_FLAG             object
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
INQUIRY_FLAG                 object
APPLICATION_DATE     datetime64[ns]
dtype: object


In [None]:
app_data.info()

In [10]:
applied = (app_data[app_data['APP_STATUS'].notnull()]
           .rename(columns={'APP_STATUS': 'field_value'})
           .rename(columns={'APP_STATUS_DATE': 'create_date'})
           )
applied.loc[:, 'field_name'] = 'Application Status'
applied = applied.loc[~applied['create_date'].isnull(), keep_fields]

print('applied', applied.shape)
print('applied')
print(applied.dtypes)

applied (14387, 7)
applied
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
field_name                  object
field_value                 object
create_date         datetime64[ns]
dtype: object


In [None]:
applied.info()

In [11]:
accepted = (app_data[app_data['APP_DECISION'].notnull()]
            .rename(columns={'APP_DECISION': 'field_value'})
            .rename(columns={'APP_DECISION_DATE': 'create_date'})
            )
accepted.loc[:, 'field_name'] = 'Application Decision'
accepted = accepted.loc[~accepted['create_date'].isnull(), keep_fields]


print('accepted', accepted.shape)
print('accepted')
print(accepted.dtypes)

accepted (14290, 7)
accepted
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
field_name                  object
field_value                 object
create_date         datetime64[ns]
dtype: object


In [None]:
accepted.info()

In [199]:
# stack Stage History, Academic Applied and Academic Accepted
adm_df = stage_data.append(applied).append(accepted)

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

adm_df (308512, 7)
adm_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
field_name                  object
field_value                 object
create_date         datetime64[ns]
dtype: object


In [None]:
adm_df.head()

In [None]:
adm_df.info()

In [201]:
adm_df = (adm_df.loc[((adm_df['ACADEMIC_TERM'].isin(['FALL', 'SPRING'])) &
                      (adm_df['ACADEMIC_SESSION'] == 'MAIN') &
                      (adm_df['ACADEMIC_YEAR'] >= '2009')
                      )
                     ]
          )

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

adm_df (118325, 7)
adm_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
field_name                  object
field_value                 object
create_date         datetime64[ns]
dtype: object


In [14]:
# create new fields
adm_df['year_term'] = (adm_df['ACADEMIC_YEAR'] + '.' +
                       adm_df['ACADEMIC_TERM'].str.title())
adm_df['Week_Number'] = adm_df['create_date'].dt.week

In [15]:
# convert ACADEMIC_YEAR to numeric keep numeric-valued records
adm_df['ACADEMIC_YEAR'] = pd.to_numeric(adm_df['ACADEMIC_YEAR'],
                                        errors='coerce', downcast='integer')
adm_df = adm_df.loc[adm_df['ACADEMIC_YEAR'].notnull()]

In [16]:
adm_week_number = (lambda r: (r['Week_Number'] -
                              (date(int(r['ACADEMIC_YEAR']), 9, 1)
                              .isocalendar()[1])
                              )
                   if (r['Week_Number'] > (date(int(r['ACADEMIC_YEAR']), 9, 1)
                                           .isocalendar()[1]))
                   else (53 + r['Week_Number'] -
                         (date(int(r['ACADEMIC_YEAR']), 9, 1)
                         .isocalendar()[1])
                         )
                   )


In [17]:
adm_df['Admissions_Week'] = adm_df.apply(adm_week_number, axis=1)

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

adm_df (169029, 10)
adm_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR                int16
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
field_name                  object
field_value                 object
create_date         datetime64[ns]
year_term                   object
Week_Number                  int64
Admissions_Week              int64
dtype: object


In [None]:
adm_df.info()

In [None]:
adm_df.head()

In [None]:
adm_df.columns

In [None]:
adm_df[(adm_df['year_term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
print(adm_df['field_value'].value_counts().sort_index())
print(adm_df['field_value'].value_counts().sum())

In [18]:
adm_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR', 'DENY', 'DPAC',
                   'TRDP', 'TRPD', 'TRNS', 'WAIT']
adm_keep_cols = ['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                 'field_value']
adm_df = adm_df.loc[(adm_df['field_value'].isin(adm_keep_values)),
                    adm_keep_cols]

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

adm_df (42973, 4)
adm_df
PEOPLE_CODE_ID     object
year_term          object
Admissions_Week     int64
field_value        object
dtype: object


In [None]:
adm_df.head()

In [None]:
adm_df[(adm_df['year_term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [19]:
# admissions status table
admission_status = {'300': 'Applied', 'ACC': 'Accepted', 'ACXL': 'Canceled',
                    'CANC': 'Canceled', 'DEF': 'Canceled', 'DEFR': 'Canceled',
                    'DENY': 'Canceled', 'DPAC': 'Deposited',
                    'TRDP': 'Deposited', 'TRPD': 'Deposited',
                    'TRNS': 'Accepted', 'WAIT': 'Accepted'}
adm_stat = pd.DataFrame(list(admission_status.items()),
                        columns=['field_value', 'admission_status'])

In [20]:
adm_df1 = (pd.merge(adm_df, adm_stat, on=['field_value'], how='left')
           .drop(['field_value'], axis=1)
           .drop_duplicates(['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                             'admission_status'])
           )

print('adm_df1', adm_df1.shape)
print('adm_df1')
print(adm_df1.dtypes)

adm_df1 (28178, 4)
adm_df1
PEOPLE_CODE_ID      object
year_term           object
Admissions_Week      int64
admission_status    object
dtype: object


In [None]:
adm_df1.head()

In [21]:
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['admission_status']=='Deposited')].head(30)

Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,admission_status
2728,P000013143,2014.Fall,49,Deposited
2730,P000013143,2014.Fall,53,Deposited
8256,P000026232,2014.Fall,36,Deposited
8559,P000026411,2014.Fall,39,Deposited
9819,P000027147,2014.Fall,34,Deposited
9872,P000027224,2014.Fall,40,Deposited
9899,P000027290,2014.Fall,42,Deposited
9900,P000027290,2014.Fall,23,Deposited
10205,P000027843,2014.Fall,8,Deposited
10309,P000028043,2014.Fall,37,Deposited


In [22]:
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['PEOPLE_CODE_ID']=='P000026232')].head(30)

Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,admission_status
8254,P000026232,2014.Fall,21,Applied
8255,P000026232,2014.Fall,22,Accepted
8256,P000026232,2014.Fall,36,Deposited


In [24]:
adm_df1 = (adm_df1.sort_values(['year_term', 'PEOPLE_CODE_ID',
                                'Admissions_Week'])
           .drop_duplicates(['year_term', 'PEOPLE_CODE_ID',
                             'admission_status'],
                            keep='first')
           )

In [25]:
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['admission_status']=='Deposited')].head(30)

Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,admission_status
2728,P000013143,2014.Fall,49,Deposited
8256,P000026232,2014.Fall,36,Deposited
8559,P000026411,2014.Fall,39,Deposited
9819,P000027147,2014.Fall,34,Deposited
9872,P000027224,2014.Fall,40,Deposited
9900,P000027290,2014.Fall,23,Deposited
10205,P000027843,2014.Fall,8,Deposited
10309,P000028043,2014.Fall,37,Deposited
10312,P000028044,2014.Fall,30,Deposited
10317,P000028047,2014.Fall,32,Deposited


In [26]:
adm_df1[(adm_df1['year_term']=='2014.Fall') & (adm_df1['PEOPLE_CODE_ID']=='P000026232')].head(30)

Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,admission_status
8254,P000026232,2014.Fall,21,Applied
8255,P000026232,2014.Fall,22,Accepted
8256,P000026232,2014.Fall,36,Deposited


In [None]:
adm_df2 = (adm_df1.loc[(adm_df1['year_term'].isin(['2014.Fall', '2015.Fall']))]
           .reset_index()
           .set_index(['year_term', 'PEOPLE_CODE_ID', 'admission_status'])
           .drop(['index'], axis=1)
           .unstack(level=-1)
          )

print('adm_df2', adm_df2.shape)
print('adm_df2')
print(adm_df2.dtypes)

In [None]:
adm_df2.head()

In [None]:
#q = adm_df1.loc[(adm_df1['year_term']=='2014.Fall'), ['PEOPLE_CODE_ID', 'Admissions_Week', 'admission_status']]
q = adm_df1.loc[((adm_df1['year_term']=='2014.Fall') | (adm_df1['year_term']=='2015.Fall')), ['year_term', 'PEOPLE_CODE_ID', 'Admissions_Week', 'admission_status']]

In [None]:
print(q.shape)
q.head()

In [None]:
w = q.set_index(['year_term', 'PEOPLE_CODE_ID'])

In [None]:
print(w.shape)
w.head(30)

In [183]:
e = adm_df1.pivot_table(index=['year_term', 'PEOPLE_CODE_ID'],
                        columns=['admission_status'],
                        values=['Admissions_Week']
                        )
print(e.shape)
e.head(30)

(15721, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Admissions_Week,Admissions_Week,Admissions_Week,Admissions_Week
Unnamed: 0_level_1,admission_status,Accepted,Applied,Canceled,Deposited
year_term,PEOPLE_CODE_ID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1987.Fall,P000025214,43.0,43.0,,
1995.Fall,P000012054,11.0,,,
1997.Fall,P000006798,,13.0,,13.0
1999.Fall,P000004968,9.0,,,
2000.Fall,P000000055,8.0,,,
2000.Fall,P000000083,8.0,,,
2000.Fall,P000000476,8.0,,,
2000.Fall,P000000531,8.0,,,
2000.Fall,P000000561,8.0,,,
2000.Fall,P000000583,8.0,,,


In [190]:
# function returns status for week
def f_status(field, data_frame, n):
    f_week = (lambda df: 1
              if ((df[('Admissions_Week', field)] <= n) &
                  (df[('Admissions_Week', 'Canceled')] > n))
              else 0
              )
    return data_frame.apply(f_week, axis=1)

In [191]:
# function returns DataFrame of 53 week status values
def fill_weeks(field, data_frame):
    weeks = range(1, 54)
    fld = field[:2]
    r = pd.DataFrame(np.zeros((data_frame.shape[0], 53)),
                     index=data_frame.index,
                     columns=[f'{fld}{w:02d}' for w in weeks])
    for w in weeks:
        f = f'{fld}{w:02d}'
        r.loc[:, f] = f_status(field, data_frame, w)

    return r

In [195]:
stage_list = ['Applied', 'Accepted', 'Deposited']
p = pd.DataFrame()
for s in stage_list:
    p = pd.concat([p, fill_weeks(s, e)], axis=1)


In [196]:
p.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Ap01,Ap02,Ap03,Ap04,Ap05,Ap06,Ap07,Ap08,Ap09,Ap10,...,De44,De45,De46,De47,De48,De49,De50,De51,De52,De53
year_term,PEOPLE_CODE_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1987.Fall,P000025214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1995.Fall,P000012054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997.Fall,P000006798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999.Fall,P000004968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [197]:

print(p.shape)
p.head(30)

(15721, 159)


Unnamed: 0_level_0,Unnamed: 1_level_0,Ap01,Ap02,Ap03,Ap04,Ap05,Ap06,Ap07,Ap08,Ap09,Ap10,...,De44,De45,De46,De47,De48,De49,De50,De51,De52,De53
year_term,PEOPLE_CODE_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1987.Fall,P000025214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1995.Fall,P000012054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997.Fall,P000006798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999.Fall,P000004968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.Fall,P000000583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
