In [1]:
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version
print("IPython - %s" % ipython_version)
print("Pandas - %s" % pandas_version)
print("Bokeh - %s" % bokeh_version)

IPython - 6.1.0
Pandas - 0.21.0
Bokeh - 0.12.11


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date

<H2>Stage Ranking codes</H2>

In [3]:
stgrnk = (pd.read_csv('VWSTAGERANKING.csv')
          .drop(['code_table','MEDIUM_DESC','Converted/Confirmed/Accepted/Require SepDate'], axis=1)
         )

print('stgrnk', stgrnk.shape)
print('stgrnk\n', stgrnk.dtypes)

stgrnk (53, 7)
stgrnk
 STAGERANKING_ID     int64
field_name         object
field_value        object
rank                int64
short_desc         object
Canceled           object
status             object
dtype: object


<H2>Stage History data</H2>

In [4]:
stg_hist_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str, 'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str, 'FIELD_ID': np.int64 }
date_cols = ['FIELD_DATE','REVISION_DATE','REVISION_TIME']
stg_hist = pd.read_csv('STAGEHISTORY.csv', dtype=stg_hist_dtype, parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID','ACADEMIC_YEAR','ACADEMIC_TERM','ACADEMIC_SESSION','FIELD_ID','FIELD_DATE','REVISION_DATE','REVISION_TIME'])

print('stg_hist', stg_hist.shape)
print('stg_hist\n', stg_hist.dtypes)

stg_hist (279835, 8)
stg_hist
 PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
FIELD_DATE          datetime64[ns]
REVISION_DATE       datetime64[ns]
REVISION_TIME       datetime64[ns]
dtype: object


In [5]:
stage_data = pd.merge(stg_hist, stgrnk, left_on=['FIELD_ID'], right_on=['STAGERANKING_ID'], how='left' )

print('stage_data', stage_data.shape)
print('stage_data\n', stage_data.dtypes)

stage_data (279835, 15)
stage_data
 PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
FIELD_DATE          datetime64[ns]
REVISION_DATE       datetime64[ns]
REVISION_TIME       datetime64[ns]
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
dtype: object


<H2>Academic data</H2>

In [6]:
academic_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str, 'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str, 'APPLICATION_FLAG': str, 'APP_STATUS': str}
date_cols = ['APPLICATION_DATE', 'APP_STATUS_DATE', 'APP_DECISION_DATE', 'REVISION_DATE', 'REVISION_TIME']
academic = pd.read_csv('ACADEMIC.csv', dtype=academic_dtype, parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID','ACADEMIC_YEAR','ACADEMIC_TERM','ACADEMIC_SESSION','POPULATION','INQUIRY_FLAG','APPLICATION_FLAG','APPLICATION_DATE', 'APP_STATUS', 'APP_STATUS_DATE', 'APP_DECISION', 'APP_DECISION_DATE', 'REVISION_DATE', 'REVISION_TIME'])

print('academic', academic.shape)
print('academic\n', academic.dtypes)

academic (122300, 14)
academic
 PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
INQUIRY_FLAG                 object
APPLICATION_DATE     datetime64[ns]
dtype: object


In [7]:
app_data = academic.loc[~(academic['POPULATION'].isin(['AVDSTU', 'NOND'])) & ((academic['INQUIRY_FLAG']=='Y') | (academic['APPLICATION_FLAG']=='Y'))]
print('app_data', app_data.shape)
print('app_data\n', app_data.dtypes)

applied = (app_data[app_data['APP_STATUS'].notnull()]
           .rename(columns={'APP_STATUS': 'field_value'})
           .rename(columns={'APP_STATUS_DATE': 'Revision'})
          )
applied.loc[:, 'field_name'] = 'Application Status'
print('applied', applied.shape)
print('applied\n', applied.dtypes)

accepted = (app_data[app_data['APP_DECISION'].notnull()]
            .rename(columns={'APP_DECISION': 'field_value'})
            .rename(columns={'APP_DECISION_DATE': 'Revision'})
           )
accepted.loc[:, 'field_name'] = 'Application Decision'
print('accepted', accepted.shape)
print('accepted\n', accepted.dtypes)


app_data (42157, 14)
app_data
 PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
INQUIRY_FLAG                 object
APPLICATION_DATE     datetime64[ns]
dtype: object
applied (23155, 15)
applied
 PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
field_value                  object
Revision             datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datet

In [8]:
# stack Stage History, Academic Applied and Academic Accepted
adm_df = stage_data.append(applied).append(accepted)

print('adm_df', adm_df.shape)
print('adm_df\n', adm_df.dtypes)

adm_df (326049, 24)
adm_df
 ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
Canceled                     object
FIELD_DATE           datetime64[ns]
FIELD_ID                    float64
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
Revision             datetime64[ns]
STAGERANKING_ID             float64
field_name                   object
field_value                  object
rank                        float64
short_desc                   object
status                       object
dtype: object


In [9]:
adm_df.head()

Unnamed: 0,ACADEMIC_SESSION,ACADEMIC_TERM,ACADEMIC_YEAR,APPLICATION_DATE,APPLICATION_FLAG,APP_DECISION,APP_DECISION_DATE,APP_STATUS,APP_STATUS_DATE,Canceled,...,POPULATION,REVISION_DATE,REVISION_TIME,Revision,STAGERANKING_ID,field_name,field_value,rank,short_desc,status
0,,SPRING,2000,NaT,,,NaT,,NaT,,...,,2006-10-26,1900-01-01 16:41:53,NaT,15.0,Enrolled/Separated,ENRL,15.0,Enrolled,A
1,,SPRING,2000,NaT,,,NaT,,NaT,N,...,,2006-10-26,1900-01-01 16:41:53,NaT,10.0,Application Status,500,10.0,Deposited,A
2,,SPRING,2000,NaT,,,NaT,,NaT,,...,,2006-10-26,1900-01-01 16:41:53,NaT,1.0,Application Decision,ACC,1.0,Accepted,A
3,,SPRING,2000,NaT,,,NaT,,NaT,,...,,2006-12-21,1900-01-01 16:53:25,NaT,18.0,Enrolled/Separated,WITH,18.0,Withdrawn,A
4,,SPRING,2000,NaT,,,NaT,,NaT,,...,,2006-12-21,1900-01-01 16:53:25,NaT,15.0,Enrolled/Separated,ENRL,15.0,Enrolled,A


In [10]:
# new columns
adm_df['Year_Term'] = adm_df['ACADEMIC_YEAR'] + '.' + adm_df['ACADEMIC_TERM'].str.title()

adm_df['Revision'] = adm_df.apply(lambda r : pd.datetime.combine(r['REVISION_DATE'].date(),r['REVISION_TIME'].time()),1)
adm_df['Week_Number'] = adm_df['Revision'].dt.week

adm_df = (adm_df.drop(['FIELD_DATE','REVISION_DATE','REVISION_TIME'], axis=1)
          .loc[(adm_df['ACADEMIC_TERM'].isin(['FALL', 'SPRING'])) & (adm_df['ACADEMIC_SESSION'] == 'MAIN')]
         )

print('adm_df', adm_df.shape)
print('adm_df\n', adm_df.dtypes)

adm_df (177129, 23)
adm_df
 ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
Canceled                     object
FIELD_ID                    float64
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
Revision             datetime64[ns]
STAGERANKING_ID             float64
field_name                   object
field_value                  object
rank                        float64
short_desc                   object
status                       object
Year_Term                    object
Week_Number                   int64
dtype: object


In [11]:
adm_df = adm_df[pd.to_numeric(adm_df['ACADEMIC_YEAR'], errors='coerce', downcast='integer').notnull()]

adm_df['Admissions_Week'] = adm_df.apply(lambda r: (r['Week_Number'] - (date(int(r['ACADEMIC_YEAR']), 9, 1).isocalendar()[1]))
                                            if (r['Week_Number'] > (date(int(r['ACADEMIC_YEAR']), 9, 1).isocalendar()[1]))
                                            else (53 + r['Week_Number'] - (date(int(r['ACADEMIC_YEAR']), 9, 1).isocalendar()[1])), axis=1)

print('adm_df', adm_df.shape)
print('adm_df\n', adm_df.dtypes)

adm_df (177129, 24)
adm_df
 ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
Canceled                     object
FIELD_ID                    float64
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
Revision             datetime64[ns]
STAGERANKING_ID             float64
field_name                   object
field_value                  object
rank                        float64
short_desc                   object
status                       object
Year_Term                    object
Week_Number                   int64
Admissions_Week               int64
dtype: object


In [12]:
adm_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR', 'DENY', 'DPAC',
                  'TRDP', 'TRPD', 'TRNS', 'WAIT']
adm_keep_cols = ['PEOPLE_CODE_ID', 'Year_Term', 'Admissions_Week',
                'field_value', 'status']
adm_df = adm_df.loc[(adm_df['field_value'].isin(adm_keep_values))]
adm_df = adm_df[adm_keep_cols]

print('adm_df', adm_df.shape)
print('adm_df\n', adm_df.dtypes)

adm_df (47024, 5)
adm_df
 PEOPLE_CODE_ID     object
Year_Term          object
Admissions_Week     int64
field_value        object
status             object
dtype: object


In [13]:
admission_status = {'300' : 'Applied', 'ACC': 'Accepted', 'ACXL': 'Canceled', 'CANC': 'Canceled', 'DEF': 'Canceled', 'DEFR': 'Canceled', 'DENY': 'Canceled', 'DPAC': 'Deposited', 'TRDP': 'Deposited', 'TRPD': 'Deposited', 'TRNS': 'Accepted', 'WAIT': 'Accepted'}
adm_stat = pd.DataFrame(list(admission_status.items()), columns=['field_value', 'admission_status'])

adm_df1 = (pd.merge(adm_df, adm_stat, on=['field_value'], how='left' )
           .drop(['field_value','status'], axis=1)
           .drop_duplicates(['PEOPLE_CODE_ID', 'Year_Term', 'Admissions_Week', 'admission_status'])
          )

print('adm_df1', adm_df1.shape)
print('adm_df1\n', adm_df1.dtypes)

adm_df1 (39121, 4)
adm_df1
 PEOPLE_CODE_ID      object
Year_Term           object
Admissions_Week      int64
admission_status    object
dtype: object


In [61]:
#year_term_list = ['2008.Fall', '2009.Fall', '2010.Fall', '2011.Fall', '2012.Fall', '2013.Fall', '2014.Fall', '2015.Fall', '2016.Fall', '2017.Fall']
year_term_list = ['2012.Fall', '2013.Fall', '2014.Fall', '2015.Fall', '2016.Fall', '2017.Fall']
#year_term_list = ['2013.Fall']
adm_df2 = (adm_df1.loc[(adm_df1['Year_Term'].isin(year_term_list))]
           .sort_values(['Year_Term', 'PEOPLE_CODE_ID', 'Admissions_Week'])
           .drop_duplicates(['Year_Term', 'PEOPLE_CODE_ID', 'admission_status'], keep='first')
           .reset_index()
           .set_index(['Year_Term', 'PEOPLE_CODE_ID', 'admission_status'])
           .drop(['index'], axis=1)
           .unstack(level=-1)
          )

print('adm_df2', adm_df2.shape)
print('adm_df2\n', adm_df2.dtypes)

adm_df2 (8686, 4)
adm_df2
                  admission_status
Admissions_Week  Accepted            float64
                 Applied             float64
                 Canceled            float64
                 Deposited           float64
dtype: object


In [53]:
#adm_df2.head()
adm_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Admissions_Week,Admissions_Week,Admissions_Week,Admissions_Week
Unnamed: 0_level_1,admission_status,Accepted,Applied,Canceled,Deposited
Year_Term,PEOPLE_CODE_ID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013.Fall,P000022035,,13.0,,
2013.Fall,P000023298,38.0,38.0,1.0,1.0
2013.Fall,P000024221,,,47.0,
2013.Fall,P000025778,51.0,43.0,43.0,43.0
2013.Fall,P000025803,39.0,,27.0,39.0
2013.Fall,P000025838,36.0,3.0,49.0,3.0
2013.Fall,P000025863,1.0,22.0,1.0,
2013.Fall,P000025895,38.0,,,25.0
2013.Fall,P000025915,36.0,36.0,46.0,14.0
2013.Fall,P000025938,39.0,22.0,1.0,2.0


In [62]:
adm_df2.groupby(['Year_Term']).count()

Unnamed: 0_level_0,Admissions_Week,Admissions_Week,Admissions_Week,Admissions_Week
admission_status,Accepted,Applied,Canceled,Deposited
Year_Term,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2012.Fall,816,649,904,443
2013.Fall,837,1131,1106,367
2014.Fall,804,1093,1348,357
2015.Fall,1160,1663,1837,368
2016.Fall,804,833,224,337
2017.Fall,969,976,57,308


In [63]:
#
adm_df3 = adm_df2.copy()
#print(adm_df3.head())
app_status = 'Applied', 'Accepted', 'Deposited', 'Canceled'
for aps in app_status:
    for w in range(53):
        col_name = 'AdmWk{:02d}'.format(w+1)
    
        adm_df3[(col_name, aps)] = (adm_df3[('Admissions_Week', aps)] <= (w+1))
#        adm_df3[col_name] = ((adm_df3[('Admissions_Week', 'Applied')] <= (w+1)) &
#                             (adm_df3[('Admissions_Week', 'Canceled')] > (w+1)))

print(adm_df3.head())

                         Admissions_Week                            AdmWk01  \
admission_status                Accepted Applied Canceled Deposited Applied   
Year_Term PEOPLE_CODE_ID                                                      
2012.Fall P000012781                35.0     NaN     41.0       NaN   False   
          P000021989                46.0    45.0     51.0       NaN   False   
          P000023258                 NaN     9.0      NaN       NaN   False   
          P000023557                 NaN    31.0     37.0       NaN   False   
          P000024505                34.0    33.0      NaN      46.0   False   

                         AdmWk02 AdmWk03 AdmWk04 AdmWk05 AdmWk06   ...     \
admission_status         Applied Applied Applied Applied Applied   ...      
Year_Term PEOPLE_CODE_ID                                           ...      
2012.Fall P000012781       False   False   False   False   False   ...      
          P000021989       False   False   False   False   

In [64]:
adm_df3.groupby(['Year_Term']).sum()

Unnamed: 0_level_0,Admissions_Week,Admissions_Week,Admissions_Week,Admissions_Week,AdmWk01,AdmWk02,AdmWk03,AdmWk04,AdmWk05,AdmWk06,...,AdmWk44,AdmWk45,AdmWk46,AdmWk47,AdmWk48,AdmWk49,AdmWk50,AdmWk51,AdmWk52,AdmWk53
admission_status,Accepted,Applied,Canceled,Deposited,Applied,Applied,Applied,Applied,Applied,Applied,...,Canceled,Canceled,Canceled,Canceled,Canceled,Canceled,Canceled,Canceled,Canceled,Canceled
Year_Term,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2012.Fall,19355.0,18379.0,26395.0,10620.0,0.0,27.0,27.0,31.0,31.0,31.0,...,713.0,723.0,746.0,755.0,805.0,807.0,824.0,872.0,881.0,904.0
2013.Fall,18036.0,24068.0,32146.0,8847.0,8.0,18.0,38.0,54.0,76.0,102.0,...,919.0,946.0,975.0,999.0,1009.0,1048.0,1062.0,1085.0,1089.0,1106.0
2014.Fall,17059.0,22653.0,44699.0,4664.0,10.0,18.0,23.0,35.0,77.0,117.0,...,974.0,980.0,994.0,1060.0,1067.0,1075.0,1098.0,1107.0,1284.0,1348.0
2015.Fall,24292.0,32747.0,66849.0,9763.0,9.0,23.0,43.0,52.0,67.0,95.0,...,1093.0,1135.0,1147.0,1162.0,1226.0,1473.0,1767.0,1798.0,1822.0,1837.0
2016.Fall,20719.0,21296.0,9865.0,4892.0,0.0,1.0,1.0,1.0,3.0,4.0,...,89.0,115.0,127.0,158.0,164.0,178.0,187.0,203.0,216.0,224.0
2017.Fall,29386.0,29649.0,2116.0,2288.0,0.0,0.0,0.0,1.0,1.0,2.0,...,22.0,23.0,23.0,27.0,27.0,28.0,40.0,40.0,46.0,57.0


In [74]:
adm_df3.columns


MultiIndex(levels=[['Admissions_Week', 'AdmWk01', 'AdmWk02', 'AdmWk03', 'AdmWk04', 'AdmWk05', 'AdmWk06', 'AdmWk07', 'AdmWk08', 'AdmWk09', 'AdmWk10', 'AdmWk11', 'AdmWk12', 'AdmWk13', 'AdmWk14', 'AdmWk15', 'AdmWk16', 'AdmWk17', 'AdmWk18', 'AdmWk19', 'AdmWk20', 'AdmWk21', 'AdmWk22', 'AdmWk23', 'AdmWk24', 'AdmWk25', 'AdmWk26', 'AdmWk27', 'AdmWk28', 'AdmWk29', 'AdmWk30', 'AdmWk31', 'AdmWk32', 'AdmWk33', 'AdmWk34', 'AdmWk35', 'AdmWk36', 'AdmWk37', 'AdmWk38', 'AdmWk39', 'AdmWk40', 'AdmWk41', 'AdmWk42', 'AdmWk43', 'AdmWk44', 'AdmWk45', 'AdmWk46', 'AdmWk47', 'AdmWk48', 'AdmWk49', 'AdmWk50', 'AdmWk51', 'AdmWk52', 'AdmWk53'], ['Accepted', 'Applied', 'Canceled', 'Deposited']],
           labels=[[0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 

In [76]:
deposits = adm_df3.loc[['2014.Fall'],[:,'Applied']]
deposits.head()

SyntaxError: invalid syntax (<ipython-input-76-be7b79542242>, line 1)

In [None]:
from bokeh.plotting import figure, output_file, show

df = pd.read_csv('Active Deposits--historical data.txt', sep='\t')
p = figure(plot_width=800, plot_height=600, title="Weekly Deposits", x_axis_label="Week Number", y_axis_label="Deposits")

from bokeh.palettes import Blues8

p.line(df['Week #'], df['2008'], color=Blues8[6], legend='2008')
p.line(df['Week #'], df['2009'], color=Blues8[5], legend='2009')
p.line(df['Week #'], df['2010'], color=Blues8[4], legend='2010')
p.line(df['Week #'], df['2011'], color=Blues8[3], legend='2011')
p.line(df['Week #'], df['2012'], color=Blues8[2], legend='2012')
p.line(df['Week #'], df['2013'], color=Blues8[1], legend='2013')
p.line(df['Week #'], df['2014'], color=Blues8[0], legend='2014')

p.legend.location = "top_left"

output_file("line.html")

show(p)