In [1]:
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version
print(f'IPython - {ipython_version}')
print(f'Pandas - {pandas_version}')
print(f'Bokeh - {bokeh_version}')

IPython - 6.1.0
Pandas - 0.22.0
Bokeh - 0.12.14


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date

<H2>Stage Ranking codes</H2>

In [3]:
# read VWSTAGERANKING.csv and drop unused fields
stgrnk = pd.read_csv('VWSTAGERANKING.csv')
stgrnk.drop(['code_table', 'MEDIUM_DESC',
             'Converted/Confirmed/Accepted/Require SepDate'],
            inplace=True, axis=1)

In [4]:
print('stgrnk', stgrnk.shape)
print('stgrnk\n', stgrnk.dtypes)

stgrnk (53, 7)
stgrnk
 STAGERANKING_ID     int64
field_name         object
field_value        object
rank                int64
short_desc         object
Canceled           object
status             object
dtype: object


In [5]:
print(stgrnk.columns)

Index(['STAGERANKING_ID', 'field_name', 'field_value', 'rank', 'short_desc',
       'Canceled', 'status'],
      dtype='object')


In [6]:
stgrnk

Unnamed: 0,STAGERANKING_ID,field_name,field_value,rank,short_desc,Canceled,status
0,1,Application Decision,ACC,1,Accepted,,A
1,3,Application Decision,CREV,3,CommReview,,I
2,4,Application Decision,DENY,4,Denied,,A
3,5,Application Decision,FULL,5,Full Admit,,I
4,6,Application Decision,PROV,6,ProvAdmit,,I
5,7,Application Decision,WAIT,7,Wait List,,A
6,8,Application Status,300,8,Applied,N,A
7,9,Application Status,400,9,Accepted,N,I
8,10,Application Status,500,10,Deposited,N,A
9,11,Application Status,600,11,Enrolled,N,I


<H2>Stage History data</H2>

In [7]:
# read STAGEHISTORY.csv
stg_hist_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str,
                  'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str,
                  'FIELD_ID': np.int64}
date_cols = ['FIELD_DATE', 'REVISION_DATE', 'REVISION_TIME']
stg_hist = pd.read_csv('STAGEHISTORY.csv', dtype=stg_hist_dtype,
                       parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID', 'ACADEMIC_YEAR',
                                'ACADEMIC_TERM', 'ACADEMIC_SESSION',
                                'FIELD_ID', 'FIELD_DATE', 'REVISION_DATE',
                                'REVISION_TIME'])

In [8]:
print('stg_hist', stg_hist.shape)
print('stg_hist')
print(stg_hist.dtypes)

stg_hist (279835, 8)
stg_hist
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
FIELD_DATE          datetime64[ns]
REVISION_DATE       datetime64[ns]
REVISION_TIME       datetime64[ns]
dtype: object


In [9]:
# create new fields
stg_hist['year_term'] = (stg_hist['ACADEMIC_YEAR'] + '.' +
                         stg_hist['ACADEMIC_TERM'].str.title())

In [10]:
# create Revision datetime
stg_hist['Revision'] = stg_hist.apply(lambda r: pd.datetime.combine(
                                      r['REVISION_DATE'].date(),
                                      r['REVISION_TIME'].time()), 1)

In [11]:
# convert ACADEMIC_YEAR to numeric keep numeric-valued records
stg_hist['ACADEMIC_YEAR'] = pd.to_numeric(stg_hist['ACADEMIC_YEAR'],
                                          errors='coerce', downcast='integer')
stg_hist = stg_hist.loc[pd.to_numeric(stg_hist['ACADEMIC_YEAR'],
                                      errors='coerce',
                                      downcast='integer'
                                      ).notnull()]

In [12]:
# drop unused fields
stg_hist.drop(['FIELD_DATE', 'REVISION_DATE', 'REVISION_TIME'],
              inplace=True, axis=1)

In [13]:
# calculate new fields
stg_hist['Week_Number'] = stg_hist['Revision'].dt.week
stg_hist['Admissions_Week'] = stg_hist.apply(lambda r: (r['Week_Number'] -
                                                        (date(int(
                                                         r['ACADEMIC_YEAR']),
                                                            9, 1
                                                        ).isocalendar()[1])
                                                        )
                                             if (r['Week_Number'] >
                                                 (date(int(r['ACADEMIC_YEAR']),
                                                       9, 1).isocalendar()[1]))
                                             else (53 + r['Week_Number'] -
                                                   (date(int(
                                                       r['ACADEMIC_YEAR']),
                                                    9, 1).isocalendar()[1])),
                                             axis=1)

In [14]:
stage_data = pd.merge(stg_hist, stgrnk, left_on=['FIELD_ID'],
                      right_on=['STAGERANKING_ID'], how='left')

In [15]:
print('stage_data', stage_data.shape)
print('stage_data')
print(stage_data.dtypes)
stage_data.head()

stage_data (279683, 16)
stage_data
PEOPLE_CODE_ID              object
ACADEMIC_YEAR              float64
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
year_term                   object
Revision            datetime64[ns]
Week_Number                  int64
Admissions_Week              int64
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
dtype: object


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,FIELD_ID,year_term,Revision,Week_Number,Admissions_Week,STAGERANKING_ID,field_name,field_value,rank,short_desc,Canceled,status
0,P000000006,2000.0,SPRING,,15,2000.Spring,2006-10-26 16:41:53,43,8,15,Enrolled/Separated,ENRL,15,Enrolled,,A
1,P000000006,2000.0,SPRING,,10,2000.Spring,2006-10-26 16:41:53,43,8,10,Application Status,500,10,Deposited,N,A
2,P000000006,2000.0,SPRING,,1,2000.Spring,2006-10-26 16:41:53,43,8,1,Application Decision,ACC,1,Accepted,,A
3,P000000006,2000.0,SPRING,,18,2000.Spring,2006-12-21 16:53:25,51,16,18,Enrolled/Separated,WITH,18,Withdrawn,,A
4,P000000006,2000.0,SPRING,,15,2000.Spring,2006-12-21 16:53:25,51,16,15,Enrolled/Separated,ENRL,15,Enrolled,,A


In [17]:
ad_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR', 'DENY', 'DPAC',
                  'TRDP', 'TRPD', 'TRNS', 'WAIT']
ad_keep_cols = ['PEOPLE_CODE_ID', 'year_term', 'Admissions_Week',
                'field_value', 'status']
sd1 = stage_data.loc[(stage_data['field_value'].isin(ad_keep_values))]
sd1 = sd1[ad_keep_cols]

In [19]:
print('sd1', sd1.shape)
print('sd1')
print(sd1.dtypes)
sd1.head()

sd1 (37676, 5)
sd1
PEOPLE_CODE_ID     object
year_term          object
Admissions_Week     int64
field_value        object
status             object
dtype: object


Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,field_value,status
2,P000000006,2000.Spring,8,ACC,A
16,P000000006,2000.Spring,8,ACC,A
34,P000000007,2003.Spring,7,ACC,A
36,P000000007,2003.Spring,7,ACC,A
40,P000000013,2004.Fall,7,ACC,A


In [None]:
sd_df[(sd_df['Year_Term']=='2014.Fall') & (sd_df['PEOPLE_CODE_ID']=='P000026232') & (sd_df['field_value']=='DPAC')].head(30)

In [None]:
sd_df[(sd_df['Year_Term']=='2014.Fall') & (sd_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

<H2>Academic data</H2>

In [None]:
academic_dtype = {'PEOPLE_CODE_ID': str, 'ACADEMIC_YEAR': str, 'ACADEMIC_TERM': str, 'ACADEMIC_SESSION': str, 'APPLICATION_FLAG': str, 'APP_STATUS': str}
date_cols = ['APPLICATION_DATE', 'APP_STATUS_DATE', 'APP_DECISION_DATE', 'REVISION_DATE', 'REVISION_TIME']
academic = pd.read_csv('ACADEMIC.csv', dtype=academic_dtype, parse_dates=date_cols,
                       usecols=['PEOPLE_CODE_ID','ACADEMIC_YEAR','ACADEMIC_TERM','ACADEMIC_SESSION','POPULATION','INQUIRY_FLAG','APPLICATION_FLAG','APPLICATION_DATE', 'APP_STATUS', 'APP_STATUS_DATE', 'APP_DECISION', 'APP_DECISION_DATE', 'REVISION_DATE', 'REVISION_TIME'])

print('academic', academic.shape)
print('academic')
print(academic.dtypes)
academic.info()

In [None]:
print(academic['POPULATION'].value_counts().sort_index())
print(academic['POPULATION'].value_counts().sum())

In [None]:
app_data = academic.loc[~(academic['POPULATION'].isin(['ADVSTU', 'NOND'])) & ((academic['INQUIRY_FLAG']=='Y') | (academic['APPLICATION_FLAG']=='Y'))]
print('app_data', app_data.shape)
print('app_data')
print(app_data.dtypes)

applied = (app_data[app_data['APP_STATUS'].notnull()]
           .rename(columns={'APP_STATUS': 'field_value'})
           .rename(columns={'APP_STATUS_DATE': 'Revision'})
          )
applied.loc[:, 'field_name'] = 'Application Status'
print('applied', applied.shape)
print('applied')
print(applied.dtypes)

accepted = (app_data[app_data['APP_DECISION'].notnull()]
            .rename(columns={'APP_DECISION': 'field_value'})
            .rename(columns={'APP_DECISION_DATE': 'Revision'})
           )
accepted.loc[:, 'field_name'] = 'Application Decision'
print('accepted', accepted.shape)
print('accepted')
print(accepted.dtypes)


In [None]:
# stack Stage History, Academic Applied and Academic Accepted
#adm_df = stage_data.append(applied).append(accepted)
adm_df = applied.append(accepted)

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
adm_df.head()

In [None]:
adm_df.info()

In [None]:
# new columns
adm_df['Year_Term'] = adm_df['ACADEMIC_YEAR'] + '.' + adm_df['ACADEMIC_TERM'].str.title()

adm_df['Revision'] = adm_df.apply(revision,1)
adm_df['Week_Number'] = adm_df['Revision'].dt.week

adm_df = (adm_df.drop(['REVISION_DATE','REVISION_TIME'], axis=1)
          .loc[(adm_df['ACADEMIC_TERM'].isin(['FALL', 'SPRING'])) & (adm_df['ACADEMIC_SESSION'] == 'MAIN')]
         )

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
adm_df[(adm_df['Year_Term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
adm_df = adm_df[pd.to_numeric(adm_df['ACADEMIC_YEAR'], errors='coerce', downcast='integer').notnull()]

adm_df['Admissions_Week'] = adm_df.apply(week_number, axis=1)

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
adm_df.head()

In [None]:
adm_df.columns

In [None]:
adm_df['ACADEMIC_YEAR']==2014 & (adm_df['PEOPLE_CODE_ID']=='P000026232') 


In [None]:
print(adm_df['field_value'].value_counts().sort_index())
print(adm_df['field_value'].value_counts().sum())

In [None]:
print(adm_df['short_desc'].value_counts().sort_index())
print(adm_df['short_desc'].value_counts().sum())

In [None]:
adm_df[(adm_df['ACADEMIC_YEAR']==2014) & (adm_df['short_desc']=='Deposited')].head(30)

In [None]:
adm_keep_values = ['300', 'ACC', 'ACXL', 'CANC', 'DEF', 'DEFR', 'DENY', 'DPAC',
                  'TRDP', 'TRPD', 'TRNS', 'WAIT']
adm_keep_cols = ['PEOPLE_CODE_ID', 'Year_Term', 'Admissions_Week',
                'field_value']
adm_df = adm_df.loc[(adm_df['field_value'].isin(adm_keep_values))]
adm_df = adm_df[adm_keep_cols]

print('adm_df', adm_df.shape)
print('adm_df')
print(adm_df.dtypes)

In [None]:
adm_df.head()

In [None]:
adm_df[(adm_df['Year_Term']=='2014.Fall') & (adm_df['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
admission_status = {'300' : 'Applied', 'ACC': 'Accepted', 'ACXL': 'Canceled', 'CANC': 'Canceled', 'DEF': 'Canceled', 'DEFR': 'Canceled', 'DENY': 'Canceled', 'DPAC': 'Deposited', 'TRDP': 'Deposited', 'TRPD': 'Deposited', 'TRNS': 'Accepted', 'WAIT': 'Accepted'}
adm_stat = pd.DataFrame(list(admission_status.items()), columns=['field_value', 'admission_status'])

adm_df1 = (pd.merge(adm_df, adm_stat, on=['field_value'], how='left' )
           .drop(['field_value'], axis=1)
           .drop_duplicates(['PEOPLE_CODE_ID', 'Year_Term', 'Admissions_Week', 'admission_status'])
          )

print('adm_df1', adm_df1.shape)
print('adm_df1')
print(adm_df1.dtypes)

In [None]:
adm_df1.head()

In [None]:
adm_df1[(adm_df1['Year_Term']=='2014.Fall') & (adm_df1['PEOPLE_CODE_ID']=='P000026232')].head(30)

In [None]:
adm_df2 = (adm_df1.loc[(adm_df1['Year_Term'].isin(['2014.Fall', '2015.Fall']))]
           .sort_values(['Year_Term', 'PEOPLE_CODE_ID', 'Admissions_Week'])
           .drop_duplicates(['Year_Term', 'PEOPLE_CODE_ID', 'admission_status'], keep='first')
           .reset_index()
           .set_index(['Year_Term', 'PEOPLE_CODE_ID', 'admission_status'])
           .drop(['index'], axis=1)
           .unstack(level=-1)
          )

print('adm_df2', adm_df2.shape)
print('adm_df2\n', adm_df2.dtypes)

In [None]:
adm_df2.head()