In [1]:
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version

print(f"IPython - {ipython_version}")
print(f"Pandas - {pandas_version}")
print(f"Bokeh - {bokeh_version}")


IPython - 6.1.0
Pandas - 0.22.0
Bokeh - 0.12.14


In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, date


<H2>Stage Ranking codes</H2>

In [4]:
stgrnk = pd.read_csv("VWSTAGERANKING.csv").drop(
    ["code_table", "MEDIUM_DESC", "Converted/Confirmed/Accepted/Require SepDate"],
    axis=1,
)

print("stgrnk", stgrnk.shape)
print("stgrnk\n", stgrnk.dtypes)


stgrnk (53, 7)
stgrnk
 STAGERANKING_ID     int64
field_name         object
field_value        object
rank                int64
short_desc         object
Canceled           object
status             object
dtype: object


In [None]:
print(stgrnk.columns)


In [5]:
stgrnk


Unnamed: 0,STAGERANKING_ID,field_name,field_value,rank,short_desc,Canceled,status
0,1,Application Decision,ACC,1,Accepted,,A
1,3,Application Decision,CREV,3,CommReview,,I
2,4,Application Decision,DENY,4,Denied,,A
3,5,Application Decision,FULL,5,Full Admit,,I
4,6,Application Decision,PROV,6,ProvAdmit,,I
5,7,Application Decision,WAIT,7,Wait List,,A
6,8,Application Status,300,8,Applied,N,A
7,9,Application Status,400,9,Accepted,N,I
8,10,Application Status,500,10,Deposited,N,A
9,11,Application Status,600,11,Enrolled,N,I


<H2>Stage History data</H2>

In [6]:
stg_hist_dtype = {
    "PEOPLE_CODE_ID": str,
    "ACADEMIC_YEAR": str,
    "ACADEMIC_TERM": str,
    "ACADEMIC_SESSION": str,
    "FIELD_ID": np.int64,
}
date_cols = ["FIELD_DATE", "REVISION_DATE", "REVISION_TIME"]
stg_hist = pd.read_csv(
    "STAGEHISTORY.csv",
    dtype=stg_hist_dtype,
    parse_dates=date_cols,
    usecols=[
        "PEOPLE_CODE_ID",
        "ACADEMIC_YEAR",
        "ACADEMIC_TERM",
        "ACADEMIC_SESSION",
        "FIELD_ID",
        "FIELD_DATE",
        "REVISION_DATE",
        "REVISION_TIME",
    ],
)

print("stg_hist", stg_hist.shape)
print("stg_hist")
print(stg_hist.dtypes)


stg_hist (279835, 8)
stg_hist
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
FIELD_DATE          datetime64[ns]
REVISION_DATE       datetime64[ns]
REVISION_TIME       datetime64[ns]
dtype: object


In [7]:
sd_df = pd.merge(
    stg_hist, stgrnk, left_on=["FIELD_ID"], right_on=["STAGERANKING_ID"], how="left"
)

print("sd_df", sd_df.shape)
print("sd_df")
print(sd_df.dtypes)


sd_df (279835, 15)
sd_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
FIELD_DATE          datetime64[ns]
REVISION_DATE       datetime64[ns]
REVISION_TIME       datetime64[ns]
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
dtype: object


In [8]:
def revision(df):
    return pd.datetime.combine(df["REVISION_DATE"].date(), df["REVISION_TIME"].time())


In [9]:
sd_df["Year_Term"] = sd_df["ACADEMIC_YEAR"] + "." + sd_df["ACADEMIC_TERM"].str.title()

# sd_df['Revision'] = sd_df.apply(lambda r : pd.datetime.combine(r['REVISION_DATE'].date(),r['REVISION_TIME'].time()),1)
sd_df["Revision"] = sd_df.apply(revision, axis=1)
sd_df["Week_Number"] = sd_df["Revision"].dt.week

sd_df = sd_df.drop(["FIELD_DATE", "REVISION_DATE", "REVISION_TIME"], axis=1).loc[
    (sd_df["ACADEMIC_TERM"].isin(["FALL", "SPRING"]))
    & (sd_df["ACADEMIC_SESSION"] == "MAIN")
]

print("sd_df", sd_df.shape)
print("sd_df")
print(sd_df.dtypes)


sd_df (148631, 15)
sd_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
Year_Term                   object
Revision            datetime64[ns]
Week_Number                  int64
dtype: object


In [10]:
print("sd_df", sd_df.shape)
print("sd_df")
print(sd_df.dtypes)


sd_df (148631, 15)
sd_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
Year_Term                   object
Revision            datetime64[ns]
Week_Number                  int64
dtype: object


In [11]:
def week_number(df):
    if df["Week_Number"] > (date(int(df["ACADEMIC_YEAR"]), 9, 1).isocalendar()[1]):
        return df["Week_Number"] - (
            date(int(df["ACADEMIC_YEAR"]), 9, 1).isocalendar()[1]
        )
    else:
        return (
            53
            + df["Week_Number"]
            - (date(int(df["ACADEMIC_YEAR"]), 9, 1).isocalendar()[1])
        )


In [12]:
sd_df = sd_df[
    pd.to_numeric(sd_df["ACADEMIC_YEAR"], errors="coerce", downcast="integer").notnull()
]

sd_df["Admissions_Week"] = sd_df.apply(week_number, axis=1)

print("sd_df", sd_df.shape)
print("sd_df")
print(sd_df.dtypes)
sd_df.head()


sd_df (148631, 16)
sd_df
PEOPLE_CODE_ID              object
ACADEMIC_YEAR               object
ACADEMIC_TERM               object
ACADEMIC_SESSION            object
FIELD_ID                     int64
STAGERANKING_ID              int64
field_name                  object
field_value                 object
rank                         int64
short_desc                  object
Canceled                    object
status                      object
Year_Term                   object
Revision            datetime64[ns]
Week_Number                  int64
Admissions_Week              int64
dtype: object


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,FIELD_ID,STAGERANKING_ID,field_name,field_value,rank,short_desc,Canceled,status,Year_Term,Revision,Week_Number,Admissions_Week
15,P000000006,2000,SPRING,MAIN,10,10,Application Status,500,10,Deposited,N,A,2000.Spring,2006-10-26 16:41:53,43,8
16,P000000006,2000,SPRING,MAIN,1,1,Application Decision,ACC,1,Accepted,,A,2000.Spring,2006-10-26 16:41:53,43,8
17,P000000006,2000,SPRING,MAIN,15,15,Enrolled/Separated,ENRL,15,Enrolled,,A,2000.Spring,2006-10-26 16:41:53,43,8
18,P000000006,2000,SPRING,MAIN,18,18,Enrolled/Separated,WITH,18,Withdrawn,,A,2000.Spring,2006-12-21 16:53:25,51,16
19,P000000006,2000,SPRING,MAIN,15,15,Enrolled/Separated,ENRL,15,Enrolled,,A,2000.Spring,2006-12-21 16:53:25,51,16


In [13]:
adm_keep_values = [
    "300",
    "ACC",
    "ACXL",
    "CANC",
    "DEF",
    "DEFR",
    "DENY",
    "DPAC",
    "TRDP",
    "TRPD",
    "TRNS",
    "WAIT",
]
adm_keep_cols = [
    "PEOPLE_CODE_ID",
    "Year_Term",
    "Admissions_Week",
    "field_value",
    "status",
]
sd_df = sd_df.loc[(sd_df["field_value"].isin(adm_keep_values))]
sd_df = sd_df[adm_keep_cols]

print("sd_df", sd_df.shape)
print("sd_df\n", sd_df.dtypes)
sd_df.head()


sd_df (28719, 5)
sd_df
 PEOPLE_CODE_ID     object
Year_Term          object
Admissions_Week     int64
field_value        object
status             object
dtype: object


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,field_value,status
16,P000000006,2000.Spring,8,ACC,A
36,P000000007,2003.Spring,7,ACC,A
42,P000000013,2004.Fall,7,ACC,A
76,P000000016,2000.Spring,8,ACC,A
258,P000000017,2001.Spring,8,ACC,A


In [14]:
sd_df[
    (sd_df["Year_Term"] == "2014.Fall")
    & (sd_df["PEOPLE_CODE_ID"] == "P000026232")
    & (sd_df["field_value"] == "DPAC")
].head(30)


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,field_value,status
191632,P000026232,2014.Fall,36,DPAC,A


In [15]:
sd_df[
    (sd_df["Year_Term"] == "2014.Fall") & (sd_df["PEOPLE_CODE_ID"] == "P000026232")
].head(30)


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,field_value,status
191627,P000026232,2014.Fall,21,300,A
191630,P000026232,2014.Fall,22,ACC,A
191632,P000026232,2014.Fall,36,DPAC,A


<H2>Academic data</H2>

In [27]:
academic_dtype = {
    "PEOPLE_CODE_ID": str,
    "ACADEMIC_YEAR": str,
    "ACADEMIC_TERM": str,
    "ACADEMIC_SESSION": str,
    "APPLICATION_FLAG": str,
    "APP_STATUS": str,
}
date_cols = [
    "APPLICATION_DATE",
    "APP_STATUS_DATE",
    "APP_DECISION_DATE",
    "REVISION_DATE",
    "REVISION_TIME",
]
academic = pd.read_csv(
    "ACADEMIC.csv",
    dtype=academic_dtype,
    parse_dates=date_cols,
    usecols=[
        "PEOPLE_CODE_ID",
        "ACADEMIC_YEAR",
        "ACADEMIC_TERM",
        "ACADEMIC_SESSION",
        "POPULATION",
        "INQUIRY_FLAG",
        "APPLICATION_FLAG",
        "APPLICATION_DATE",
        "APP_STATUS",
        "APP_STATUS_DATE",
        "APP_DECISION",
        "APP_DECISION_DATE",
        "REVISION_DATE",
        "REVISION_TIME",
    ],
)

print("academic", academic.shape)
print("academic")
print(academic.dtypes)
academic.info()


academic (122300, 14)
academic
PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
INQUIRY_FLAG                 object
APPLICATION_DATE     datetime64[ns]
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122300 entries, 0 to 122299
Data columns (total 14 columns):
PEOPLE_CODE_ID       122300 non-null object
ACADEMIC_YEAR        122159 non-null object
ACADEMIC_TERM        121990 non-null object
ACADEMIC_SESSION     73525 non-null object
POPULATION           13257 non-null object
REVISION_DATE        122300 non-null datetime64[ns]
REVISION_TIME        122300 non-null datetime64[ns]
APPLICATION_FLAG  

In [28]:
print(academic["POPULATION"].value_counts().sort_index())
print(academic["POPULATION"].value_counts().sum())


          3967
ACE        554
ADVSTU    8372
NOND       344
TBC          6
TRNSTN      14
Name: POPULATION, dtype: int64
13257


In [29]:
app_data = academic.loc[
    ~(academic["POPULATION"].isin(["ADVSTU", "NOND"]))
    & ((academic["INQUIRY_FLAG"] == "Y") | (academic["APPLICATION_FLAG"] == "Y"))
]
print("app_data", app_data.shape)
print("app_data")
print(app_data.dtypes)

applied = (
    app_data[app_data["APP_STATUS"].notnull()]
    .rename(columns={"APP_STATUS": "field_value"})
    .rename(columns={"APP_STATUS_DATE": "Revision"})
)
applied.loc[:, "field_name"] = "Application Status"
print("applied", applied.shape)
print("applied")
print(applied.dtypes)

accepted = (
    app_data[app_data["APP_DECISION"].notnull()]
    .rename(columns={"APP_DECISION": "field_value"})
    .rename(columns={"APP_DECISION_DATE": "Revision"})
)
accepted.loc[:, "field_name"] = "Application Decision"
print("accepted", accepted.shape)
print("accepted")
print(accepted.dtypes)


app_data (42151, 14)
app_data
PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
INQUIRY_FLAG                 object
APPLICATION_DATE     datetime64[ns]
dtype: object
applied (23149, 15)
applied
PEOPLE_CODE_ID               object
ACADEMIC_YEAR                object
ACADEMIC_TERM                object
ACADEMIC_SESSION             object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
APPLICATION_FLAG             object
field_value                  object
Revision             datetime64[ns]
APP_DECISION                 object
APP_DECISION_DATE    datetim

In [53]:
# stack Stage History, Academic Applied and Academic Accepted
# adm_df = stage_data.append(applied).append(accepted)
adm_df = applied.append(accepted)

print("adm_df", adm_df.shape)
print("adm_df")
print(adm_df.dtypes)


adm_df (46202, 17)
adm_df
ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
REVISION_DATE        datetime64[ns]
REVISION_TIME        datetime64[ns]
Revision             datetime64[ns]
field_name                   object
field_value                  object
dtype: object


In [54]:
adm_df.head()


Unnamed: 0,ACADEMIC_SESSION,ACADEMIC_TERM,ACADEMIC_YEAR,APPLICATION_DATE,APPLICATION_FLAG,APP_DECISION,APP_DECISION_DATE,APP_STATUS,APP_STATUS_DATE,INQUIRY_FLAG,PEOPLE_CODE_ID,POPULATION,REVISION_DATE,REVISION_TIME,Revision,field_name,field_value
0,,SPRING,2000,NaT,Y,ACC,NaT,,NaT,Y,P000000006,,2010-07-16,1900-01-01 09:49:37.943,NaT,Application Status,500
1,MAIN,SPRING,2000,NaT,Y,ACC,NaT,,NaT,Y,P000000006,,2007-06-19,1900-01-01 11:57:40.000,NaT,Application Status,500
4,,SPRING,2003,NaT,Y,ACC,NaT,,NaT,Y,P000000007,,2010-07-16,1900-01-01 09:49:38.317,NaT,Application Status,500
5,MAIN,SPRING,2003,NaT,Y,ACC,NaT,,NaT,Y,P000000007,,2007-05-15,1900-01-01 17:20:07.000,NaT,Application Status,500
6,,FALL,2004,NaT,Y,ACC,NaT,,NaT,Y,P000000013,,2010-07-16,1900-01-01 09:49:38.683,NaT,Application Status,500


In [33]:
adm_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 46202 entries, 0 to 122299
Data columns (total 17 columns):
ACADEMIC_SESSION     29170 non-null object
ACADEMIC_TERM        46202 non-null object
ACADEMIC_YEAR        46200 non-null object
APPLICATION_DATE     28654 non-null datetime64[ns]
APPLICATION_FLAG     46202 non-null object
APP_DECISION         23053 non-null object
APP_DECISION_DATE    14296 non-null datetime64[ns]
APP_STATUS           23053 non-null object
APP_STATUS_DATE      14291 non-null datetime64[ns]
INQUIRY_FLAG         46202 non-null object
PEOPLE_CODE_ID       46202 non-null object
POPULATION           2875 non-null object
REVISION_DATE        46202 non-null datetime64[ns]
REVISION_TIME        46202 non-null datetime64[ns]
Revision             28677 non-null datetime64[ns]
field_name           46202 non-null object
field_value          46202 non-null object
dtypes: datetime64[ns](6), object(11)
memory usage: 6.3+ MB


In [56]:
# new columns
adm_df["Year_Term"] = (
    adm_df["ACADEMIC_YEAR"] + "." + adm_df["ACADEMIC_TERM"].str.title()
)

adm_df["Revision"] = adm_df.apply(revision, 1)
adm_df["Week_Number"] = adm_df["Revision"].dt.week

adm_df = adm_df.drop(["REVISION_DATE", "REVISION_TIME"], axis=1).loc[
    (adm_df["ACADEMIC_TERM"].isin(["FALL", "SPRING"]))
    & (adm_df["ACADEMIC_SESSION"] == "MAIN")
]

print("adm_df", adm_df.shape)
print("adm_df")
print(adm_df.dtypes)


adm_df (28492, 17)
adm_df
ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
Revision             datetime64[ns]
field_name                   object
field_value                  object
Year_Term                    object
Week_Number                   int64
dtype: object


In [57]:
adm_df[
    (adm_df["Year_Term"] == "2014.Fall") & (adm_df["PEOPLE_CODE_ID"] == "P000026232")
].head(30)


Unnamed: 0,ACADEMIC_SESSION,ACADEMIC_TERM,ACADEMIC_YEAR,APPLICATION_DATE,APPLICATION_FLAG,APP_DECISION,APP_DECISION_DATE,APP_STATUS,APP_STATUS_DATE,INQUIRY_FLAG,PEOPLE_CODE_ID,POPULATION,Revision,field_name,field_value,Year_Term,Week_Number
73996,MAIN,FALL,2014,2014-01-24,Y,DPAC,2014-05-05,,NaT,Y,P000026232,,2014-10-16 14:05:18.067,Application Status,500,2014.Fall,42
73996,MAIN,FALL,2014,2014-01-24,Y,,NaT,500.0,2014-05-05,Y,P000026232,,2014-10-16 14:05:18.067,Application Decision,DPAC,2014.Fall,42


In [37]:
adm_df = adm_df[
    pd.to_numeric(
        adm_df["ACADEMIC_YEAR"], errors="coerce", downcast="integer"
    ).notnull()
]

adm_df["Admissions_Week"] = adm_df.apply(week_number, axis=1)

print("adm_df", adm_df.shape)
print("adm_df")
print(adm_df.dtypes)


adm_df (28492, 18)
adm_df
ACADEMIC_SESSION             object
ACADEMIC_TERM                object
ACADEMIC_YEAR                object
APPLICATION_DATE     datetime64[ns]
APPLICATION_FLAG             object
APP_DECISION                 object
APP_DECISION_DATE    datetime64[ns]
APP_STATUS                   object
APP_STATUS_DATE      datetime64[ns]
INQUIRY_FLAG                 object
PEOPLE_CODE_ID               object
POPULATION                   object
Revision             datetime64[ns]
field_name                   object
field_value                  object
Year_Term                    object
Week_Number                   int64
Admissions_Week               int64
dtype: object


In [38]:
adm_df.head()


Unnamed: 0,ACADEMIC_SESSION,ACADEMIC_TERM,ACADEMIC_YEAR,APPLICATION_DATE,APPLICATION_FLAG,APP_DECISION,APP_DECISION_DATE,APP_STATUS,APP_STATUS_DATE,INQUIRY_FLAG,PEOPLE_CODE_ID,POPULATION,Revision,field_name,field_value,Year_Term,Week_Number,Admissions_Week
1,MAIN,SPRING,2000,NaT,Y,ACC,NaT,,NaT,Y,P000000006,,2007-06-19 11:57:40,Application Status,500,2000.Spring,25,43
5,MAIN,SPRING,2003,NaT,Y,ACC,NaT,,NaT,Y,P000000007,,2007-05-15 17:20:07,Application Status,500,2003.Spring,20,37
7,MAIN,FALL,2004,NaT,Y,ACC,NaT,,NaT,Y,P000000013,,2008-09-16 10:33:00,Application Status,500,2004.Fall,38,2
13,MAIN,SPRING,2000,NaT,Y,ACC,NaT,,NaT,Y,P000000016,,2007-06-25 15:58:31,Application Status,500,2000.Spring,26,44
39,MAIN,SPRING,2001,NaT,Y,ACC,NaT,,NaT,Y,P000000017,,2007-05-15 17:20:12,Application Status,500,2001.Spring,20,38


In [39]:
adm_df.columns


Index(['ACADEMIC_SESSION', 'ACADEMIC_TERM', 'ACADEMIC_YEAR',
       'APPLICATION_DATE', 'APPLICATION_FLAG', 'APP_DECISION',
       'APP_DECISION_DATE', 'APP_STATUS', 'APP_STATUS_DATE', 'INQUIRY_FLAG',
       'PEOPLE_CODE_ID', 'POPULATION', 'Revision', 'field_name', 'field_value',
       'Year_Term', 'Week_Number', 'Admissions_Week'],
      dtype='object')

In [40]:
adm_df["ACADEMIC_YEAR"] == 2014 & (adm_df["PEOPLE_CODE_ID"] == "P000026232")


1         False
5         False
7         False
13        False
39        False
41        False
49        False
66        False
75        False
83        False
85        False
95        False
99        False
105       False
109       False
111       False
129       False
149       False
156       False
169       False
177       False
188       False
202       False
204       False
214       False
216       False
222       False
232       False
240       False
242       False
          ...  
122270    False
122271    False
122272    False
122273    False
122274    False
122275    False
122276    False
122277    False
122278    False
122279    False
122280    False
122281    False
122282    False
122283    False
122284    False
122285    False
122286    False
122287    False
122288    False
122289    False
122290    False
122291    False
122292    False
122293    False
122294    False
122295    False
122296    False
122297    False
122298    False
122299    False
Length: 28492, dtype: bo

In [41]:
print(adm_df["field_value"].value_counts().sort_index())
print(adm_df["field_value"].value_counts().sum())


300     1516
500     8189
ACC     9503
ACXL    1704
CANC    2711
COMP     152
DEF       10
DEFR     184
DENY      39
DPAC    2087
PEND    1843
TRDP     276
TRNS     272
nosh       6
Name: field_value, dtype: int64
28492


In [None]:
print(adm_df["short_desc"].value_counts().sort_index())
print(adm_df["short_desc"].value_counts().sum())


In [None]:
adm_df[(adm_df["ACADEMIC_YEAR"] == 2014) & (adm_df["short_desc"] == "Deposited")].head(
    30
)


In [46]:
adm_keep_values = [
    "300",
    "ACC",
    "ACXL",
    "CANC",
    "DEF",
    "DEFR",
    "DENY",
    "DPAC",
    "TRDP",
    "TRPD",
    "TRNS",
    "WAIT",
]
adm_keep_cols = ["PEOPLE_CODE_ID", "Year_Term", "Admissions_Week", "field_value"]
adm_df = adm_df.loc[(adm_df["field_value"].isin(adm_keep_values))]
adm_df = adm_df[adm_keep_cols]

print("adm_df", adm_df.shape)
print("adm_df")
print(adm_df.dtypes)


adm_df (18302, 4)
adm_df
PEOPLE_CODE_ID     object
Year_Term          object
Admissions_Week     int64
field_value        object
dtype: object


In [47]:
adm_df.head()


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,field_value
404,P000000507,2007.Fall,42,300
2650,P000001732,2007.Fall,45,300
4142,P000002346,2007.Fall,50,300
6583,P000003967,2004.Spring,53,ACXL
6941,P000004234,2003.Fall,20,CANC


In [52]:
adm_df[
    (adm_df["Year_Term"] == "2014.Fall") & (adm_df["PEOPLE_CODE_ID"] == "P000026232")
].head(30)


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,field_value
73996,P000026232,2014.Fall,6,DPAC


In [49]:
admission_status = {
    "300": "Applied",
    "ACC": "Accepted",
    "ACXL": "Canceled",
    "CANC": "Canceled",
    "DEF": "Canceled",
    "DEFR": "Canceled",
    "DENY": "Canceled",
    "DPAC": "Deposited",
    "TRDP": "Deposited",
    "TRPD": "Deposited",
    "TRNS": "Accepted",
    "WAIT": "Accepted",
}
adm_stat = pd.DataFrame(
    list(admission_status.items()), columns=["field_value", "admission_status"]
)

adm_df1 = (
    pd.merge(adm_df, adm_stat, on=["field_value"], how="left")
    .drop(["field_value"], axis=1)
    .drop_duplicates(
        ["PEOPLE_CODE_ID", "Year_Term", "Admissions_Week", "admission_status"]
    )
)

print("adm_df1", adm_df1.shape)
print("adm_df1")
print(adm_df1.dtypes)


adm_df1 (18075, 4)
adm_df1
PEOPLE_CODE_ID      object
Year_Term           object
Admissions_Week      int64
admission_status    object
dtype: object


In [50]:
adm_df1.head()


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,admission_status
0,P000000507,2007.Fall,42,Applied
1,P000001732,2007.Fall,45,Applied
2,P000002346,2007.Fall,50,Applied
3,P000003967,2004.Spring,53,Canceled
4,P000004234,2003.Fall,20,Canceled


In [51]:
adm_df1[
    (adm_df1["Year_Term"] == "2014.Fall") & (adm_df1["PEOPLE_CODE_ID"] == "P000026232")
].head(30)


Unnamed: 0,PEOPLE_CODE_ID,Year_Term,Admissions_Week,admission_status
12868,P000026232,2014.Fall,6,Deposited


In [None]:
adm_df2 = (
    adm_df1.loc[(adm_df1["Year_Term"].isin(["2014.Fall", "2015.Fall"]))]
    .sort_values(["Year_Term", "PEOPLE_CODE_ID", "Admissions_Week"])
    .drop_duplicates(["Year_Term", "PEOPLE_CODE_ID", "admission_status"], keep="first")
    .reset_index()
    .set_index(["Year_Term", "PEOPLE_CODE_ID", "admission_status"])
    .drop(["index"], axis=1)
    .unstack(level=-1)
)

print("adm_df2", adm_df2.shape)
print("adm_df2\n", adm_df2.dtypes)


In [None]:
adm_df2.head()
