In [1]:
from numpy import __version__ as numpy_version
from pandas import __version__ as pandas_version

# Record the library versions this notebook was executed with.
print(f"Numpy - {numpy_version}", f"Pandas - {pandas_version}", sep="\n")


Numpy - 1.16.4
Pandas - 0.24.2


In [2]:
import numpy as np
import pandas as pd
#import altair as alt
import datetime as dt
from pathlib import Path


In [3]:
# local connection information
# The connection is built by the project-local `local_db` module rather than
# being spelled out in this notebook.
import local_db

# `connection` is the handle handed to the pd.read_sql_query calls below.
connection = local_db.connection()


In [4]:
# Take a single timestamp so the "Last Run" banner and the `today` stamp can
# never disagree (two separate now() calls could straddle midnight).
run_started = dt.datetime.now()
print(f"Last Run: {run_started}")

# Compact YYYYMMDD date stamp for this run.
today = run_started.strftime("%Y%m%d")
# Academic year under review; interpolated into the SQL filters below.
year = 2019

# Term label in "<year>.Fall" form; compared against `year_term` further down.
fall_term = f"{year}.Fall"
print(fall_term)

Last Run: 2019-07-08 15:22:58.801531
2019.Fall


In [5]:
# Distinct stage-history rows for the FALL term of `year`, limited to five
# FIELD_ID codes and ordered by person, date and field.
sql_str = (
    "SELECT DISTINCT "
    "PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, FIELD_ID, FIELD_DATE "
    "FROM [Campus6].[dbo].[STAGEHISTORY] "
    f"WHERE ACADEMIC_YEAR='{year}' and ACADEMIC_TERM='FALL' "
    "and FIELD_ID in (10,49,50,39,13) "
    "ORDER BY PEOPLE_CODE_ID, FIELD_DATE, FIELD_ID "
)

df_stghist = pd.read_sql_query(sql=sql_str, con=connection)

print(df_stghist.shape)


(525, 5)


In [6]:
# Preview the first rows of the stage-history extract.
df_stghist.head()

Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,FIELD_ID,FIELD_DATE
0,P000035178,2019,FALL,10,2019-04-03
1,P000035178,2019,FALL,50,2019-04-03
2,P000043954,2019,FALL,10,2019-05-08
3,P000043954,2019,FALL,50,2019-05-08
4,P000047813,2019,FALL,10,2019-03-20


In [7]:
# Stage-history rows whose FIELD_ID is 10 or 50.
df_stghist.loc[df_stghist["FIELD_ID"].isin([10, 50])]

Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,FIELD_ID,FIELD_DATE
0,P000035178,2019,FALL,10,2019-04-03
1,P000035178,2019,FALL,50,2019-04-03
2,P000043954,2019,FALL,10,2019-05-08
3,P000043954,2019,FALL,50,2019-05-08
4,P000047813,2019,FALL,10,2019-03-20
5,P000047813,2019,FALL,50,2019-03-20
6,P000050419,2019,FALL,10,2019-04-22
7,P000050419,2019,FALL,50,2019-04-22
8,P000052458,2019,FALL,10,2019-04-11
9,P000052458,2019,FALL,50,2019-04-11


### Use the code from `admissions.py`

In [8]:
# Stage-ranking lookup: rows whose status is 'A' only.
sql_str = (
    "SELECT STAGERANKING_ID, field_name, field_value "
    "FROM VWSSTAGERANKING WHERE status = 'A' "
)
stgrnk = pd.read_sql_query(sql=sql_str, con=connection)

# Stage-history rows with HIDDEN = 'N' for the FALL term of `year`;
# FIELD_DATE is exposed downstream as `create_date`.
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, "
    "ACADEMIC_SESSION, FIELD_ID, FIELD_DATE, HIDDEN "
    "FROM STAGEHISTORY WHERE HIDDEN = 'N' "
    f"AND ACADEMIC_YEAR = '{year}' and ACADEMIC_TERM='FALL' "
)
stg_hist = pd.read_sql_query(sql=sql_str, con=connection).rename(
    columns={"FIELD_DATE": "create_date"}
)

# Left-join the lookup so every history row gains field_name / field_value.
stage_data = stg_hist.merge(
    stgrnk, how="left", left_on="FIELD_ID", right_on="STAGERANKING_ID"
)

# Common column layout shared by the frames that are stacked later on.
keep_fields = (
    ["PEOPLE_CODE_ID", "ACADEMIC_YEAR", "ACADEMIC_TERM", "ACADEMIC_SESSION"]
    + ["field_name", "field_value", "create_date"]
)
# Rows without a date are dropped.
stage_data = stage_data.loc[stage_data["create_date"].notnull(), keep_fields]

print(stage_data.shape)

(4907, 7)


In [9]:
# ACADEMIC rows for `year` and later.
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, ACADEMIC_SESSION, "
    "POPULATION, INQUIRY_FLAG, APPLICATION_FLAG, APPLICATION_DATE, "
    "APP_STATUS, APP_STATUS_DATE, APP_DECISION, APP_DECISION_DATE "
    f"FROM ACADEMIC WHERE ACADEMIC_YEAR >= '{year}' "
)
academic = pd.read_sql_query(sql=sql_str, con=connection)

# Exclude the ADVSTU / NOND populations and keep rows flagged as an inquiry
# or an application.
app_data = academic.loc[
    ~academic["POPULATION"].isin(["ADVSTU", "NOND"])
    & (academic["INQUIRY_FLAG"].eq("Y") | academic["APPLICATION_FLAG"].eq("Y"))
]

# Application status, reshaped into the `keep_fields` layout.
applied = (
    app_data.loc[app_data["APP_STATUS"].notnull()]
    .rename(columns={"APP_STATUS": "field_value", "APP_STATUS_DATE": "create_date"})
    .assign(field_name="Application Status")
)
applied = applied.loc[applied["create_date"].notnull(), keep_fields]

# Application decision, reshaped the same way.
accepted = (
    app_data.loc[app_data["APP_DECISION"].notnull()]
    .rename(
        columns={"APP_DECISION": "field_value", "APP_DECISION_DATE": "create_date"}
    )
    .assign(field_name="Application Decision")
)
accepted = accepted.loc[accepted["create_date"].notnull(), keep_fields]


In [10]:
# stack Stage History, Academic Applied and Academic Accepted
# pd.concat is used instead of chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in 2.0, while concat stacks the same
# rows in the same order on both old and new pandas versions.
adm_df = pd.concat([stage_data, applied, accepted])

# Keep FALL/SPRING rows of the MAIN session for the year under review.
# ACADEMIC_YEAR is compared as text, hence str(year).
adm_df = adm_df.loc[
    adm_df["ACADEMIC_TERM"].isin(["FALL", "SPRING"])
    & (adm_df["ACADEMIC_SESSION"] == "MAIN")
    & (adm_df["ACADEMIC_YEAR"] == str(year))
]


In [11]:
# Row/column count of the stacked frame after the term, session and year filter.
print(adm_df.shape)

(5526, 7)


In [12]:
# Confirm the column layout of the stacked frame.
print(adm_df.columns)

Index(['PEOPLE_CODE_ID', 'ACADEMIC_YEAR', 'ACADEMIC_TERM', 'ACADEMIC_SESSION',
       'field_name', 'field_value', 'create_date'],
      dtype='object')


In [13]:
# Add a "<year>.<Term>" label, with the term in title case (e.g. FALL -> Fall).
adm_df = adm_df.assign(
    year_term=adm_df["ACADEMIC_YEAR"] + "." + adm_df["ACADEMIC_TERM"].str.title()
)


In [14]:
# Keep only the rows labelled with the fall term under review.
adm_df = adm_df[adm_df["year_term"].eq(fall_term)]
print(adm_df.shape)

(5466, 8)


In [15]:
# Status / decision codes that are retained; every other field_value is dropped.
adm_keep_values = [
    "300", "ACC", "ACXL", "CANC", "DEF",
    "DEFR", "DENY", "DPAC", "TRDP", "TRPD",
    "TRNS", "WAIT", "500", "PEND", "COMP",
]
# Columns carried forward into the status mapping.
adm_keep_cols = ["PEOPLE_CODE_ID", "year_term", "create_date", "field_value"]
adm_df = adm_df[adm_df["field_value"].isin(adm_keep_values)][adm_keep_cols]


In [16]:
# Shape after restricting to the kept codes and the four carried columns.
print(adm_df.shape)

(4506, 4)


In [17]:
# Preview the filtered admissions rows.
adm_df.head()

Unnamed: 0,PEOPLE_CODE_ID,year_term,create_date,field_value
10,P000055649,2019.Fall,2018-08-31,300
11,P000055649,2019.Fall,2018-10-10,ACC
12,P000055839,2019.Fall,2018-09-11,300
13,P000055839,2019.Fall,2018-10-09,ACC
14,P000055840,2019.Fall,2018-08-21,300


In [18]:
# Lookup from raw status/decision code to a numbered admission stage label.
admission_status = {
    "300": "1_Applied",
    "ACC": "2_Accepted",
    "ACXL": "4_Canceled",
    "CANC": "4_Canceled",
    "DEF": "4_Canceled",
    "DEFR": "4_Canceled",
    "DENY": "4_Canceled",
    "DPAC": "3_Deposited",
    "TRDP": "3_Deposited",
    "TRPD": "3_Deposited",
    "TRNS": "2_Accepted",
    "WAIT": "2_Accepted",
    "500": "3_Deposited",
    "PEND": "1_Applied",
    "COMP": "1_Applied",
}
# Two-column frame (code, stage) in dictionary order, ready to merge on
# `field_value`.
adm_stat = pd.DataFrame(
    {
        "field_value": list(admission_status.keys()),
        "admission_status": list(admission_status.values()),
    },
    columns=["field_value", "admission_status"],
)

adm_stat

Unnamed: 0,field_value,admission_status
0,300,1_Applied
1,ACC,2_Accepted
2,ACXL,4_Canceled
3,CANC,4_Canceled
4,DEF,4_Canceled
5,DEFR,4_Canceled
6,DENY,4_Canceled
7,DPAC,3_Deposited
8,TRDP,3_Deposited
9,TRPD,3_Deposited


In [19]:
# Replace each raw code with its admission stage, then collapse rows that are
# identical on person, term, date and stage.
adm_df1 = (
    adm_df.merge(adm_stat, how="left", on="field_value")
    .drop(columns="field_value")
    .drop_duplicates(
        subset=["PEOPLE_CODE_ID", "year_term", "create_date", "admission_status"]
    )
)

adm_df1.head()

Unnamed: 0,PEOPLE_CODE_ID,year_term,create_date,admission_status
0,P000055649,2019.Fall,2018-08-31,1_Applied
1,P000055649,2019.Fall,2018-10-10,2_Accepted
2,P000055839,2019.Fall,2018-09-11,1_Applied
3,P000055839,2019.Fall,2018-10-09,2_Accepted
4,P000055840,2019.Fall,2018-08-21,1_Applied


In [20]:
# Sort so the earliest create_date comes first within each person and stage,
# then keep only that first row.
adm_df1 = (
    adm_df1.sort_values(
        by=["year_term", "PEOPLE_CODE_ID", "admission_status", "create_date"]
    )
    .drop_duplicates(
        subset=["year_term", "PEOPLE_CODE_ID", "admission_status"], keep="first"
    )
)


In [21]:
# Narrow to the columns needed for the pivot and add a constant flag that the
# pivot aggregates.
adm_df1 = adm_df1.loc[
    :, ["PEOPLE_CODE_ID", "admission_status", "create_date"]
].assign(status=1)

In [22]:
# Shape after reducing to one row per person and admission stage.
print(adm_df1.shape)

(2199, 4)


In [23]:
# Confirm the columns that feed the pivot.
print(adm_df1.columns)

Index(['PEOPLE_CODE_ID', 'admission_status', 'create_date', 'status'], dtype='object')


In [24]:
# Preview the per-person, per-stage rows.
adm_df1.head()

Unnamed: 0,PEOPLE_CODE_ID,admission_status,create_date,status
1570,P000035178,1_Applied,2019-02-13,1
1571,P000035178,2_Accepted,2019-03-07,1
1742,P000035178,3_Deposited,2019-04-03,1
1572,P000043954,1_Applied,2019-03-15,1
1573,P000043954,2_Accepted,2019-03-18,1


In [25]:
# One row per person, one column per admission stage. pivot_table aggregates
# with its default (mean) over the constant `status` flag; stages a person
# never reached come back missing and are filled with 0.
e = adm_df1.pivot_table(
    values=["status"],
    index=["PEOPLE_CODE_ID"],
    columns=["admission_status"],
).fillna(0)

# `values` was passed as a list, so the columns carry an outer "status" level;
# drop it to leave just the stage names.
e.columns = e.columns.droplevel(0)


In [26]:
# Display the person-by-stage table built above.
e

admission_status,1_Applied,2_Accepted,3_Deposited,4_Canceled
PEOPLE_CODE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P000035178,1.0,1.0,1.0,0.0
P000043954,1.0,1.0,1.0,0.0
P000047813,1.0,1.0,1.0,0.0
P000050419,0.0,0.0,1.0,0.0
P000052458,0.0,0.0,1.0,0.0
P000053525,0.0,0.0,1.0,0.0
P000053594,0.0,0.0,1.0,0.0
P000053667,0.0,0.0,1.0,0.0
P000053682,0.0,0.0,1.0,0.0
P000054046,0.0,0.0,1.0,0.0


In [27]:
# Everyone with a deposit on record.
dep = e[e["3_Deposited"].eq(1)]

In [28]:
# Deposited and not flagged as canceled.
active_dep = e[e["3_Deposited"].eq(1) & e["4_Canceled"].ne(1)]
print(active_dep.shape)
active_dep.head()

(245, 4)


admission_status,1_Applied,2_Accepted,3_Deposited,4_Canceled
PEOPLE_CODE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P000035178,1.0,1.0,1.0,0.0
P000043954,1.0,1.0,1.0,0.0
P000047813,1.0,1.0,1.0,0.0
P000050419,0.0,0.0,1.0,0.0
P000052458,0.0,0.0,1.0,0.0


In [29]:
# Deposit list straight from ACADEMIC joined to PEOPLE and the HOME ADDRESS:
# fall admits/term for `year` with APP_STATUS '500' and an APP_DECISION of
# DPAC or TRDP, ordered by last name.
# ADMIT_YEAR now follows `year` (it was hard-coded to 2019 while ACADEMIC_YEAR
# already used `year`), so changing `year` keeps both filters in step.
# NOTE(review): the result includes BIRTH_DATE and GOVERNMENT_ID and head()
# renders them -- clear or redact this cell's output before sharing.
sql_str = (
    f"""
SELECT [PEOPLE].[PEOPLE_CODE_ID],
	[PEOPLE].[LAST_NAME],
	[PEOPLE].[FIRST_NAME],
	[PEOPLE].[MIDDLE_NAME],
	[PEOPLE].[BIRTH_DATE],
	[PEOPLE].[GOVERNMENT_ID],
	[ADDRESS].[EMAIL_ADDRESS],
	[ADDRESS].[ADDRESS_TYPE],
	[ADDRESS].[ADDRESS_LINE_1],
	[ADDRESS].[CITY],
	[ADDRESS].[STATE],
    [ADDRESS].[COUNTRY], 
	[ADDRESS].[ZIP_CODE],
	[ACADEMIC].[CREDITS],
	[ACADEMIC].[DEGREE],
	[ACADEMIC].[CURRICULUM],
	[ACADEMIC].[ACADEMIC_YEAR],
	[ACADEMIC].[ACADEMIC_TERM],
	[ACADEMIC].[ADMIT_YEAR],
	[ACADEMIC].[ADMIT_TERM],
	[ACADEMIC].[APP_STATUS_DATE],
	[ACADEMIC].[APP_STATUS],
	[ACADEMIC].[COLLEGE_ATTEND],
	[ACADEMIC].[REVISION_DATE],
	[ACADEMIC].[REVISION_TIME],
	[ACADEMIC].[REVISION_OPID]
FROM [Campus6].[dbo].[ACADEMIC], [Campus6].[dbo].PEOPLE, [Campus6].[dbo].ADDRESS
WHERE ( 
	[ACADEMIC].[PEOPLE_CODE_ID] = [ADDRESS].[PEOPLE_ORG_CODE_ID] AND
	[ACADEMIC].[PEOPLE_ID] = [PEOPLE].[PEOPLE_ID] 
	  ) AND
	  (  
	 [ACADEMIC].[ADMIT_YEAR] = N'{year}'  and
	 [ACADEMIC].[ADMIT_TERM] = N'fall'  And
	 [ACADEMIC].[ACADEMIC_YEAR] = N'{year}'  And
	 [ACADEMIC].[ACADEMIC_TERM] = N'fall'  And
	 [ADDRESS].[ADDRESS_TYPE] = N'HOME'  and
	 [ACADEMIC].[APP_STATUS] = N'500'  and
	 [ACADEMIC].[ACADEMIC_SESSION] = N''   and
     (ACADEMIC.APP_DECISION = 'DPAC' or
	  ACADEMIC.APP_DECISION = 'TRDP')
      )
--	  and (GOVERNMENT_ID is null or GOVERNMENT_ID='' or GOVERNMENT_ID=' ')
GROUP BY [PEOPLE].[PEOPLE_CODE_ID], 
         [PEOPLE].[LAST_NAME], 
		 [PEOPLE].[FIRST_NAME], 
		 [PEOPLE].[MIDDLE_NAME], 
		 [PEOPLE].[BIRTH_DATE], 
		 [PEOPLE].[GOVERNMENT_ID],
		 [ADDRESS].[EMAIL_ADDRESS], 
		 [ADDRESS].[ADDRESS_TYPE], 
		 [ADDRESS].[ADDRESS_LINE_1], 
		 [ADDRESS].[CITY], 
		 [ADDRESS].[STATE], 
		 [ADDRESS].[COUNTRY], 
		 [ADDRESS].[ZIP_CODE], 
		 [ACADEMIC].[CREDITS], 
		 [ACADEMIC].[DEGREE], 
		 [ACADEMIC].[CURRICULUM], 
		 [ACADEMIC].[ACADEMIC_YEAR], 
		 [ACADEMIC].[ACADEMIC_TERM], 
		 [ACADEMIC].[ADMIT_YEAR],
		 [ACADEMIC].[ADMIT_TERM], 
		 [ACADEMIC].[APP_STATUS_DATE], 
		 [ACADEMIC].[APP_STATUS],
		 [ACADEMIC].[COLLEGE_ATTEND],
		 [ACADEMIC].[REVISION_DATE],
		 [ACADEMIC].[REVISION_TIME],
		 [ACADEMIC].[REVISION_OPID]


ORDER BY 
--		[ACADEMIC].[APP_STATUS_DATE],
		[PEOPLE].[LAST_NAME]
--		[ACADEMIC].[REVISION_DATE],
--	    [ACADEMIC].[REVISION_TIME]
    """
)

df_deposits = pd.read_sql_query(sql_str, connection)

print(df_deposits.shape)
df_deposits.head()

(245, 26)


Unnamed: 0,PEOPLE_CODE_ID,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_DATE,GOVERNMENT_ID,EMAIL_ADDRESS,ADDRESS_TYPE,ADDRESS_LINE_1,CITY,...,ACADEMIC_YEAR,ACADEMIC_TERM,ADMIT_YEAR,ADMIT_TERM,APP_STATUS_DATE,APP_STATUS,COLLEGE_ATTEND,REVISION_DATE,REVISION_TIME,REVISION_OPID
0,P000055913,Abele,Maxon,,2000-07-24,70907364,mabele@paulsmiths.edu,HOME,4472 Lake Shore Drive,Diamond Point,...,2019,FALL,2019,FALL,2018-11-20,500,FTF,2019-06-27,1900-01-01 14:46:39.577,CARLINJ
1,P000055865,Adamiec,Lillian,,2000-12-05,20845664,ladamiec@paulsmiths.edu,HOME,26 Kenney Street,New Bedford,...,2019,FALL,2019,FALL,2019-02-14,500,FTF,2019-06-13,1900-01-01 09:38:40.190,CARLINJ
2,P000056876,Ahl,Maureen,Elliot,1999-01-14,86884068,mahl@paulsmiths.edu,HOME,212 Meadowbrook Road,Saratoga Springs,...,2019,FALL,2019,FALL,2019-06-27,500,TRAN,2019-07-01,1900-01-01 09:21:48.773,CARLINJ
3,P000056203,Aini,Patrick,Joeseph,2000-08-13,70905130,paini@paulsmiths.edu,HOME,1 Bush Street,Arkport,...,2019,FALL,2019,FALL,2019-06-18,500,FTF,2019-06-20,1900-01-01 16:05:11.100,CARLINJ
4,P000056509,Akabati,Azanui,Stanley,1999-03-04,999956509,aakabati@paulsmiths.edu,HOME,Below Ccast Bambili,"Bamenda, Cameroon",...,2019,FALL,2019,FALL,2019-06-07,500,FTF,2019-06-12,1900-01-01 14:25:36.597,CARLINJ


In [30]:
# Outer-join the pivot-derived active deposits with the direct deposit query;
# the `_merge` indicator column records which side(s) each person appears on.
cmp = active_dep.merge(
    df_deposits, how="outer", on="PEOPLE_CODE_ID", indicator=True
)

print(cmp.shape)
cmp

(245, 31)


Unnamed: 0,PEOPLE_CODE_ID,1_Applied,2_Accepted,3_Deposited,4_Canceled,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_DATE,GOVERNMENT_ID,...,ACADEMIC_TERM,ADMIT_YEAR,ADMIT_TERM,APP_STATUS_DATE,APP_STATUS,COLLEGE_ATTEND,REVISION_DATE,REVISION_TIME,REVISION_OPID,_merge
0,P000035178,1.0,1.0,1.0,0.0,Palmer,Jack,,1996-06-29,055866222,...,FALL,2019,FALL,2019-04-03,500,TRAN,2019-06-13,1900-01-01 14:53:22.467,CARLINJ,both
1,P000043954,1.0,1.0,1.0,0.0,Hoy,Savannah,Anne,1996-08-16,633543612,...,FALL,2019,FALL,2019-05-08,500,TRAN,2019-06-21,1900-01-01 15:36:45.530,JWALTON,both
2,P000047813,1.0,1.0,1.0,0.0,Lockwood,Danielle,Kathryn,2001-09-23,135114783,...,FALL,2019,FALL,2019-03-20,500,FTF,2019-04-29,1900-01-01 13:53:14.703,CARLINJ,both
3,P000050419,0.0,0.0,1.0,0.0,Gosselin,Julia,Ann,1998-03-08,008828670,...,FALL,2019,FALL,2019-04-22,500,TRAN,2019-06-20,1900-01-01 10:54:59.623,CARLINJ,both
4,P000052458,0.0,0.0,1.0,0.0,Squillante,Christopher,Martin,1998-05-22,036669894,...,FALL,2019,FALL,2019-04-11,500,TRAN,2019-06-24,1900-01-01 16:08:57.160,JWALTON,both
5,P000053525,0.0,0.0,1.0,0.0,Sanborn,Jaden,Emily,1999-05-10,003924985,...,FALL,2019,FALL,2019-05-16,500,TRAN,2019-06-20,1900-01-01 10:07:06.247,CARLINJ,both
6,P000053594,0.0,0.0,1.0,0.0,Cancilla,Abigail,Elizabeth,1999-04-01,095885102,...,FALL,2019,FALL,2019-05-28,500,TRAN,2019-06-17,1900-01-01 15:36:18.980,CARLINJ,both
7,P000053667,0.0,0.0,1.0,0.0,Gleason,Drew,Samuel,1999-02-10,089885627,...,FALL,2019,FALL,2019-04-23,500,TRAN,2019-06-14,1900-01-01 08:43:36.773,CARLINJ,both
8,P000053682,0.0,0.0,1.0,0.0,Johnson,Jordan,Nesmith,1999-02-24,009826569,...,FALL,2019,FALL,2019-04-11,500,TRAN,2019-05-31,1900-01-01 08:59:52.047,CARLINJ,both
9,P000054046,0.0,0.0,1.0,0.0,Hardeman,Meaghan,E,1999-01-08,086883458,...,FALL,2019,FALL,2019-02-27,500,TRAN,2019-05-06,1900-01-01 11:17:07.957,CARLINJ,both


In [31]:
# People present in active_dep but missing from df_deposits.
cmp[cmp["_merge"] == "left_only"]

Unnamed: 0,PEOPLE_CODE_ID,1_Applied,2_Accepted,3_Deposited,4_Canceled,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_DATE,GOVERNMENT_ID,...,ACADEMIC_TERM,ADMIT_YEAR,ADMIT_TERM,APP_STATUS_DATE,APP_STATUS,COLLEGE_ATTEND,REVISION_DATE,REVISION_TIME,REVISION_OPID,_merge


In [32]:
# People present in df_deposits but missing from active_dep.
cmp[cmp["_merge"] == "right_only"]

Unnamed: 0,PEOPLE_CODE_ID,1_Applied,2_Accepted,3_Deposited,4_Canceled,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_DATE,GOVERNMENT_ID,...,ACADEMIC_TERM,ADMIT_YEAR,ADMIT_TERM,APP_STATUS_DATE,APP_STATUS,COLLEGE_ATTEND,REVISION_DATE,REVISION_TIME,REVISION_OPID,_merge


In [33]:
# Spot-check one person's admission-stage rows.
adm_df1[adm_df1["PEOPLE_CODE_ID"] == "P000057033"]

Unnamed: 0,PEOPLE_CODE_ID,admission_status,create_date,status
2201,P000057033,1_Applied,2019-04-27,1
2202,P000057033,2_Accepted,2019-05-09,1
2410,P000057033,3_Deposited,2019-06-13,1


In [34]:
# Spot-check the same person in the direct deposit query.
df_deposits[df_deposits["PEOPLE_CODE_ID"] == "P000057033"]

Unnamed: 0,PEOPLE_CODE_ID,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_DATE,GOVERNMENT_ID,EMAIL_ADDRESS,ADDRESS_TYPE,ADDRESS_LINE_1,CITY,...,ACADEMIC_YEAR,ACADEMIC_TERM,ADMIT_YEAR,ADMIT_TERM,APP_STATUS_DATE,APP_STATUS,COLLEGE_ATTEND,REVISION_DATE,REVISION_TIME,REVISION_OPID
210,P000057033,Spencer,Evan,Reece,1994-06-15,9786809,espencer1@paulsmiths.edu,HOME,179 Pine Hill Drive,West Berlin,...,2019,FALL,2019,FALL,2019-06-13,500,TRAN,2019-07-03,1900-01-01 16:25:46.117,CARLINJ
