In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from pathlib import Path


In [2]:
#data_path = Path(r"\\psc-data\E\Applications\Admissions\funnel")
data_path = Path(r"E:\Applications\Admissions\funnel")
##data_path = Path(r"C:\JW\Python\Admissions\funnel")
data_store = data_path / "data\stage_data_wn"

import local_db
connection = local_db.connection()


In [3]:
today = datetime.now().strftime("%Y%m%d")

begin_year = "2015"


In [4]:
# read VWSSTAGERANKING data
sql_str = (
    "SELECT STAGERANKING_ID, field_name, field_value "
    + "FROM VWSSTAGERANKING WHERE "
    + "status = 'A' "
)
stgrnk = pd.read_sql_query(sql_str, connection)


In [5]:
# read STAGEHISTORY data
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, "
    + "ACADEMIC_SESSION, FIELD_ID, FIELD_DATE, HIDDEN "
    + "FROM STAGEHISTORY WHERE "
    + "HIDDEN = 'N' "
    + f"AND ACADEMIC_YEAR >= '{begin_year}' "
)
stg_hist = pd.read_sql_query(sql_str, connection)

stg_hist = stg_hist.rename(columns={"FIELD_DATE": "create_date"})
stage_data = pd.merge(
    stg_hist, stgrnk, left_on=["FIELD_ID"], right_on=["STAGERANKING_ID"], how="left"
)
keep_fields = [
    "PEOPLE_CODE_ID",
    "ACADEMIC_YEAR",
    "ACADEMIC_TERM",
    "ACADEMIC_SESSION",
    "field_name",
    "field_value",
    "create_date",
]
stage_data = stage_data.loc[~stage_data["create_date"].isnull(), keep_fields]
print(stage_data.shape)

(87137, 7)


In [6]:
# read ACADEMIC data
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, "
    + "ACADEMIC_SESSION, POPULATION, INQUIRY_FLAG, "
    + "APPLICATION_FLAG, APPLICATION_DATE, "
    + "APP_STATUS, APP_STATUS_DATE, "
    + "APP_DECISION, APP_DECISION_DATE "
    + "FROM ACADEMIC WHERE "
    + f"ACADEMIC_YEAR >= '{begin_year}' "
)
academic = pd.read_sql_query(sql_str, connection)
print(academic.shape)

(49210, 12)


In [7]:
app_data = academic.loc[
    ~(academic["POPULATION"].isin(["ADVSTU", "NOND"]))
    & ((academic["INQUIRY_FLAG"] == "Y") | (academic["APPLICATION_FLAG"] == "Y"))
]

applied = app_data[app_data["APP_STATUS"].notnull()].rename(
    columns={"APP_STATUS": "field_value", "APP_STATUS_DATE": "create_date"}
)
applied.loc[:, "field_name"] = "Application Status"
applied = applied.loc[~applied["create_date"].isnull(), keep_fields]

accepted = app_data[app_data["APP_DECISION"].notnull()].rename(
    columns={"APP_DECISION": "field_value", "APP_DECISION_DATE": "create_date"}
)
accepted.loc[:, "field_name"] = "Application Decision"
accepted = accepted.loc[~accepted["create_date"].isnull(), keep_fields]

print(applied.shape)
print(accepted.shape)

(10116, 7)
(9311, 7)


In [8]:
# stack Stage History, Academic Applied and Academic Accepted
adm_df = stage_data.append(applied).append(accepted)

adm_df = adm_df.loc[
    (
        (adm_df["ACADEMIC_TERM"].isin(["FALL", "SPRING"]))
        & (adm_df["ACADEMIC_SESSION"] == "MAIN")
        & (adm_df["ACADEMIC_YEAR"] >= "2009")
    )
]

print(adm_df.shape)
adm_df.head(3)

(75279, 7)


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date
6,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000
7,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000
8,P000016934,2018,FALL,MAIN,Enrolled/Separated,ENRL,2018-12-21 10:46:32.793


In [9]:
# create new fields
adm_df["year_term"] = (
    adm_df["ACADEMIC_YEAR"] + "." + adm_df["ACADEMIC_TERM"].str.title()
)
# week_number = (
#     lambda r: (r["create_date"].date().isocalendar()[1])
#     if (r["create_date"].date() >= date((int(r["ACADEMIC_YEAR"]) - 1), 9, 1))
#     else (date((int(r["ACADEMIC_YEAR"]) - 1), 9, 1).isocalendar()[1])
# )
# adm_df["Week_Number"] = adm_df.apply(week_number, axis=1)

print(adm_df.shape)
adm_df.head(3)

(75279, 8)


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date,year_term
6,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall
7,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall
8,P000016934,2018,FALL,MAIN,Enrolled/Separated,ENRL,2018-12-21 10:46:32.793,2018.Fall


In [10]:
# convert ACADEMIC_YEAR to numeric keep numeric-valued records
adm_df["ACADEMIC_YEAR"] = pd.to_numeric(
    adm_df["ACADEMIC_YEAR"], errors="coerce", downcast="integer"
)
adm_df = adm_df.loc[adm_df["ACADEMIC_YEAR"].notnull()]

print(adm_df.shape)
adm_df.head(3)

(75279, 8)


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date,year_term
6,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall
7,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall
8,P000016934,2018,FALL,MAIN,Enrolled/Separated,ENRL,2018-12-21 10:46:32.793,2018.Fall


In [11]:
adm_df.loc[(adm_df['PEOPLE_CODE_ID']=='P000024505') & (adm_df['year_term']=='2012.Fall')]

Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date,year_term


In [12]:
adm_week_number = (
    lambda r: (
        int(
            (pd.to_datetime(r["create_date"])
            - pd.to_datetime(date((int(r["ACADEMIC_YEAR"]) - 1), 9, 1)))
            / np.timedelta64(1,'W')
        )
    )
    if (
        pd.to_datetime(r["create_date"]) >= pd.to_datetime(date((int(r["ACADEMIC_YEAR"]) - 1), 9, 1))
    )
    else 0
)
adm_df["Admissions_Week"] = adm_df.apply(adm_week_number, axis=1)

print(adm_df.shape)
adm_df.head(3)

(75279, 9)


Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date,year_term,Admissions_Week
6,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall,64
7,P000016934,2018,FALL,MAIN,Enrolled/Separated,TBC,2018-11-29 00:00:00.000,2018.Fall,64
8,P000016934,2018,FALL,MAIN,Enrolled/Separated,ENRL,2018-12-21 10:46:32.793,2018.Fall,68


In [13]:
adm_df.loc[(adm_df['PEOPLE_CODE_ID']=='P000002941') & (adm_df['year_term']=='2012.Fall')]

Unnamed: 0,PEOPLE_CODE_ID,ACADEMIC_YEAR,ACADEMIC_TERM,ACADEMIC_SESSION,field_name,field_value,create_date,year_term,Admissions_Week


In [14]:
adm_keep_values = [
    "300",
    "ACC",
    "ACXL",
    "CANC",
    "DEF",
    "DEFR",
    "DENY",
    "DPAC",
    "TRDP",
    "TRPD",
    "TRNS",
    "WAIT",
    "500",
    "PEND",
    "COMP",
]
adm_keep_cols = ["PEOPLE_CODE_ID", "year_term", "Admissions_Week", "field_value"]
adm_df0 = adm_df.loc[(adm_df["field_value"].isin(adm_keep_values)), adm_keep_cols]


In [15]:
adm_keep_cols_2 = ["PEOPLE_CODE_ID", "year_term", "create_date", "field_value"]
adm_df_2 = adm_df.loc[(adm_df["field_value"].isin(adm_keep_values)), adm_keep_cols_2]


In [16]:
# admissions status table
admission_status = {
    "300": "Applied",
    "ACC": "Accepted",
    "ACXL": "Canceled",
    "CANC": "Canceled",
    "DEF": "Canceled",
    "DEFR": "Canceled",
    "DENY": "Canceled",
    "DPAC": "Deposited",
    "TRDP": "Deposited",
    "TRPD": "Deposited",
    "TRNS": "Accepted",
    "WAIT": "Accepted",
    "500": "Deposited",
    "PEND": "Applied",
    "COMP": "Applied",
}
adm_stat = pd.DataFrame(
    list(admission_status.items()), columns=["field_value", "admission_status"]
)


In [17]:
adm_df1 = (
    pd.merge(adm_df0, adm_stat, on=["field_value"], how="left")
    .drop(["field_value"], axis=1)
    .drop_duplicates(
        ["PEOPLE_CODE_ID", "year_term", "Admissions_Week", "admission_status"]
    )
)
print(adm_df1.shape)

adm_df1 = adm_df1.sort_values(
    ["year_term", "PEOPLE_CODE_ID", "admission_status", "Admissions_Week"]
).drop_duplicates(["year_term", "PEOPLE_CODE_ID", "admission_status"], keep="first")
print(adm_df1.shape)


(21696, 4)
(20247, 4)


In [18]:
adm_df1_2 = (
    pd.merge(adm_df_2, adm_stat, on=["field_value"], how="left")
    .drop(["field_value"], axis=1)
    .drop_duplicates(
        ["PEOPLE_CODE_ID", "year_term", "create_date", "admission_status"]
    )
)
print(adm_df1_2.shape)

adm_df1_2 = adm_df1_2.sort_values(
    ["year_term", "PEOPLE_CODE_ID", "admission_status", "create_date"]
).drop_duplicates(["year_term", "PEOPLE_CODE_ID", "admission_status"], keep="first")
print(adm_df1_2.shape)


(21983, 4)
(20247, 4)


In [19]:
print(today)
writer = pd.ExcelWriter(
    f"PowerCampus_AdmissionsData_{today}.xlsx"
)

adm_df1_2.to_excel(writer, "data")

writer.save()


20210107


In [20]:
adm_df1.loc[(adm_df1['PEOPLE_CODE_ID']=='P000024505')]

Unnamed: 0,PEOPLE_CODE_ID,year_term,Admissions_Week,admission_status


In [21]:
e = adm_df1.pivot_table(
    index=["year_term", "PEOPLE_CODE_ID"],
    columns=["admission_status"],
    values=["Admissions_Week"],
)
e = e.fillna(np.int(54))
print(e.shape)


(8814, 4)


In [22]:
e.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Admissions_Week,Admissions_Week,Admissions_Week,Admissions_Week
Unnamed: 0_level_1,admission_status,Accepted,Applied,Canceled,Deposited
year_term,PEOPLE_CODE_ID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015.Fall,P000025856,0.0,0.0,38.0,54.0
2015.Fall,P000026328,28.0,28.0,40.0,54.0
2015.Fall,P000026959,54.0,54.0,44.0,54.0
2015.Fall,P000027130,14.0,14.0,54.0,34.0
2015.Fall,P000027236,54.0,54.0,0.0,54.0


In [23]:
e.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8814 entries, ('2015.Fall', 'P000025856') to ('2021.Spring', 'P000063793')
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   (Admissions_Week, Accepted)   8814 non-null   float64
 1   (Admissions_Week, Applied)    8814 non-null   float64
 2   (Admissions_Week, Canceled)   8814 non-null   float64
 3   (Admissions_Week, Deposited)  8814 non-null   float64
dtypes: float64(4)
memory usage: 367.4+ KB


In [24]:
# e.loc[('2012.Fall', 'P000024505')]

In [25]:
# function returns status for week
def f_status(field, data_frame, n):
    f_week = (
        lambda df: 1
        if (
            (df[("Admissions_Week", field)] <= n)
            & (df[("Admissions_Week", "Canceled")] > n)
        )
        else 0
    )
    return data_frame.apply(f_week, axis=1)


# function returns DataFrame of 53 week status values
def fill_weeks(field, data_frame):
    weeks = range(0, 54)
    r = pd.DataFrame(
        np.zeros((data_frame.shape[0], 54)),
        index=data_frame.index,
        columns=[f"{w:02d}" for w in weeks],
    )
    for w in weeks:
        f = f"{w:02d}"
        r.loc[:, f] = f_status(field, data_frame, w)
        r.loc[:, "stage"] = field

    r = r.reset_index().set_index(["year_term", "stage", "PEOPLE_CODE_ID"])

    return r


In [26]:
stage_list = ["Applied", "Accepted", "Deposited"]
w = pd.DataFrame()
for stg in stage_list:
    w = pd.concat([w, fill_weeks(stg, e)])

print(w.shape)

(26442, 54)


In [27]:
w.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,00,01,02,03,04,05,06,07,08,09,...,44,45,46,47,48,49,50,51,52,53
year_term,stage,PEOPLE_CODE_ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2015.Fall,Applied,P000025856,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2015.Fall,Applied,P000026328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015.Fall,Applied,P000026959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015.Fall,Applied,P000027130,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2015.Fall,Applied,P000027236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
w.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 26442 entries, ('2015.Fall', 'Applied', 'P000025856') to ('2021.Spring', 'Deposited', 'P000063793')
Data columns (total 54 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   00      26442 non-null  int64
 1   01      26442 non-null  int64
 2   02      26442 non-null  int64
 3   03      26442 non-null  int64
 4   04      26442 non-null  int64
 5   05      26442 non-null  int64
 6   06      26442 non-null  int64
 7   07      26442 non-null  int64
 8   08      26442 non-null  int64
 9   09      26442 non-null  int64
 10  10      26442 non-null  int64
 11  11      26442 non-null  int64
 12  12      26442 non-null  int64
 13  13      26442 non-null  int64
 14  14      26442 non-null  int64
 15  15      26442 non-null  int64
 16  16      26442 non-null  int64
 17  17      26442 non-null  int64
 18  18      26442 non-null  int64
 19  19      26442 non-null  int64
 20  20      26442 non-null  int64
 21  21     

In [30]:
# add CURRICULUM field
sql_str = (
    "SELECT PEOPLE_CODE_ID, ACADEMIC_YEAR, ACADEMIC_TERM, "
    + "ACADEMIC_SESSION, CURRICULUM, PRIMARY_FLAG "
    + "FROM ACADEMIC WHERE "
    + "PRIMARY_FLAG = 'Y' AND "
    + f"ACADEMIC_YEAR >= '{begin_year}' "
)
curriculum_df = pd.read_sql_query(sql_str, connection)
curriculum_df["year_term"] = (
    curriculum_df["ACADEMIC_YEAR"] + "." + curriculum_df["ACADEMIC_TERM"].str.title()
)
curriculum_df = curriculum_df.rename(columns={"CURRICULUM": "curriculum"})
curr_flds = ["PEOPLE_CODE_ID", "year_term", "curriculum"]
curriculum_df = curriculum_df[curr_flds]
curriculum_df = curriculum_df.drop_duplicates(curr_flds)


In [31]:
y = pd.merge(
    w.reset_index(), curriculum_df, on=["year_term", "PEOPLE_CODE_ID"], how="left"
)
print(y.shape)
y.head()

#y.to_hdf(data_store, key="weekly", mode="w", data_columns=True, complevel=0)


(26442, 58)


Unnamed: 0,year_term,stage,PEOPLE_CODE_ID,00,01,02,03,04,05,06,...,45,46,47,48,49,50,51,52,53,curriculum
0,2015.Fall,Applied,P000025856,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,
1,2015.Fall,Applied,P000026328,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,2015.Fall,Applied,P000026959,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,2015.Fall,Applied,P000027130,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,FWSW
4,2015.Fall,Applied,P000027236,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [55]:
test_yearterm = '2021.Fall'
acc20XX = y.loc[((y['year_term']==test_yearterm) &  (y['stage']=='Applied'))]


In [56]:
print(acc20XX.shape)

(655, 58)


In [57]:
acc20XX.head()

Unnamed: 0,year_term,stage,PEOPLE_CODE_ID,00,01,02,03,04,05,06,...,45,46,47,48,49,50,51,52,53,curriculum
8011,2021.Fall,Applied,P000055672,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,
8012,2021.Fall,Applied,P000056469,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,
8013,2021.Fall,Applied,P000056509,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,NRCM
8014,2021.Fall,Applied,P000056596,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,COMM
8015,2021.Fall,Applied,P000056752,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,


In [58]:
acc20XX.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 655 entries, 8011 to 8665
Data columns (total 58 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_term       655 non-null    object
 1   stage           655 non-null    object
 2   PEOPLE_CODE_ID  655 non-null    object
 3   00              655 non-null    int64 
 4   01              655 non-null    int64 
 5   02              655 non-null    int64 
 6   03              655 non-null    int64 
 7   04              655 non-null    int64 
 8   05              655 non-null    int64 
 9   06              655 non-null    int64 
 10  07              655 non-null    int64 
 11  08              655 non-null    int64 
 12  09              655 non-null    int64 
 13  10              655 non-null    int64 
 14  11              655 non-null    int64 
 15  12              655 non-null    int64 
 16  13              655 non-null    int64 
 17  14              655 non-null    int64 
 18  15    

In [59]:
acc20XX['sum'] = acc20XX.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc20XX['sum'] = acc20XX.sum(axis=1)


In [60]:
col_list = [f"{w:02d}" for w in range(0, 54)]
#col_list

acc20XX = acc20XX.drop(columns=col_list)


In [61]:
acc20XX.head()

Unnamed: 0,year_term,stage,PEOPLE_CODE_ID,curriculum,sum
8011,2021.Fall,Applied,P000055672,,54
8012,2021.Fall,Applied,P000056469,,45
8013,2021.Fall,Applied,P000056509,NRCM,0
8014,2021.Fall,Applied,P000056596,COMM,0
8015,2021.Fall,Applied,P000056752,,40


In [62]:
print(today)
writer = pd.ExcelWriter(
    f"{test_yearterm}_Weekly_{today}.xlsx"
)


20210107


In [54]:
acc20XX.to_excel(writer, "test_year")

writer.save()

In [None]:
adm_df1_2.

In [None]:
acc20XX = y.loc[((y['year_term']==test_yearterm) &  (y['stage']=='Accepted'))]