In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import zscore, rankdata, kendalltau
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
import seaborn as sns

import csv 
from collections import Counter
import datetime
import holidays
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from scipy.sparse import csr_matrix

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices

from lineartree import LinearTreeRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


https://ny1.com/nyc/all-boroughs/education/2021/12/16/nyc-doe-school-closures-spike-as-covid-cases-climb-in-nyc

https://ballotpedia.org/School_responses_in_New_York_to_the_coronavirus_(COVID-19)_pandemic#Timeline_by_school_year

https://www.nysed.gov/coronavirus/guidance-p-12-schools

https://www.governor.ny.gov/sites/default/files/atoms/files/Pre-K_to_Grade_12_Schools_MasterGuidance.pdf

https://coronavirus.health.ny.gov/system/files/documents/2022/02/updated_tts-memo_0.pdf

https://coronavirus.health.ny.gov/system/files/documents/2021/09/school-guidance_0.pdf

https://www.schools.nyc.gov/docs/default-source/default-document-library/doe-calendar-sy-21-22

In [2]:
ww_df = pd.read_csv('data/new_SARS-CoV-2_concentrations_measured_in_NYC_Wastewater.csv')

ww_df.head

<bound method NDFrame.head of      Sample Date   Test date       WRRF Name WRRF Abbreviation  \
0     08/31/2020  09/01/2020       26th Ward               26W   
1     08/31/2020  09/01/2020      Bowery Bay                BB   
2     08/31/2020  09/01/2020    Coney Island                CI   
3     08/31/2020  09/01/2020     Hunts Point                HP   
4     08/31/2020  09/01/2020     Jamaica Bay                JA   
...          ...         ...             ...               ...   
3971  08/01/2023  08/02/2023   Port Richmond                PR   
3972  08/01/2023  08/02/2023        Red Hook                RH   
3973  08/01/2023  08/02/2023        Rockaway                RK   
3974  08/01/2023  08/02/2023  Tallman Island                TI   
3975  08/01/2023  08/02/2023    Wards Island                WI   

      Concentration SARS-CoV-2 gene target (N1 Copies/L)   \
0                                                 389.0     
1                                                1204.0

In [3]:
# Well, we have a new column here (Technology). 
ww_df.dtypes

Sample Date                                                       object
Test date                                                         object
WRRF Name                                                         object
WRRF Abbreviation                                                 object
Concentration SARS-CoV-2 gene target (N1 Copies/L)               float64
Per capita SARS-CoV-2 load (N1 copies per day per population)    float64
Annotation                                                        object
Population Served, estimated                                       int64
Technology                                                        object
dtype: object

In [None]:
# At this moment, we just want to see how our model works with updated data. Schools recently started up for the fall 
# semester here and cases are skyrocketing. We will therefore not include the technology data. We will also
# need to make some adjustments due to changes in methodology made in the spring of 2023, as noted on the state dashboard

In [None]:
# First, let's do the same as before, by creating a dataframe of school events. Even though these are much
# more typical, they still need to be accounted for. 

Aug. 27, 2021
Sept. 2, 2021
Sept. 13, 2021
Dec. 24
Jan. 4, 2022
Feb. 21, 2022
March 3, 2022

# Create DataFrame
data = {
    'StartDate': ['2021-08-27', '2021-09-02', '2021-09-13', '2021-12-24', '2022-01-03', '2022-01-04', '2022-01-14', 
                  '2022-02-21', '2022-02-28', '2022-03-3', '2022-06-28'],
    'Event': ['Mask Mandate', 'Vaccine and Testing Mandate, Close Contact Minimized', 
              'Start of Term','Shortened Isolation', 
              'End Contact Tracing', 'End of Mask Mandate', 
              'Shutdown', 'Winter Recess', 'WR School Resumes', 
              'Mid-winter Recess', 'MWR School Resumes', 'Summer Break']
}

school_df = pd.DataFrame(data)

# Convert 'StartDate' to datetime
school_df['StartDate'] = pd.to_datetime(school_df['StartDate'])

# Sort DataFrame by 'StartDate'
school_df = school_df.sort_values(by='StartDate')

# Create 'EndDate' as the day before the next event's 'StartDate'
school_df['EndDate'] = school_df['StartDate'].shift(-1) - pd.Timedelta(days=1)

# End date will be the end of the school year.

# Create a new dataframe with each date and event
date_event_df = pd.DataFrame(columns=['Date', 'Event'])
for i in range(len(school_df)):
    date_range = pd.date_range(start=school_df['StartDate'].iloc[i], end=school_df['EndDate'].iloc[i])
    temp_df = pd.DataFrame({
        'Date': date_range,
        'Event': [school_df['Event'].iloc[i]]*len(date_range)
    })
    date_event_df = pd.concat([date_event_df, temp_df])

# Reset index of the new dataframe
date_event_df.reset_index(drop=True, inplace=True)


# date_event_df.to_csv('school_events.csv', index=False)
