In [None]:
'''
Tukey outlier removal
'''
import pandas as pd
import numpy as np
state = pd.read_csv('23.csv')

VA_confirmed = state[['Date', 'VA confirmed']].set_index('Date').diff()
VA_confirmed.fillna(0, inplace=True)

VA_deaths = state[['Date', 'VA deaths']].set_index('Date').diff()
VA_deaths.fillna(0, inplace=True)


UT_confirmed = VA_confirmed = state[['Date', 'UT confirmed']].set_index('Date').diff()
UT_confirmed.fillna(0, inplace=True)


UT_deaths = state[['Date', 'UT deaths']].set_index('Date').diff()
UT_deaths.fillna(0, inplace=True)


def tukey(dataframe):
  q1 = dataframe.quantile(0.25)
  q3 = dataframe.quantile(0.75)
  iqr = q3 - q1
  lower_threshold = q1 - (1.5 * iqr)
  upper_threshold = q3 + (1.5 * iqr)
  outliers = []
  return dataframe[dataframe.values > upper_threshold | dataframe.values < lower_threshold]
  

In [None]:
|print("Outliers in UT confirmed: ", tukey(UT_confirmed))
print("Outliers in UT deaths: ", tukey(UT_deaths))

print("Outliers in VA confirmed: ", tukey(VA_confirmed))
print("Outliers in VA deaths: ", tukey(VA_deaths))



TypeError: ignored

In [None]:
'''
Wald's Test, Z-test, and t-test
'''
import pandas as pd
import numpy as np
state = pd.read_csv('23.csv')

In [None]:
state['Date'] = pd.to_datetime(state['Date'])  
february = (state['Date'] >= '2021-02-01') & (state['Date'] <= '2021-02-28')
march = (state['Date'] >= '2021-03-01') & (state['Date'] <= '2021-03-31')

In [None]:
state_feb = state[february]
state_march = state[march]

cases_VA_feb = state_feb[['Date', 'VA confirmed']]
cases_count_VA_feb = cases_VA_feb['VA confirmed']

deaths_VA_feb = state_feb[['Date', 'VA deaths']]
deaths_count_VA_feb = deaths_VA_feb['VA deaths']

cases_UT_feb = state_feb[['Date', 'UT confirmed']]
cases_count_UT_feb = cases_UT_feb['UT confirmed']

deaths_UT_feb = state_feb[['Date', 'UT deaths']]
deaths_count_UT_feb = deaths_UT_feb['UT deaths']

cases_VA_mar = state_march[['Date', 'VA confirmed']]
cases_count_VA_mar = cases_VA_mar['VA confirmed']

deaths_VA_mar = state_march[['Date', 'VA deaths']]
deaths_count_VA_mar = deaths_VA_mar['VA deaths']

cases_UT_mar = state_march[['Date', 'UT confirmed']]
cases_count_UT_mar = cases_UT_mar['UT confirmed']

deaths_UT_mar = state_march[['Date', 'UT deaths']]
deaths_count_UT_mar = deaths_UT_mar['UT deaths']

In [None]:
def walds_one(feb, march):
  theta_hat = np.mean(feb)
  theta_0 = np.mean(march)

  se = np.sqrt(theta_hat / len(march))

  walds_statistic = (theta_hat - theta_0) / se

  if np.abs(walds_statistic) > 1.96:
    print("|W| = ", np.abs(walds_statistic), " > 1.96, therefore reject the Null Hypothesis")
  else:
    print("|W| = ", np.abs(walds_statistic), " <= 1.96, therefore accept the Null Hypothesis")


In [None]:
walds_one(deaths_count_UT_feb, deaths_count_UT_mar)

|W| =  31.172891168763257  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_one(cases_count_UT_feb, cases_count_UT_mar)

|W| =  169.5916851488749  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_one(cases_count_VA_feb, cases_count_VA_mar)

|W| =  386.3680317489496  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_one(deaths_count_VA_feb, deaths_count_VA_mar)

|W| =  181.22032132577195  > 1.96, therefore reject the Null Hypothesis


In [None]:
def z_one(original, feb, march):
  #Z test requires true standard deviation, so we need to find that
  sigma = np.std(original)

  x_bar = np.mean(feb)
  mu_0 = np.mean(march)


  sigma_over_sqrtn = sigma / np.sqrt(len(feb))

  z_statistic = (x_bar - mu_0) / sigma_over_sqrtn

  if np.abs(z_statistic) > 1.96:
    print("|Z| = ", np.abs(z_statistic), "> 1.96 therefore reject the null hypothesis")
  else:
    print("|Z| = ", np.abs(z_statistic), "<= 1.96therefore accept the null hypothesis")


In [None]:
z_one(state['VA confirmed'], cases_count_VA_feb, cases_count_VA_mar)

|Z| =  1.371705720445352 therefore accept the null hypothesis


In [None]:
z_one(state['VA deaths'], deaths_count_VA_feb, deaths_count_VA_mar)

|Z| =  5.108579550253789 therefore reject the null hypothesis


In [None]:
z_one(state['UT confirmed'], cases_count_UT_feb, cases_count_UT_mar)

|Z| =  0.7020619767080907 therefore accept the null hypothesis


In [None]:
z_one(state['UT deaths'], deaths_count_UT_feb, deaths_count_UT_mar)

|Z| =  1.8576534274536443 therefore accept the null hypothesis


In [None]:
def walds_two(feb, march):
  x_bar = np.mean(feb)
  y_bar = np.mean(march)
  delta_hat = x_bar - y_bar

  #Since we're using Poisson MLE, variance will be lambda / n
  
  se_hat = np.sqrt((x_bar / len(feb)) + (y_bar / len(march)))

  walds_statistic = delta_hat / se_hat

  if np.abs(walds_statistic) > 1.96:
    print("|W| = ", np.abs(walds_statistic), " > 1.96, therefore reject the Null Hypothesis")
  else:
    print("|W| = ", np.abs(walds_statistic), " <= 1.96, therefore accept the Null Hypothesis")


In [None]:
walds_two(cases_count_UT_feb, cases_count_UT_mar)

|W| =  115.44969347830576  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_two(deaths_count_UT_feb, deaths_count_UT_mar)

|W| =  20.83218579296163  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_two(cases_count_VA_feb, cases_count_VA_mar)

|W| =  260.4310819769535  > 1.96, therefore reject the Null Hypothesis


In [None]:
walds_two(deaths_count_VA_feb, deaths_count_VA_mar)

|W| =  114.81653434010379  > 1.96, therefore reject the Null Hypothesis


In [None]:
def t_one(feb, march):
  x_bar = np.mean(feb)
  mu_0 = np.mean(march)

  ssd = np.std(feb)

  s_over_n = ssd / np.sqrt(len(feb))
  t = (x_bar - mu_0) / s_over_n

  if np.abs(t) > 1.703:
    print("|t| = ", np.abs(t), "> 1.703 therefore reject the null hypothesis")
  else:
    print("|t| = ", np.abs(t), "<= 1.703 therefor accept the null hypothesis")

In [None]:
t_one(cases_count_UT_feb, cases_count_UT_mar)

|t| =  13.711719528357026 > 1.703 therefore reject the null hypothesis


In [None]:
t_one(deaths_count_UT_feb, deaths_count_UT_mar)

|t| =  17.85120765089191 > 1.703 therefore reject the null hypothesis


In [None]:
t_one(cases_count_VA_feb, cases_count_VA_mar)

|t| =  13.113863589190032 > 1.703 therefore reject the null hypothesis


In [None]:
t_one(deaths_count_VA_feb, deaths_count_VA_mar)

|t| =  26.739252053774308 > 1.703 therefore reject the null hypothesis


In [None]:
def t_two_unpaired(feb, march):
  x_bar = np.mean(feb)
  y_bar = np.mean(march)
  d_bar = x_bar - y_bar

  sx = np.var(feb) / len(feb)
  sy = np.var(march) / len(march)
  sx_sy = np.sqrt(sx + sy)

  t = d_bar / sx_sy

  if np.abs(t) > 1.672:
    print("|t| = ", np.abs(t), "> 1.672 therefore reject the null hypothesis")
  else:
    print("|t| = ", np.abs(t), "<= 1.672 therefore reject the null hypothesis")


In [None]:
t_two_unpaired(cases_count_UT_feb, cases_count_UT_mar)

|t| =  11.910784809405355 > 1.672 therefore reject the null hypothesis


In [None]:
t_two_unpaired(deaths_count_UT_feb, deaths_count_UT_mar)

|t| =  14.159815154650781 > 1.672 therefore reject the null hypothesis


In [None]:
t_two_unpaired(cases_count_VA_feb, cases_count_VA_mar)

|t| =  11.357687622323128 > 1.672 therefore reject the null hypothesis


In [None]:
t_two_unpaired(deaths_count_VA_feb, deaths_count_VA_mar)

|t| =  21.770162931599856 > 1.672 therefore reject the null hypothesis
