# Hypothesis
##### On any given date, does the sum of the county-level statistics equal the statistic reported at the nation-wide level?

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw CSV into a DataFrame; the relative path is resolved against
# the current working directory.
df = pd.read_csv(filepath_or_buffer="../data/Trips_by_Distance.csv")

In [3]:
# Split the table by the value of its "Level" column. The grouping is built
# once and reused; reset_index() keeps the original row labels in an "index"
# column and renumbers each subset from 0.
_by_level = df.groupby(by='Level')
df_national = _by_level.get_group('National').reset_index()
df_state = _by_level.get_group('State').reset_index()
df_county = _by_level.get_group('County').reset_index()

In [None]:
# Compare, for one date, the column-wise sum of all county rows against the
# national row(s) and report the absolute percentage difference per column.
target_date = "5/11/2019"  # My birthday

# Columns summed over the county rows and compared with the national figure.
aggregation_columns = [
    "Population Staying at Home", "Population Not Staying at Home", "Number of Trips",
    "Number of Trips <1", "Number of Trips 1-3", "Number of Trips 3-5",
    "Number of Trips 5-10", "Number of Trips 10-25", "Number of Trips 25-50",
    "Number of Trips 50-100", "Number of Trips 100-250", "Number of Trips 250-500",
    "Number of Trips >=500"
]

# Exact match on the "Date" column: target_date must be written exactly the
# way dates appear in the loaded data.
df_county_filtered = df_county[df_county["Date"] == target_date]
df_national_filtered = df_national[df_national["Date"] == target_date]

# Fail loudly when the date matches nothing. Otherwise the sums below are
# taken over zero rows and the resulting table is all NaN, which says nothing
# about the hypothesis.
if df_county_filtered.empty or df_national_filtered.empty:
    raise ValueError(
        f"No county and/or national rows found for Date == {target_date!r}; "
        "check that target_date matches the date format used in the data."
    )

county_agg = df_county_filtered[aggregation_columns].sum()
national_values = df_national_filtered[aggregation_columns].sum()

common_columns = county_agg.index.intersection(national_values.index)

# Absolute percentage difference relative to the national figure, computed
# for all columns at once instead of looping over them.
reported = national_values[common_columns]
pct_diff = ((county_agg[common_columns] - reported) / reported).abs() * 100
# A (near-)zero national value makes the ratio meaningless: report NaN there.
pct_diff = pct_diff.where(reported.abs() >= 1e-9)

# Kept as a list of (column name, % difference) pairs, in column order.
percentage_differences = list(zip(common_columns, pct_diff))

diff_df = pd.DataFrame(percentage_differences, columns=["Column Name", "% Difference (Aggregated vs National)"])
# Largest discrepancy first; NaN entries (if any) are placed last by sort_values.
diff_df = diff_df.sort_values(by="% Difference (Aggregated vs National)", ascending=False)
print(diff_df)


                       Column Name  % Difference (Aggregated vs National)
1   Population Not Staying at Home                               0.780502
9           Number of Trips 50-100                               0.008713
10         Number of Trips 100-250                               0.007965
4              Number of Trips 1-3                               0.007384
3               Number of Trips <1                               0.007094
5              Number of Trips 3-5                               0.006951
0       Population Staying at Home                               0.006839
2                  Number of Trips                               0.006725
8            Number of Trips 25-50                               0.006243
12           Number of Trips >=500                               0.006214
6             Number of Trips 5-10                               0.006192
11         Number of Trips 250-500                               0.006066
7            Number of Trips 10-25    

# Conclusion
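#### Within a reasonable margin (the largest difference shown above is about 0.78%), the sum of the county-level statistics closely matches the corresponding nation-wide reported statistic. Only a single date (5/11/2019) was checked, so this supports the hypothesis but does not establish it for every date.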