# Hypothesis
### Does the sum of all counties in a state equal the reported stat for said state?

In [34]:
import pandas as pd
import numpy as np

In [35]:
df = pd.read_csv(r"../data/Trips_by_Distance.csv")

In [36]:
df_state = df.groupby(by='Level').get_group('State').reset_index()
df_county = df.groupby(by='Level').get_group('County').reset_index()

In [37]:
target_date = "5/11/2019"# My birthday 

# Select relevant columns
grouping_columns = ["State FIPS", "State Postal Code"]
aggregation_columns = [
    "Population Staying at Home", "Population Not Staying at Home", "Number of Trips",
    "Number of Trips <1", "Number of Trips 1-3", "Number of Trips 3-5",
    "Number of Trips 5-10", "Number of Trips 10-25", "Number of Trips 25-50",
    "Number of Trips 50-100", "Number of Trips 100-250", "Number of Trips 250-500",
    "Number of Trips >=500"
]

# Only the desired dates
df_county_filtered = df_county[df_county["Date"] == target_date]
df_state_filtered = df_state[df_state["Date"] == target_date]

# Aggregate county-level data by state
county_agg = df_county_filtered.groupby(grouping_columns)[aggregation_columns].sum().reset_index()

# Merge with state-level data
merged_df = county_agg.merge(df_state_filtered, on=grouping_columns, suffixes=("_county", "_state"))

# Compute the differences
for col in aggregation_columns:
    merged_df[f"{col}"] = merged_df[f"{col}_county"] - merged_df[f"{col}_state"]

# Select relevant columns for final dataset
comparison_df = merged_df[["State FIPS", "State Postal Code"] + [f"{col}" for col in aggregation_columns]]



In [38]:
comparison_means = comparison_df.mean(numeric_only=True)
state_means = df_state.mean(numeric_only=True)

# Find common columns to ensure correct matching
common_columns = comparison_means.index.intersection(state_means.index).drop('State FIPS')

# Calculate percentage of difference
percentage_differences = [
    (col, abs((comparison_means[col] / state_means[col]) * 100))
    for col in common_columns
]


diff_df = pd.DataFrame(percentage_differences, columns=["Column Name", "% Difference"])
diff_df = diff_df.sort_values(by="% Difference", ascending=False) 
print(diff_df)

                       Column Name  % Difference
1   Population Not Staying at Home      0.809717
12           Number of Trips >=500      0.012080
4              Number of Trips 1-3      0.009350
9           Number of Trips 50-100      0.008927
5              Number of Trips 3-5      0.008641
2                  Number of Trips      0.007963
3               Number of Trips <1      0.007809
6             Number of Trips 5-10      0.007498
10         Number of Trips 100-250      0.006916
8            Number of Trips 25-50      0.006803
7            Number of Trips 10-25      0.006164
0       Population Staying at Home      0.005880
11         Number of Trips 250-500      0.005721


# Conclusion
### Within a reasonable margin, the sum of counties within a state represent the state itself in a statistically significant manner
##### Thus, further analises can be done just at a county-wide level, and no statistical precision will be lost.
It can be stated that for any given day, the sum of all counties make up the corresponding nation and state-wide number of trips