In [1]:
import pandas as pd
import geopandas as gpd
import requests
from pathlib import Path
import os

# Set up paths
PROJECT_ROOT = Path.cwd()
print(f"Project root directory: {PROJECT_ROOT}")
RAW_DATA = PROJECT_ROOT / 'data' / 'raw'
PROCESSED_DATA = PROJECT_ROOT / 'data' / 'processed'

# Create directories if they don't exist
RAW_DATA.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA.mkdir(parents=True, exist_ok=True)

print(f"Raw data will be saved to: {RAW_DATA}")
print(f"Processed data will be saved to: {PROCESSED_DATA}")

Project root directory: /Users/lukehatchl/Development/Personal/covid-spatial-analysis
Raw data will be saved to: /Users/lukehatchl/Development/Personal/covid-spatial-analysis/data/raw
Processed data will be saved to: /Users/lukehatchl/Development/Personal/covid-spatial-analysis/data/processed


In [None]:
# NYT download function
def download_nyt_covid_data(save_path, force_redownload=False):
    """
    Download NYT COVID-19 county-level data.
    
    Parameters:
    -----------
    save_path : Path or str
        Where to save the downloaded CSV
    force_redownload : bool
        If True, download even if file exists. Default False.
    
    Returns:
    --------
    pd.DataFrame : The downloaded data
    """
    save_path = Path(save_path)
    
    # Check if file already exists
    if save_path.exists() and not force_redownload:
        print(f"✓ File already exists at {save_path}")
        print(f"  Loading existing file... (use force_redownload=True to re-download)")
        df = pd.read_csv(save_path)
        print(f"  Loaded {len(df):,} rows")
        print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
        return df
    
    url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
    
    print(f"Downloading NYT COVID data from {url}...")
    
    try:
        df = pd.read_csv(url)
        df.to_csv(save_path, index=False)
        print(f"✓ Downloaded {len(df):,} rows")
        print(f"✓ Saved to {save_path}")
        print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
        return df
    except Exception as e:
        print(f"✗ Error downloading data: {e}")
        return None

covid_data = download_nyt_covid_data(RAW_DATA / 'nyt_covid_counties.csv')

✓ File already exists at /Users/lukehatchl/Development/Personal/covid-spatial-analysis/data/raw/nyt_covid_counties.csv
  Loading existing file... (use force_redownload=True to re-download)
  Loaded 2,502,832 rows
  Date range: 2020-01-21 to 2022-05-13


In [8]:
# Quick exploration of COVID data
if covid_data is not None:
    print("\n--- Data Preview ---")
    print(covid_data.head())
    print("\n--- Data Info ---")
    print(covid_data.info())
    print("\n--- Sample Statistics ---")
    print(f"Unique counties: {covid_data['county'].nunique():,}")
    print(f"Unique states: {covid_data['state'].nunique()}")
    print(f"Total cases (latest): {covid_data.groupby('fips')['cases'].last().sum():,.0f}")


--- Data Preview ---
         date     county       state     fips  cases  deaths
0  2020-01-21  Snohomish  Washington  53061.0      1     0.0
1  2020-01-22  Snohomish  Washington  53061.0      1     0.0
2  2020-01-23  Snohomish  Washington  53061.0      1     0.0
3  2020-01-24       Cook    Illinois  17031.0      1     0.0
4  2020-01-24  Snohomish  Washington  53061.0      1     0.0

--- Data Info ---
<class 'pandas.DataFrame'>
RangeIndex: 2502832 entries, 0 to 2502831
Data columns (total 6 columns):
 #   Column  Dtype  
---  ------  -----  
 0   date    str    
 1   county  str    
 2   state   str    
 3   fips    float64
 4   cases   int64  
 5   deaths  float64
dtypes: float64(2), int64(1), str(3)
memory usage: 114.6 MB
None

--- Sample Statistics ---
Unique counties: 1,932
Unique states: 56
Total cases (latest): 79,406,747


In [None]:
# Shapefile download function
def download_county_shapefile(save_dir, year=2020, force_redownload=False):
    """
    Download US county boundaries from Census Bureau.
    
    Parameters:
    -----------
    save_dir : Path or str
        Directory to save shapefile
    year : int
        Census year (default 2020)
    force_redownload : bool
        If True, download even if file exists. Default False.
    
    Returns:
    --------
    str : Path to downloaded zip file
    """
    save_dir = Path(save_dir)
    shp_file = save_dir / f'tl_{year}_us_county.shp'
    zip_path = save_dir / f'tl_{year}_us_county.zip'
    
    # Check if shapefile already exists
    if shp_file.exists() and not force_redownload:
        print(f"✓ Shapefile already exists at {shp_file}")
        print(f"  Skipping download (use force_redownload=True to re-download)")
        return str(zip_path)
    
    url = f"https://www2.census.gov/geo/tiger/TIGER{year}/COUNTY/tl_{year}_us_county.zip"
    
    print(f"Downloading county shapefile for {year}...")
    
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        print(f"✓ Downloaded to {zip_path}")
        print(f"  File size: {zip_path.stat().st_size / 1024 / 1024:.1f} MB")
        
        # Extract the zip
        import zipfile
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(save_dir)
        print(f"✓ Extracted shapefile components")
        
        return str(zip_path)
    except Exception as e:
        print(f"✗ Error downloading shapefile: {e}")
        return None

# Run it
shapefile_path = download_county_shapefile(RAW_DATA)

✓ Shapefile already exists at /Users/lukehatchl/Development/Personal/covid-spatial-analysis/data/raw/tl_2020_us_county.shp
  Skipping download (use force_redownload=True to re-download)


In [10]:
# Load and explore shapefile
def load_county_shapefile(data_dir, year=2020):
    """
    Load county shapefile into GeoDataFrame.
    
    Parameters:
    -----------
    data_dir : Path or str
        Directory containing the shapefile
    year : int
        Year of shapefile
    
    Returns:
    --------
    gpd.GeoDataFrame : County boundaries
    """
    shp_file = Path(data_dir) / f'tl_{year}_us_county.shp'
    
    print(f"Loading shapefile from {shp_file}...")
    
    try:
        counties = gpd.read_file(shp_file)
        print(f"✓ Loaded {len(counties):,} counties")
        print(f"  CRS: {counties.crs}")
        print(f"  Columns: {list(counties.columns)}")
        return counties
    except Exception as e:
        print(f"✗ Error loading shapefile: {e}")
        return None

counties_gdf = load_county_shapefile(RAW_DATA)

Loading shapefile from /Users/lukehatchl/Development/Personal/covid-spatial-analysis/data/raw/tl_2020_us_county.shp...
✓ Loaded 3,234 counties
  CRS: EPSG:4269
  Columns: ['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD', 'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry']


In [11]:
# Explore shapefile
if counties_gdf is not None:
    print("\n--- Shapefile Preview ---")
    print(counties_gdf.head())
    print("\n--- Key Columns ---")
    print(f"GEOID (FIPS): {counties_gdf['GEOID'].iloc[0]}")
    print(f"NAME: {counties_gdf['NAME'].iloc[0]}")
    print(f"State FIPS: {counties_gdf['STATEFP'].iloc[0]}")


--- Shapefile Preview ---
  STATEFP COUNTYFP  COUNTYNS  GEOID       NAME          NAMELSAD LSAD CLASSFP  \
0      31      039  00835841  31039     Cuming     Cuming County   06      H1   
1      53      069  01513275  53069  Wahkiakum  Wahkiakum County   06      H1   
2      35      011  00933054  35011    De Baca    De Baca County   06      H1   
3      31      109  00835876  31109  Lancaster  Lancaster County   06      H1   
4      31      129  00835886  31129   Nuckolls   Nuckolls County   06      H1   

   MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT       ALAND    AWATER     INTPTLAT  \
0  G4020   NaN    NaN      NaN        A  1477645345  10690204  +41.9158651   
1  G4020   NaN    NaN      NaN        A   680976231  61568965  +46.2946377   
2  G4020   NaN    NaN      NaN        A  6016818946  29090018  +34.3592729   
3  G4020   339  30700      NaN        A  2169272970  22847034  +40.7835474   
4  G4020   NaN    NaN      NaN        A  1489645188   1718484  +40.1764918   

       INTPTLON  

In [12]:
# Summary of collected data
print("="*60)
print("DATA COLLECTION SUMMARY")
print("="*60)
print(f"\n✓ NYT COVID data: {'Available' if covid_data is not None else 'Missing'}")
if covid_data is not None:
    print(f"  - {len(covid_data):,} rows, {covid_data['date'].nunique()} unique dates")

print(f"\n✓ County shapefiles: {'Available' if counties_gdf is not None else 'Missing'}")
if counties_gdf is not None:
    print(f"  - {len(counties_gdf):,} counties")

DATA COLLECTION SUMMARY

✓ NYT COVID data: Available
  - 2,502,832 rows, 844 unique dates

✓ County shapefiles: Available
  - 3,234 counties
