# Loading NOAA Locations Data into DataFrames

This notebook demonstrates how to load and work with NOAA location data from JSON files.

In [15]:
import os
import json
import pandas as pd
import glob

# Define the data directory
DATA_DIR = '../DataAcqusitionLab/data/'
print(f"Looking for data in: {os.path.abspath(DATA_DIR)}")

Looking for data in: /Users/iara/Projects/Week5JupyterNotebooks/DataAcqusitionLab/data


## 1. Examine Directory Contents

In [22]:
# List all JSON files in the data directory
json_files = glob.glob(os.path.join(DATA_DIR, 'data/locations_*.json'))
json_files.sort()

print(f"Found {len(json_files)} JSON files in {DATA_DIR}")
if json_files:
    print(f"First file: {os.path.basename(json_files[0])}")
    print(f"Last file: {os.path.basename(json_files[-1])}")

Found 0 JSON files in ../DataAcqusitionLab/data/


## 2. Examine a Single JSON File Structure

In [23]:
# Load and examine the first JSON file
if json_files:
    sample_file = json_files[0]
    print(f"Examining: {os.path.basename(sample_file)}")
    
    with open(sample_file, 'r') as f:
        data = json.load(f)
    
    print(f"\nJSON structure:")
    print(f"Keys: {list(data.keys())}")
    
    if 'results' in data:
        results = data['results']
        print(f"Number of records in this file: {len(results)}")
        print(f"\nSample record:")
        print(results[0] if results else "No records found")
    
    if 'metadata' in data:
        metadata = data['metadata']
        print(f"\nMetadata:")
        for key, value in metadata.items():
            print(f"  {key}: {value}")

## 3. Load All JSON Files and Combine Data

In [24]:
# Function to load and combine all JSON files
def load_all_location_data(data_dir):
    """Load all location JSON files and combine into a single list."""
    json_files = glob.glob(os.path.join(data_dir, 'locations_*.json'))
    json_files.sort()
    
    all_results = []
    total_files = len(json_files)
    
    print(f"Loading {total_files} JSON files...")
    
    for i, file_path in enumerate(json_files):
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            if 'results' in data:
                results = data['results']
                all_results.extend(results)
                if i == 0 or (i + 1) % 10 == 0 or i == total_files - 1:
                    print(f"  Loaded {i+1}/{total_files} files, {len(all_results)} total records so far")
            else:
                print(f"  Warning: No 'results' key in {os.path.basename(file_path)}")
                
        except Exception as e:
            print(f"  Error loading {os.path.basename(file_path)}: {e}")
    
    print(f"\nCompleted loading {total_files} files with {len(all_results)} total records")
    return all_results

# Load all the data
all_location_data = load_all_location_data(DATA_DIR)

Loading 39 JSON files...
  Loaded 1/39 files, 1000 total records so far
  Loaded 10/39 files, 10000 total records so far
  Loaded 20/39 files, 20000 total records so far
  Loaded 30/39 files, 30000 total records so far
  Loaded 39/39 files, 38862 total records so far

Completed loading 39 files with 38862 total records


## 4. Create DataFrame from Combined Data

In [25]:
# Create DataFrame from the combined data
if all_location_data:
    df_locations = pd.DataFrame(all_location_data)
    
    print(f"DataFrame created successfully!")
    print(f"Shape: {df_locations.shape}")
    print(f"Columns: {list(df_locations.columns)}")
    
    # Display basic info
    print(f"\nData types:")
    print(df_locations.dtypes)
    
    print(f"\nFirst 5 rows:")
    display(df_locations.head())
else:
    print("No data loaded - cannot create DataFrame")

DataFrame created successfully!
Shape: (38862, 5)
Columns: ['mindate', 'maxdate', 'name', 'datacoverage', 'id']

Data types:
mindate          object
maxdate          object
name             object
datacoverage    float64
id               object
dtype: object

First 5 rows:


Unnamed: 0,mindate,maxdate,name,datacoverage,id
0,1983-01-01,2025-08-03,"Abu Dhabi, AE",0.997,CITY:AE000001
1,1944-03-01,2025-08-03,"Ajman, AE",1.0,CITY:AE000002
2,1944-03-01,2025-08-03,"Dubai, AE",1.0,CITY:AE000003
3,1944-03-01,2025-08-03,"Sharjah, AE",1.0,CITY:AE000006
4,1966-03-02,2021-08-30,"Kabul, AF",0.9969,CITY:AF000007


## 5. Data Summary and Analysis

In [26]:
# Basic data analysis
if 'df_locations' in locals() and not df_locations.empty:
    print("=== NOAA Locations Data Summary ===")
    print(f"Total locations: {len(df_locations):,}")
    print(f"Date range: {df_locations['mindate'].min()} to {df_locations['maxdate'].max()}")
    
    # Analyze location types by ID prefix
    print(f"\n=== Location Types ===")
    location_types = df_locations['id'].str.split(':', expand=True)[0].value_counts()
    print(location_types)
    
    # Data coverage statistics
    print(f"\n=== Data Coverage Statistics ===")
    print(df_locations['datacoverage'].describe())
    
    # Sample of different location types
    print(f"\n=== Sample Locations by Type ===")
    for loc_type in location_types.index[:3]:  # Show top 3 types
        sample = df_locations[df_locations['id'].str.startswith(loc_type + ':')].iloc[0]
        print(f"{loc_type}: {sample['name']} ({sample['id']})")
else:
    print("DataFrame not available for analysis")

=== NOAA Locations Data Summary ===
Total locations: 38,862
Date range: 1750-08-06 to 2025-08-05

=== Location Types ===
0
ZIP     30415
FIPS     3438
HUC      2667
CITY     1989
CLIM      353
Name: count, dtype: int64

=== Data Coverage Statistics ===
count    38862.000000
mean         0.985740
std          0.030188
min          0.075800
25%          0.950000
50%          1.000000
75%          1.000000
max          1.000000
Name: datacoverage, dtype: float64

=== Sample Locations by Type ===
ZIP: Highgate Center, VT 05459 (ZIP:05459)
FIPS: Alabama (FIPS:01)
HUC: New England Hydrologic Unit (HUC:01)


## 6. Data Validation

In [27]:
# Validate the data
if 'df_locations' in locals() and not df_locations.empty:
    print("=== Data Validation ===")
    
    # Check for missing values
    missing_counts = df_locations.isnull().sum()
    print(f"Missing values per column:")
    print(missing_counts)
    
    # Check for duplicates
    duplicate_count = df_locations.duplicated().sum()
    print(f"\nDuplicate rows: {duplicate_count}")
    
    # Check unique IDs
    unique_ids = df_locations['id'].nunique()
    total_rows = len(df_locations)
    print(f"Unique IDs: {unique_ids:,}")
    print(f"Total rows: {total_rows:,}")
    print(f"ID uniqueness: {'✓ All unique' if unique_ids == total_rows else '⚠ Duplicates found'}")
    
    # Expected total from NOAA API
    expected_total = 38862  # Current known total
    print(f"\n=== Record Count Validation ===")
    print(f"Expected records: {expected_total:,}")
    print(f"Actual records: {total_rows:,}")
    print(f"Status: {'✓ Match' if total_rows == expected_total else '⚠ Count mismatch'}")
else:
    print("DataFrame not available for validation")

=== Data Validation ===
Missing values per column:
mindate         0
maxdate         0
name            0
datacoverage    0
id              0
dtype: int64

Duplicate rows: 0
Unique IDs: 38,862
Total rows: 38,862
ID uniqueness: ✓ All unique

=== Record Count Validation ===
Expected records: 38,862
Actual records: 38,862
Status: ✓ Match


## Summary

This notebook successfully loads NOAA location data from JSON files into a pandas DataFrame for analysis. The data includes various location types (ZIP codes, FIPS codes, weather stations, etc.) with their associated date ranges and data coverage information.