<h1 style="text-align: center;">NOAA Data Analysis</h1>

### Part 1 - Gathering data

Automating API calls

In [1]:
# importing all libraries
import urllib.request
import urllib.error
import json
import os
from key import token 

In [None]:
def call_api(url, token):
    """
        Make a GET request to the API and return a JSON response

        Args:
            url (str): The API endpoint URL
            token (str): The API token for authentication
        Returns:
            dict: The parsed JSON response from the API
        Summary of What Happens
        1. A Request object is created for the URL.
        2. A custom header (token) is added to the request.
        3. The request is sent to the server using urlopen.
        4. The server's response is read, decoded from bytes to a string, and parsed as JSON.
        5. The resulting Python object (data) contains the parsed JSON data.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header('token', token)
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode('utf-8'))
            # Get the remaining rate limit from the headers
            rate_limit_remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
            return data, rate_limit_remaining

    except urllib.error.HTTPError as e:
        # Handle HTTP errors (e.g., 404, 401)
        print(f"HTTP Error: {e.code} - {e.reason}")
    except urllib.error.URLError as e:
        # Handle URL errors (e.g., network issues)
        print(f"URL Error: {e.reason}")
    except json.JSONDecodeError as e:
        # Handle JSON parsing errors
        print(f"JSON Decode Error: {e.msg}")
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An unexpected error occurred: {e}")

    return None,0  # Return None if an error occurs

def save_json_to_file(data, filename):
    """
        Save JSON data to a file

        Args:
            data (dict): The JSON data to save
            filename (str): The name of the file to save the data to
    """
    try:
        with open(f'./data/{filename}', 'w') as f:
            json.dump(data, f, indent=4) # Pretty print the JSON, the indentation level is 4 spaces
            print(f'Data saved to {filename}')
    except IOError as e:
        # Handle file I/O errors
        print(f'File I/O Error: {e}')
    except Exception as e:
        # Handle any other unexpected errors
        print(f'An unexpected error occurred while saving the file: {e}')

In [3]:
web_site = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/locations'

limit = 1000  # The maximum number of records to retrieve in one request
offset_increment = limit  # Increment offset by the limit for each request



for i in range(0,40):  # Loop 39 times to create 39 files
    offset = i * offset_increment  # Calculate the offset for this iteration
    url = f"{web_site}?limit={limit}&offset={offset}"  # Construct the URL
    output_file = f"locations_{i}.json"  # Name the output file

    print(f"Calling API with: {url}")
    response, remaining_requests = call_api(url, token)

        # Check if the response is valid
    if response:
        results = response.get('results', [])
        print(f"Fetched {len(results)} records in this batch.")

        # Save the response to a file
        save_json_to_file(response, output_file)
        print(f"Data saved to {output_file}")

        # Stop if fewer records are returned than the limit
        if len(results) < limit:
            print("No more records to fetch.")
            break
    else:
        print(f"Failed to retrieve data for offset {offset}, exiting loop.")
        break

    


Calling API with: https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?limit=1000&offset=0
Fetched 1000 records in this batch.
Data saved to locations_0.json
Data saved to locations_0.json
Calling API with: https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?limit=1000&offset=1000
Fetched 1000 records in this batch.
Data saved to locations_1.json
Data saved to locations_1.json
Calling API with: https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?limit=1000&offset=2000
Fetched 1000 records in this batch.
Data saved to locations_2.json
Data saved to locations_2.json
Calling API with: https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?limit=1000&offset=3000
Fetched 1000 records in this batch.
Data saved to locations_3.json
Data saved to locations_3.json
Calling API with: https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?limit=1000&offset=4000
Fetched 1000 records in this batch.
Data saved to locations_4.json
Data saved to locations_4.json
Calling API with: https://www.ncdc.noaa.gov/cdo-web/a

### Part 2 - Transforming Data

Convert Json files to Pandas Dataframe



In [4]:
import pandas as pd
import os
import json

In [5]:

def read_json(file_path):
    '''
        Read the json file from specified file path

        Args:
            file_path (str): the location of the file

        Returns:
            data(dict): the data stored in the json file 
    
    '''
    with open(file_path, 'r') as files:
        data = json.load(files)
    return data


def read_all_json_files(root):
    '''
        Read all json files in a directory and store in a data frame

        Args:
            root(str): the directory where the files are stored

        Returns:
            json_df (pd.Dataframe): Pandas Dataframe
    """
    
    '''
    json_df = pd.DataFrame() # create an empty DataFrame

    # loop through all files in the given directory
    for files in os.listdir(root):
        full_path = f'{root}/{files}' # creates the full path for each file, os.path.join(root, files) also works

        try:
            data = read_json(full_path)

            holder_df = pd.DataFrame(data['results']) # temp Dataframe to hold data until joined into master dataframe
            holder_df['source'] = files # add 'source' column to track the data source
            json_df = pd.concat([json_df,holder_df], ignore_index=True)

        except Exception as e:
            print(f'Error reading {full_path}: {e}')
    return json_df



In [None]:
# check on the first few rows
climate_data_df = read_all_json_files('./data')
climate_data_df.head()

Unnamed: 0,mindate,maxdate,name,datacoverage,id,source
0,1997-03-06,2025-07-28,"Grand Isle, VT 05458",0.95,ZIP:05458,locations_10.json
1,1948-05-01,2025-07-28,"Highgate Center, VT 05459",1.0,ZIP:05459,locations_10.json
2,1995-05-08,2025-07-28,"Hinesburg, VT 05461",1.0,ZIP:05461,locations_10.json
3,1955-11-01,2025-07-28,"Huntington, VT 05462",1.0,ZIP:05462,locations_10.json
4,1997-03-06,2025-07-28,"Isle la Motte, VT 05463",0.95,ZIP:05463,locations_10.json


In [7]:
# validate data was accurately aquried
climate_data_df.shape

(38863, 6)

#### Explore Data

In [None]:
# check for duplicated rows
dups = climate_data_df[climate_data_df.duplicated()] # filters Climate df to return only rows where .duplicated() is True
dups

Unnamed: 0,mindate,maxdate,name,datacoverage,id,source


In [13]:
# secondary check for unique

if climate_data_df['id'].is_unique:
    print("All entries in the 'id' column are unique")
else:
    print("There are duplicate entries in the 'id' column")

There are duplicate entries in the 'id' column


In [17]:
# Find duplicate IDs

dup_id = climate_data_df[climate_data_df.duplicated(subset='id', keep=False)]

dup_id

Unnamed: 0,mindate,maxdate,name,datacoverage,id,source
30863,2003-03-01,2025-07-26,"Kutahya, TU",1.0,CITY:TU000037,locations_1.json
36862,2003-03-01,2025-07-26,"Kutahya, TU",1.0,CITY:TU000037,locations_0.json


In [None]:
# deleting the duplicate row
climate_data = climate_data_df.drop(30863)

In [None]:
# rechecking for duplicates
dups2 = climate_data[climate_data.duplicated()] # filters Climate df to return only rows where .duplicated() is True
dups2

Unnamed: 0,mindate,maxdate,name,datacoverage,id,source


In [21]:
# secondary check for unique

if climate_data['id'].is_unique:
    print("All entries in the 'id' column are unique")
else:
    print("There are duplicate entries in the 'id' column")

All entries in the 'id' column are unique
