# Imports

In [1]:
import os
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Constants

In [2]:
# Directory that holds the yearly data files; change it to match where the data is located.
# os.path.join(..., '') guarantees a trailing path separator, which the plain
# `data_root + filename` concatenations used for downloading and loading rely on
# (without it the files would end up named e.g. 'LOCAL_PATH_TO_DATA1887.csv.gz').
data_root = os.path.join('LOCAL_PATH_TO_DATA', '')

# Column labels applied to the yearly files when they are read with pandas.
column_names = [
    'ID',
    'DATE',
    'ELEMENT',
    'DATA VALUE',
    'M-FLAG',
    'Q-FLAG',
    'S-FLAG',
    'OBS-TIME',
]

# Data Download

The data is located in an AWS S3 bucket. This part of the notebook connects to the bucket, checks whether the files are already present in the local path, and downloads them if they are not.

### Setup Bucket connection

In [3]:
# Requests are sent unsigned (signature_version=UNSIGNED); the same
# configuration is used for both the low-level client and the resource API.
unsigned_config = Config(signature_version=UNSIGNED)

s3_client = boto3.client('s3', config=unsigned_config)
s3_resource = boto3.resource('s3', config=unsigned_config)

# Handle on the bucket that the cells below iterate over and download from.
bucket = s3_resource.Bucket('noaa-ghcn-pds')

### Download files

There are two types of files in the bucket: one `.csv` per year between 1763 and 2022, and one `.csv` per station. The yearly files are an aggregated version of the per-station ones, so we only use the yearly files for our analysis.
The per-station `.csv` files are disregarded.

Because the keys are sorted (yearly files first, per-station files after), the iteration stops at the first key that contains `by_station`.

In [6]:
# Count the keys listed before the first per-station key.
# Relies on the listing returning every yearly file ahead of any key
# containing 'by_station'.
size = 0
for obj in bucket.objects.all():
    if 'by_station' in obj.key:
        # First per-station key reached: nothing further is counted.
        break
    size += 1

print(f'Number of yearly files: {size}')

Number of yearly files:  260


In [26]:
# Download every yearly file that is not already present locally.
# Relies on the listing returning the yearly files ahead of any key that
# contains 'by_station'; the loop stops at the first such key.
for obj in bucket.objects.all():
    if 'by_station' in obj.key:
        break

    # Keep only the last path component of the key as the local file name.
    filename = os.path.basename(obj.key)
    local_path = data_root + filename

    if os.path.isfile(local_path):
        # Already downloaded on a previous run.
        continue

    # download_file cannot create missing directories, so make sure the
    # destination exists (skipped when the path has no directory part).
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Use the bucket handle's own name instead of repeating the literal.
    s3_client.download_file(bucket.name, obj.key, local_path)
    print(f'{filename} downloaded')

# Data Analysis

### Load file

In [3]:
def load_df(year):
    """Read the yearly file for ``year`` into a pandas DataFrame.

    The file path is built as ``data_root + '<year>.csv.gz'`` and the columns
    of the resulting frame are labelled with ``column_names``.
    """
    csv_path = f'{data_root}{year}.csv.gz'
    # NOTE(review): header=0 together with names= makes pandas treat the first
    # line of the file as a header row and replace it with column_names, so
    # that line is not returned as data -- confirm the files start with a header.
    return pd.read_csv(csv_path, header=0, names=column_names)

In [6]:
# Load the yearly file for 1887.
df = load_df(year=1887)