In [1]:
import os
import urllib.request as req
from bs4 import BeautifulSoup as bsoup
import tarfile

import numpy as np
import pandas as pd
# pandas display options for this notebook:
# do not truncate the contents of wide columns,
pd.set_option('display.max_colwidth', None)
# left-justify column headers,
pd.set_option('colheader_justify', 'left')
# and cap the number of rows shown for a displayed frame.
pd.set_option('display.max_rows', 20)

import psycopg2 as psql
import creds

# Root folder for the downloaded normals. Paths below are built by plain string
# concatenation (f'{dataFolder}{normal_period}/...'), so the trailing slash is required.
dataFolder = './data/normals/'

* 1981-2010
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1981-2010/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1981-2010/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1981-2010/archive/)
    
* 1991-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1991-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1991-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1991-2020/archive/)
    
* 2006-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/2006-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/2006-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/2006-2020/archive/)

In [2]:
def get_tar_extract(main_url, normal_type, normal_period):
    """Download and extract the normals archive for one period/type pair.

    Parameters
    ----------
    main_url : str
        Base data URL ending in '/', e.g. 'https://www.ncei.noaa.gov/data/'.
    normal_type : str
        Dataset folder name, e.g. 'normals-hourly'.
    normal_period : str
        Period folder name, e.g. '1991-2020'.

    Returns
    -------
    str
        The local folder (under ``dataFolder``) the archive was extracted to.

    Raises
    ------
    RuntimeError
        If the archive listing page contains no link whose text includes 'station'.

    Notes
    -----
    An existing target folder is treated as "already downloaded" and returned
    untouched. To keep that check trustworthy, a folder left behind by a failed
    or interrupted extraction is removed before the error is re-raised.
    """
    import shutil  # local import: only needed to clean up after a failed extraction

    normalFolder = f'{dataFolder}{normal_period}/{normal_type}/'

    if os.path.exists(normalFolder):
        return normalFolder

    base_url = f'{main_url}{normal_type}/{normal_period}/archive/'

    if normal_period == '1981-2010':
        # This period's archive has a fixed file name.
        tar_url = f'{base_url}{normal_type}.tar.gz'
    else:
        # Other periods: find the archive by scanning the directory listing.
        with req.urlopen(base_url) as html_page:
            soup = bsoup(html_page, "html.parser")

        tar_url = None
        for link in soup.find_all('a'):
            # No break on purpose: as before, the LAST matching link wins.
            if 'station' in link.text:
                tar_url = f'{base_url}{link.text}'

        if tar_url is None:
            raise RuntimeError(
                f"No link containing 'station' found at {base_url}; "
                "cannot locate the archive to download."
            )

    # Use the 'data' extraction filter where this Python version provides it;
    # it rejects archive members that would escape the target folder.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    try:
        # Mode 'r|gz' reads the archive as a stream straight from the HTTP response.
        with req.urlopen(tar_url) as ftpstream, \
                tarfile.open(fileobj=ftpstream, mode='r|gz') as archive:
            print(f'Getting {normal_period} normals...')
            print(f'*** {normal_type} extracting to {normalFolder} ***')
            archive.extractall(normalFolder, **extract_kwargs)
    except BaseException:
        # The folder did not exist on entry, so anything there now is a partial
        # extraction; remove it so the next run retries instead of skipping.
        shutil.rmtree(normalFolder, ignore_errors=True)
        raise

    print(f'*** {normal_type} extraction completed ***')
    return normalFolder


In [3]:
def get_station_inventory(normal_period, normal_type):
    """Create (if missing) the station inventory CSV for one period/type folder.

    The inventory is the subset of rows of
    ``{dataFolder}{normal_period}/station-inventory.csv`` whose ``station_id``
    matches a file in the extracted folder recorded in
    ``climates[normal_period][normal_type]['location']``. The period-level
    inventory file must already exist; this function does not download it.

    Parameters
    ----------
    normal_period : str
        Key into ``climates``, e.g. '1981-2010'.
    normal_type : str
        Key into ``climates[normal_period]``, e.g. 'normals-daily'.

    Returns
    -------
    str
        Path of the per-folder ``station-inventory.csv`` (existing or newly written).

    Raises
    ------
    ValueError
        If a file in the folder does not correspond to any station in the
        period-level inventory.
    """
    location = f'{climates[normal_period][normal_type]["location"]}'
    inventory_csvName = f'{location}station-inventory.csv'

    if os.path.exists(inventory_csvName):
        return inventory_csvName

    print(f'----- Creating {inventory_csvName}... -----')
    # Fix: the path previously used the undefined name `period`.
    # Int64 keeps wmo_id integral even though some stations have no WMO id.
    df = pd.read_csv(
        f'{dataFolder}{normal_period}/station-inventory.csv',
        dtype={'wmo_id': 'Int64'},
    )

    # station_id -> first row position (same row list.index() would return),
    # built once so each file lookup is O(1).
    row_of_station = {}
    for row, station_id in enumerate(df.station_id):
        row_of_station.setdefault(station_id, row)

    station_rows = []
    for file in os.listdir(location):
        # File names are '<station_id>.<ext>'.
        station = file.split('.')[0]
        if station not in row_of_station:
            raise ValueError(
                f'{file!r} in {location} does not match any station_id in the '
                f'{normal_period} station inventory'
            )
        station_rows.append(row_of_station[station])

    # Rows are written in directory-listing order, as before.
    inventory_df = df.iloc[station_rows]
    inventory_df.to_csv(inventory_csvName, index=False)

    return inventory_csvName


In [4]:
normal_periods = ['1981-2010','1991-2020','2006-2020']
normal_list = ['normals-hourly','normals-daily', 'normals-monthly']
main_url = 'https://www.ncei.noaa.gov/data/'

# climates[period][type] -> {'location': <extracted folder>, 'inventory-file': <csv path>}
# Step 1: download/extract every period x type combination (skipped when already on disk).
climates = {}
for normal_period in normal_periods:
    climates[normal_period] = {}
    for normal_type in normal_list:
        climates[normal_period][normal_type] = {'location' : get_tar_extract(main_url, normal_type, normal_period)}

# Step 2: make sure each extracted folder has its own station inventory CSV.
file_list = []
for normal_period in normal_periods:
    for normal_type in normal_list:
        inventory_file = get_station_inventory(normal_period, normal_type)
        climates[normal_period][normal_type]['inventory-file'] = inventory_file
        file_list.append(inventory_file)

# Step 3: concatenate all per-folder inventories into one CSV under a single header.
mainInventory_file = './txt_files/all-stations.csv'
# The output folder is not created anywhere else, so create it here.
os.makedirs(os.path.dirname(mainInventory_file), exist_ok=True)
with open(mainInventory_file, 'w') as outfile:
    outfile.write('station_id,latitude,longitude,elevation,state,name,network,wmo_id\n')
    for file in file_list:
        with open(file, 'r') as infile:
            infile.readline()  # skip this file's own header row
            rows = infile.read()
        outfile.write(rows)
        # Guard against a source file without a final newline fusing two rows.
        if rows and not rows.endswith('\n'):
            outfile.write('\n')

# Step 4: reload, drop rows duplicated across periods/types, sort, and save in place.
# Int64 matches how the per-period inventories are read and keeps wmo_id from
# being rewritten as floats (e.g. 12345.0) when some stations have no WMO id.
df = pd.read_csv(mainInventory_file, dtype={'wmo_id': 'Int64'})
df = df.drop_duplicates(keep='last').sort_values(by=['state', 'name']).reset_index(drop=True)
df.to_csv(mainInventory_file,index=False)


In [5]:
def connect_db(db_Name):
    """Return an open psycopg2 connection to ``db_Name``, creating the database if needed.

    Connection settings come from the ``creds`` module. If the first connection
    attempt raises ``OperationalError``, the error is printed, ``create_db`` is
    called, and the connection is retried exactly once.

    Parameters
    ----------
    db_Name : str
        Name of the database to connect to.

    Returns
    -------
    connection
        An open psycopg2 connection; the caller is responsible for closing it.

    Raises
    ------
    psycopg2.OperationalError
        If the retry also fails (bad credentials, unreachable server, ...), or if
        ``create_db`` itself cannot connect. Previously this case recursed without
        bound.
    """
    def _open():
        return psql.connect(
            user=creds.USER,
            password=creds.PASS,
            host=creds.HOST,
            port=creds.PORT,
            database=db_Name,
        )

    try:
        conn = _open()
    except psql.OperationalError as e:
        # OperationalError covers more than a missing database, so show the
        # real reason instead of assuming one.
        print(f'Could not connect to {db_Name} ({str(e).strip()}); trying to create it...')
        create_db(db_Name)
        # Single retry: a second failure propagates to the caller.
        conn = _open()

    print(f'Connected: {db_Name}')
    return conn
        
def create_db(db_Name):
    """Create the PostgreSQL database ``db_Name`` if it does not already exist.

    Connects to the 'postgres' maintenance database with the settings from
    ``creds``, issues CREATE DATABASE, and prints the outcome. The name is passed
    as a quoted identifier, so it is created exactly as spelled (case preserved)
    and cannot inject SQL.

    Parameters
    ----------
    db_Name : str
        Name of the database to create.

    Returns
    -------
    None

    Raises
    ------
    psycopg2.Error
        For any failure other than "database already exists"; the connection
        is closed before the error propagates.
    """
    from psycopg2 import sql as pg_sql  # local import; psycopg2 is already a notebook dependency

    conn = psql.connect(
        user=creds.USER,
        password=creds.PASS,
        host=creds.HOST,
        port=creds.PORT,
        database='postgres',
    )
    try:
        # CREATE DATABASE cannot run inside a transaction block.
        conn.autocommit = True
        with conn.cursor() as cursor:
            cursor.execute(
                pg_sql.SQL('CREATE DATABASE {}').format(pg_sql.Identifier(db_Name))
            )
    except psql.errors.DuplicateDatabase:  # SQLSTATE 42P04
        print(f"....Database: {db_Name} already exists....")
    else:
        print(f"....Database: {db_Name} created successfully....")
    finally:
        conn.close()

In [6]:
# Smoke test: open a connection (connect_db calls create_db first if the connect attempt fails).
a = connect_db('climate_normals_db')

Connected: climate_normals_db


In [7]:
# Close the connection opened by the smoke test above.
a.close()


In [8]:
# Calling create_db for a database that already exists only prints a notice.
create_db('climate_normals_db')

....Database: climate_normals_db already exists....


In [54]:
# Concatenates every station inventory CSV in file_list (built in an earlier cell)
# into one file under a single header row.
# NOTE(review): this repeats the write step of the earlier inventory cell against the
# same path but without the drop_duplicates/sort step, so running it replaces the
# de-duplicated all-stations.csv with the raw concatenation. The original comment
# mentioned combining "all normal files per type"; presumably this cell is an
# unfinished copy that was meant to do that instead. TODO confirm and rework or remove.
mainInventory_file = f'./txt_files/all-stations.csv'        
with open(mainInventory_file, 'w') as outfile:
    outfile.write('station_id,latitude,longitude,elevation,state,name,network,wmo_id\n')
    for file in file_list:
        with open(file, 'r') as infile:
            # Skip this file's own header row; the combined header was written above.
            infile.readline()
            outfile.write(infile.read())

# 1981-2010 headers have the form RRR-EEEE-SSSSSS[-CCCCCCC].
# RRR: reporting period code -> description.
reporting_period = dict(
    ann='annual',
    djf='December, January, February',
    dly='daily',
    hly='hourly',
    jja='June, July, August',
    mam='March, April, May',
    mly='monthly',
    mtd='month-to-date',
    rtp='return periods',
    son='September, October, November',
    ytd='year-to-date',
)
# EEEE: meteorological element code -> description.
_met_elem_pairs = [
    ('cldd', 'cooling degree days'),
    ('cldh', 'cooling degree hours'),
    ('clod', 'clouds'),
    ('dewp', 'dew point temperature'),
    ('dutr', 'diurnal temperature range'),
    ('hidx', 'heat index'),
    ('htdd', 'heating degree days'),
    ('htdh', 'heating degree hours'),
    ('prcp', 'precipitation'),
    ('pres', 'sea level pressure'),
    ('snow', 'snowfall'),
    ('snwd', 'snow depth'),
    ('tavg', 'daily mean temperature (average of tmax and tmin)'),
    ('temp', 'temperature'),
    ('tmax', 'daily maximum temperature'),
    ('tmin', 'daily minimum temperature'),
    ('wchl', 'wind chill'),
    ('wind', 'wind'),
]
met_elem = dict(_met_elem_pairs)

# SSSSSS: statistic code -> description.
# Fix: the 'baseNN' entry was missing its closing quote and trailing comma,
# which made the whole cell a SyntaxError.
statistic = {
    '10pctl': 'Climatological 10th percentile',
    '1stdir': 'Prevailing Wind Direction',
    '1stpct': 'Prevailing Wind Percentage',
    '2nddir': 'Secondary Wind Direction',
    '2ndpct': 'Secondary Wind Percentage',
    '25pctl': 'Climatological 25th percentile',
    '50pctl': 'Climatological 50th percentile',
    '75pctl': 'Climatological 75th percentile',
    '90pctl': 'Climatological 90th percentile',
    'avgnds': 'Average Number of Days (followed by a condition)',
    'avgspd': 'Average Wind Speed',
    'baseNN': 'Average of base NN (other than 65F) Heating or Cooling Degree Days',
    'normal': 'Climatological Average',
    'pctall': 'Probability of Occurrence (followed by a condition)',
    'pctbkn': 'Percent Broken (clouds)',
    'pctclm': 'Percent Calm (winds)',
    'pctclr': 'Percent Clear (clouds)',
    'pctfew': 'Percent Few (clouds)',
    'pctovc': 'Percent Overcast (clouds)',
    'pctsct': 'Percent Scattered (clouds)',
    'vctdir': 'Mean Wind Vector Direction',
    'vctspd': 'Mean Wind Vector Magnitude'
}

# -CCCCCCC: optional condition suffix -> description (NNN is a numeric threshold).
condition = dict(
    geNNNhi=(
        'greater than or equal to NNN hundredths of inches '
        'NNN can be 001,010,050,100 (for precipitation)'
    ),
    geNNNti=(
        'greater than or equal to NNN tenths of inches '
        'NNN can be 001,010,030,050,100 (for snowfall)'
    ),
    geNNNwi=(
        'greater than or equal to NNN whole inches '
        'NNN can be 001,003,005,010 (for snow depth)'
    ),
    grthNNN=(
        'greater than or equal to NNN whole degrees Fahrenheit '
        'NNN can be 040,050,060,070,080,090,100'
    ),
    lsthNNN=(
        'less than or equal to NNN whole degrees Fahrenheit '
        'NNN can be 000,010,020,032,040,050,060'
    ),
)