In [1]:
import os
from pathlib import Path
import importlib

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('colheader_justify', 'left')
pd.set_option('display.max_rows', 20)

import psycopg2 as psql
import creds

# Root folders used throughout the notebook.
dataFolder = './data/station_normals/'
txtFiles_Folder = './txt_files/'

# normals_dict[period][type] starts as an empty dict for every
# period / type combination and is filled in by later cells.
normals_dict = {
    period: {kind: {} for kind in ('normals-hourly', 'normals-daily', 'normals-monthly')}
    for period in ('1981-2010', '1991-2020', '2006-2020')
}

* 1981-2010
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1981-2010/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1981-2010/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1981-2010/archive/)
    
* 1991-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1991-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1991-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1991-2020/archive/)
    
* 2006-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/2006-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/2006-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/2006-2020/archive/)

In [2]:
def combine_Files(combined_file, files_toCombine):
    '''
    Concatenates several delimited text files that share a header line into one file.

    The header (first line) is written once, taken from the first non-empty input
    file; the first line of every other file is skipped. Nothing is done when
    combined_file already exists.

    Parameters
    ----------
    combined_file : path of the file to create.
    files_toCombine : iterable of paths of the files to concatenate, in order.

    Returns
    -------
    None. A status message is printed.
    '''
    if os.path.exists(combined_file):
        return print(f'{combined_file} exists...')

    with open(combined_file, 'w') as outfile:
        header_written = False
        for file in files_toCombine:
            with open(file, 'r') as infile:
                header = infile.readline()
                # An empty input file has no header; wait for one that does.
                if header and not header_written:
                    outfile.write(header if header.endswith('\n') else f'{header}\n')
                    header_written = True

                # Stream line by line rather than loading the whole file in memory.
                last_line = '\n'
                for last_line in infile:
                    outfile.write(last_line)
                # A file lacking a final newline would otherwise have its last row
                # fused with the first row of the next file.
                if not last_line.endswith('\n'):
                    outfile.write('\n')
    return print(f'{combined_file} created...')

In [169]:
import climate_normal_scripts as cns
importlib.reload(cns) # reloads script if script was changed after notebook kernel started

# get climate normals files
normal_periods = ['1981-2010','1991-2020','2006-2020']
normal_types = ['normals-hourly','normals-daily', 'normals-monthly']
main_url = 'https://www.ncei.noaa.gov/data/'

# Iterate the explicit lists. Unpacking normals_dict.items() into `normal_types`
# would rebind that name from the list above to one of the inner dicts.
for normal_period in normal_periods:
    for normal_type in normal_types:
        location = f'{dataFolder}{normal_period}/{normal_type}/'
        cns.get_climate_normals(main_url, normal_type, normal_period, location)
        # Record where the files of this period/type live and the path of the
        # matching station inventory file for use by later cells.
        normals_dict.setdefault(normal_period, {})[normal_type] = {
            'files_location': location,
            'inventory_file': f'{txtFiles_Folder}station-inventory-{normal_period}_{normal_type}.csv',
        }

cns.generate_header_files()
    
# mainInventory_file = f'{txtFiles_Folder}station_inventory/all-stations.csv'  
# combine_Files(mainInventory_file, file_list)
# del file_list
# # opens massive csv file into pandas DataFrame to drop duplicate rows and then resaves csv file
# # note some station id values may be repeated due to other fields being different
# df = pd.read_csv(mainInventory_file)
# df = df.drop_duplicates(keep='last').sort_values(by=['state', 'name']).reset_index(drop=True)
# df.wmo_id = df.wmo_id.astype(dtype='Int64')
# df.to_csv(mainInventory_file,index=False)


1981-2010 normals-hourly already downloaded...
1981-2010 normals-daily already downloaded...
1981-2010 normals-monthly already downloaded...
1991-2020 normals-hourly already downloaded...
1991-2020 normals-daily already downloaded...
1991-2020 normals-monthly already downloaded...
2006-2020 normals-hourly already downloaded...
2006-2020 normals-daily already downloaded...
2006-2020 normals-monthly already downloaded...
Headers files generated at ./txt_files/csv_headers/


In [4]:
def format_date(df, normal_type):
    '''
    Expands a station record so that it has exactly one row per expected time step.

    The DATE field is parsed and compared to the full expected time range. Any
    missing time steps are added as rows whose other fields are NaN. DATE is then
    rewritten as a formatted label and the month, day and hour columns are
    (re)populated from the full range.

    Years are dummy values. For daily files the year is a leap year (1904) so that
    the leap day is included. Hourly and monthly files use 1900, which is not a
    leap year, so no leap day is generated for them.

    Parameters
    ----------
    df : DataFrame with a DATE column ('MM-DDTHH:MM:SS' for hourly, 'MM-DD' for
        daily, month number for monthly). It is not modified.
    normal_type : 'normals-hourly', 'normals-daily' or 'normals-monthly'.

    Returns
    -------
    A new DataFrame with a default integer index, one row per time step. DATE is
    re-added after the reindex, so it ends up as the last column unless the caller
    reorders the columns.

    Raises
    ------
    ValueError if normal_type is not one of the three supported values.
    '''
    dates = df['DATE']

    if normal_type == 'normals-hourly':
        # Offset object instead of the 'H' alias, which newer pandas deprecates.
        full_range = pd.date_range(start='1900-01-01 00:00:00', end='1900-12-31 23:00:00',
                                   freq=pd.offsets.Hour())                  # timeindex for all hours
        dateOUT_format = '%b-%d %H:%M'
        # Explicit %H:%M:%S rather than the locale-dependent %X.
        parsed = pd.to_datetime(dates, format='%m-%dT%H:%M:%S')

    elif normal_type == 'normals-daily':
        full_range = pd.date_range(start='1904-01-01', end='1904-12-31', freq='D')
        dateOUT_format = '%b-%d'
        # A leap year is appended so that '02-29' parses.
        parsed = pd.to_datetime(dates + '-1904', format='%m-%d-%Y')

    elif normal_type == 'normals-monthly':
        full_range = pd.date_range(start='1900-01', end='1900-12', freq='MS')
        dateOUT_format = '%b'
        parsed = pd.to_datetime(dates, format='%m')

    else:
        raise ValueError(f'Unsupported normal_type: {normal_type!r}')

    # assign() works on a copy, so the caller's DATE column is left untouched.
    # Reindexing on the full range inserts all-NaN rows for the missing time steps.
    out = (
        df.assign(DATE=parsed)
          .set_index('DATE')
          .reindex(full_range)
          .reset_index(drop=True)
    )
    out['DATE'] = full_range.strftime(dateOUT_format)

    # Item assignment creates the column when it is absent; attribute assignment
    # (out.month = ...) would only set a Python attribute in that case.
    out['month'] = full_range.month
    out['day'] = full_range.day
    out['hour'] = full_range.hour

    return out

In [183]:
def format_variables(df, headersFormat):
    '''
    Rescales and relabels the variable columns of a normals DataFrame.

    Each entry of headersFormat is a comma separated string whose first field is a
    column name and whose second field, when present, names the stored format:
      * 'Tenths'         -> column is divided by 10
      * 'Hundredths'     -> column is divided by 100
      * 'Wind_Direction' -> codes 1.0-8.0 are replaced by compass labels (N, NE, ... NW)
    Entries with any other format, or without a format field, are left unchanged.

    Parameters
    ----------
    df : DataFrame containing every column named in a formatted entry. It is
        modified in place.
    headersFormat : iterable of 'name,format[,...]' strings.

    Returns
    -------
    The same DataFrame object, after formatting.
    '''
    wind_dir_labels = {1.0:'N', 2.0:'NE', 3.0:'E', 4.0:'SE', 5.0: 'S', 6.0:'SW', 7.0:'W', 8.0:'NW'}

    # Single pass over the headers, grouping column names by their format.
    columns_by_format = {'Tenths': [], 'Hundredths': [], 'Wind_Direction': []}
    for header in headersFormat:
        # partition() tolerates entries that have no format field at all.
        name, _, remainder = header.partition(',')
        # strip() so a stray '\r' or space after the format does not disable it.
        fmt = remainder.split(',')[0].strip()
        if fmt in columns_by_format:
            columns_by_format[fmt].append(name)

    tenths = columns_by_format['Tenths']
    hundredths = columns_by_format['Hundredths']
    wind_dir = columns_by_format['Wind_Direction']

    if tenths:
        df[tenths] = df[tenths].divide(10)
    if hundredths:
        df[hundredths] = df[hundredths].divide(100)
    if wind_dir:
        df[wind_dir] = df[wind_dir].replace(wind_dir_labels)
    return df

In [194]:

# standardizing normal files
normal_period = '1981-2010'
# normal_period = '1991-2020'
normal_type = 'normals-hourly'
# normal_type = 'normals-daily'
# normal_type = 'normals-monthly'


# for normal_period in normal_periods:
#     for normal_type in normal_types:

normals_location = normals_dict[normal_period][normal_type]["files_location"]
headerFile = f'{txtFiles_Folder}csv_headers/headers-{normal_period}-{normal_type}.txt'
reformatted_File = f'{normals_location}test.csv'

# Each line of the header file is 'column_name,format...'; keep both the full
# line (for format_variables) and the bare column name (for column ordering).
with open(headerFile, 'r') as hfile:
    headersFormat = [line.strip('\n') for line in hfile]
headers = [line.split(',')[0] for line in headersFormat]

normal_files = Path(f'{normals_location}station_files/').glob('*.csv')
for normal_file in normal_files:
    df = pd.read_csv(normal_file)
    df = df.reindex(columns=headers) # standardize headers: adds missing columns(if any) and reorders columns based off headerFile
    df = df.replace(to_replace=[-9999.0, -7777.0, -6666.0, -4444.0, ' '], value= np.nan)

    # Capture the station id BEFORE format_date inserts all-NaN rows for missing
    # time steps: if the first time step is missing, row 0 of the expanded frame
    # holds NaN and could not be used as the fill value.
    station_ids = df['STATION'].dropna()

    df = format_date(df, normal_type)

    # The first five header entries are skipped; in the frame displayed below these
    # are STATION, DATE, month, day and hour, which are not weather variables.
    df = format_variables(df, headersFormat[5:])

    if not station_ids.empty:
        # station metadata to replace NaN in the metadata column
        df = df.fillna(value={'STATION': station_ids.iloc[0]})
    df = df.reindex(columns=headers) # ensures columns in correct order after formatting file
    break # only the first station file is processed while the pipeline is being developed


df.head(2)

Unnamed: 0,STATION,DATE,month,day,hour,HLY-TEMP-NORMAL,HLY-TEMP-NORMAL_ATTRIBUTES,HLY-TEMP-10PCTL,HLY-TEMP-10PCTL_ATTRIBUTES,HLY-TEMP-90PCTL,...,HLY-CLOD-PCTCLR,HLY-CLOD-PCTCLR_ATTRIBUTES,HLY-CLOD-PCTFEW,HLY-CLOD-PCTFEW_ATTRIBUTES,HLY-CLOD-PCTSCT,HLY-CLOD-PCTSCT_ATTRIBUTES,HLY-CLOD-PCTBKN,HLY-CLOD-PCTBKN_ATTRIBUTES,HLY-CLOD-PCTOVC,HLY-CLOD-PCTOVC_ATTRIBUTES
0,AQW00061705,Jan-01 00:00,1,1,0,80.4,P,75.9,P,83.8,...,0.5,C,7.3,C,15.2,C,38.3,C,38.8,C
1,AQW00061705,Jan-01 01:00,1,1,1,80.3,C,76.6,C,83.5,...,0.0,C,6.5,C,14.0,C,36.3,C,43.2,C


In [102]:
normal_periods = ['1981-2010','1991-2020','2006-2020']
normal_types = ['normals-hourly','normals-daily', 'normals-monthly']

# Collects [file name, header line] for every station file of a period/type.
# The two `break` statements stop after the first type of the first period.
# The explicit lists are iterated so that `normal_types` is not rebound to one of
# the inner dicts of normals_dict.
for normal_period in normal_periods:
    header_list =[]
    for normal_type in normal_types:
        type_location = normals_dict[normal_period][normal_type]["files_location"]
        combined_file = f'{type_location}{normal_type}-{normal_period}.csv'
        files_location = f'{type_location}station_files/'
        files_toCombine = Path(files_location).glob('*.csv')
        # combine_Files(combined_file, files_toCombine)

        for file in files_toCombine:
            with open(file, 'r') as infile:
                header_list.append([file.name, infile.readline()])

        break
    break

In [9]:
def connect_db(db_Name, _create_if_missing=True):
    '''
    Opens a connection to the database db_Name using the settings in `creds`.

    If the first attempt fails with an OperationalError, the database is assumed to
    be missing: create_db is called and the connection is attempted once more.
    A second failure is re-raised, because an OperationalError can also have other
    causes (server unreachable, bad credentials) and retrying indefinitely would
    only end in a RecursionError.

    Parameters
    ----------
    db_Name : name of the database to connect to.
    _create_if_missing : internal flag; False on the retry so that the database is
        created, and the connection retried, at most once.

    Returns
    -------
    The open connection object. The caller is responsible for closing it.

    Raises
    ------
    psycopg2.OperationalError if the connection still fails after the create attempt.
    '''
    try:
        conn = psql.connect(
                user=creds.USER,
                password=creds.PASS,
                host=creds.HOST,
                port=creds.PORT,
                database=db_Name
                )
    except psql.OperationalError:
        if not _create_if_missing:
            raise
        print(f'There is no database named {db_Name}...')
        create_db(db_Name)
        return connect_db(db_Name, _create_if_missing=False)
    else:
        print(f'Connected: {db_Name}')
        return conn
        
def create_db(db_Name):
    '''
    Creates the database db_Name, connecting through the 'postgres' database.

    The name is passed as a quoted SQL identifier instead of being pasted into the
    statement text. The database is therefore created with exactly the spelling
    that connect_db later passes as `database=`, and a name containing special
    characters cannot alter the statement.

    Parameters
    ----------
    db_Name : name of the database to create.

    Returns
    -------
    None. A message is printed saying whether the database was created or already
    existed.
    '''
    from psycopg2 import errors as pg_errors, sql as pg_sql

    conn = psql.connect(
                user=creds.USER,
                password=creds.PASS,
                host=creds.HOST,
                port=creds.PORT,
                database='postgres'
                )
    try:
        # CREATE DATABASE cannot run inside a transaction block.
        conn.autocommit = True
        statement = pg_sql.SQL('CREATE DATABASE {}').format(pg_sql.Identifier(db_Name))
        with conn.cursor() as cursor:
            cursor.execute(statement)
    except pg_errors.DuplicateDatabase: # SQLSTATE 42P04
        return print(f"....Database: {db_Name} already exists....")
    else:
        return print(f"....Database: {db_Name} created successfully....")
    finally:
        # Runs on success, on duplicate-database and on any other error, so the
        # connection is never leaked.
        conn.close()

In [10]:
# Open a connection to climate_normals_db; connect_db calls create_db and retries if the connection attempt fails.
a = connect_db('climate_normals_db')

Connected: climate_normals_db


In [11]:
# Close the connection returned by connect_db.
a.close()


In [12]:
# Calling create_db for a database that already exists only prints an "already exists" message.
create_db('climate_normals_db')

....Database: climate_normals_db already exists....
