In [1]:
import os
from pathlib import Path
import importlib

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('colheader_justify', 'left')
pd.set_option('display.max_rows', 20)

import psycopg2 as psql
import creds

dataFolder = './data/station_normals/'

* 1981-2010
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1981-2010/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1981-2010/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1981-2010/archive/)
    
* 1991-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1991-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1991-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1991-2020/archive/)
    
* 2006-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/2006-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/2006-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/2006-2020/archive/)

In [2]:
def combine_Files(combined_file, files_toCombine):
    '''
    Concatenates several CSV files that share the same header row into one CSV file.

    The first line of every input file is treated as its header. The header is written
    once (taken from the first input file that has one) and the remaining lines of every
    file are appended in the order the files are given. If combined_file already exists,
    nothing is written.

    Parameters
    ----------
    combined_file : path of the output file to create
    files_toCombine : iterable of paths of the input files

    Returns
    -------
    None; a status message is printed instead.
    '''
    if os.path.exists(combined_file):
        return print(f'{combined_file} exists...')

    with open(combined_file, 'w') as outfile:
        header_written = False
        for file in files_toCombine:
            with open(file, 'r') as infile:
                header = infile.readline()
                body = infile.read()

            # an empty file has no header, so keep waiting for the first real one
            if header and not header_written:
                outfile.write(header if header.endswith('\n') else f'{header}\n')
                header_written = True

            outfile.write(body)
            # without this, a file whose last row lacks a newline would be fused
            # with the first row of the next file
            if body and not body.endswith('\n'):
                outfile.write('\n')
    return print(f'{combined_file} created...')

In [3]:
import climate_normal_scripts as cns
importlib.reload(cns)  # pick up edits made to the script since the notebook kernel started

# climate normals to fetch: every period below, at every temporal resolution below
main_url = 'https://www.ncei.noaa.gov/data/'
normal_periods = ['1981-2010','1991-2020','2006-2020']
normal_types = ['normals-hourly','normals-daily', 'normals-monthly']

# climates[period][type] -> folder holding that product and the path of its station inventory
climates = {}
for normal_period in normal_periods:
    period_entry = {}
    climates[normal_period] = period_entry
    for normal_type in normal_types:
        location = f'{dataFolder}{normal_period}/{normal_type}/'
        cns.get_climate_normals(main_url, normal_type, normal_period, location)
        period_entry[normal_type] = dict(location=location, inventory_file=f'{location}station-inventory.csv')
        
        # file_list.append(f'{location}station-inventory.csv')
    
# mainInventory_file = f'./txt_files/station_inventory/all-stations.csv'  
# combine_Files(mainInventory_file, file_list)
# del file_list
# # opens massive csv file into pandas DataFrame to drop duplicate rows and then resaves csv file
# # note some station id values may be repeated due to other fields being different
# df = pd.read_csv(mainInventory_file)
# df = df.drop_duplicates(keep='last').sort_values(by=['state', 'name']).reset_index(drop=True)
# df.wmo_id = df.wmo_id.astype(dtype='Int64')
# df.to_csv(mainInventory_file,index=False)


1981-2010 normals-hourly already downloaded...
1981-2010 normals-daily already downloaded...
1981-2010 normals-monthly already downloaded...
1991-2020 normals-hourly already downloaded...
1991-2020 normals-daily already downloaded...
1991-2020 normals-monthly already downloaded...
2006-2020 normals-hourly already downloaded...
2006-2020 normals-daily already downloaded...
2006-2020 normals-monthly already downloaded...


In [4]:
def format_date(df, normal_type):
    '''
    Function which reads in the DATE field and compares it to the expected full time range. If there are any missing
    time records, those missing times are added into the record, with nan values in every other column for those
    missing times. Populates the month, day and hour fields from the full time range and rewrites DATE as a label.

    Years are dummy values. Daily files use a leap year (1904) so that Feb-29 is included; hourly and monthly files
    use 1900, which has no leap day.

    Parameters
    ----------
    df : DataFrame with a DATE column ('MM-DDTHH:MM:SS' for hourly, 'MM-DD' for daily, 'MM' for monthly)
    normal_type : 'normals-hourly', 'normals-daily' or 'normals-monthly'

    Returns
    -------
    New DataFrame with one row per expected time step and a default integer index. DATE is moved to the end of the
    columns; month/day/hour are overwritten in place when present and appended otherwise. The input df is not modified.

    Raises
    ------
    ValueError if normal_type is not one of the three supported values.
    '''

    if normal_type == 'normals-hourly':
        full_range = pd.date_range(start='1900-01-01 00:00:00', end='1900-12-31 23:00:00', freq='h')  # timeindex for all hours
        dateOUT_format = '%b-%d %H:%M'
        # explicit %H:%M:%S rather than %X, whose meaning depends on the active locale
        parsed_dates = pd.to_datetime(df['DATE'], format='%m-%dT%H:%M:%S')

    elif normal_type == 'normals-daily':
        full_range = pd.date_range(start='1904-01-01', end='1904-12-31', freq='D')
        dateOUT_format = '%b-%d'
        # a leap year is appended so that '02-29' parses
        parsed_dates = pd.to_datetime(df['DATE'] + '-1904', format='%m-%d-%Y')

    elif normal_type == 'normals-monthly':
        full_range = pd.date_range(start='1900-01', end='1900-12', freq='MS')
        dateOUT_format = '%b'
        parsed_dates = pd.to_datetime(df['DATE'], format='%m')

    else:
        raise ValueError(f'unsupported normal_type: {normal_type!r}')

    # assign() builds a new frame, so the caller's DATE column is left as it was
    df = (
        df.assign(DATE=parsed_dates)
          .set_index('DATE')
          .reindex(full_range)          # adds all-nan rows for every missing time step
          .reset_index(drop=True)       # back to the default index, dropping the datetime values
    )
    df['DATE'] = full_range.strftime(dateOUT_format)

    # bracket assignment creates the column when it is missing; attribute assignment would not
    df['month'] = full_range.month
    df['day'] = full_range.day
    df['hour'] = full_range.hour

    return df

In [60]:
def format_hourly_variables(df, normal_period):
    '''
    Formats the variables of an hourly climate normals record.

    * HLY-WIND-1STDIR / HLY-WIND-2NDDIR values 1-8 are replaced by compass labels (N, NE, ... NW).
    * For the 1981-2010 period every remaining value column is divided by 10
      (presumably the raw values are stored in tenths -- confirm against the NOAA documentation).
      Identifier/date columns, *_ATTRIBUTES columns and the three wind direction columns are left alone.
    * Other periods get no numeric rescaling.

    Parameters
    ----------
    df : DataFrame holding one station's hourly record
    normal_period : period label, e.g. '1981-2010'

    Returns
    -------
    New DataFrame; the input df is not modified.
    '''
    df = df.copy()   # work on a copy so the caller's frame is not changed as a side effect

    # fields that will be changed to wind direction label based off value from 1-8
    wind_dir_labels = {1.0:'N', 2.0:'NE', 3.0:'E', 4.0:'SE', 5.0: 'S', 6.0:'SW', 7.0:'W', 8.0:'NW'}
    wind_dir = [x for x in ('HLY-WIND-1STDIR', 'HLY-WIND-2NDDIR') if x in df.columns]   # tolerate frames without wind columns
    if wind_dir:
        df[wind_dir] = df[wind_dir].replace(wind_dir_labels)

    if normal_period != '1981-2010':
        return df   # only the 1981-2010 files are rescaled

    # substrings marking fields that must not be rescaled
    ignore_headers = ['STATION', 'DATE', 'month', 'day', 'hour',
                      'HLY-WIND-VCTDIR', 'HLY-WIND-1STDIR', 'HLY-WIND-2NDDIR', '_ATTRIBUTES']
    normal_variables = [x for x in df.columns if not any(y in x for y in ignore_headers)]
    if normal_variables:
        df[normal_variables] = df[normal_variables].divide(10)

    return df

In [146]:
def format_daily_variables(df, normal_period):
    '''
    Formats the variables of a daily climate normals record.

    Only the 1981-2010 period is rescaled:
    * columns containing 'PRCP' but not 'PCTALL' are divided by 100
      (presumably stored as hundredths of an inch -- confirm against the NOAA documentation)
    * columns containing 'SNWD' but not 'PCTALL' are left as they are (snow depth values are whole numbers)
    * every other value column is divided by 10
    Identifier/date columns, *_ATTRIBUTES columns and the CLDD / HTDD / GRDD columns are never changed.

    Parameters
    ----------
    df : DataFrame holding one station's daily record
    normal_period : period label, e.g. '1981-2010'

    Returns
    -------
    New DataFrame; the input df is not modified.
    '''
    df = df.copy()   # work on a copy so the caller's frame is not changed as a side effect

    if normal_period != '1981-2010':
        return df   # no rescaling is applied to the other periods

    # substrings marking fields that must not be formatted/changed
    ignore_headers = ['STATION', 'DATE', 'month', 'day', 'hour', 'CLDD', 'HTDD', 'GRDD', '_ATTRIBUTES']
    normal_variables = [x for x in df.columns if not any(y in x for y in ignore_headers)]

    prcp_variables = [x for x in normal_variables if 'PRCP' in x and 'PCTALL' not in x]
    snwdpctl_variables = [x for x in normal_variables if 'SNWD' in x and 'PCTALL' not in x]
    already_handled = set(prcp_variables) | set(snwdpctl_variables)
    tenths_variables = [x for x in normal_variables if x not in already_handled]

    if prcp_variables:
        df[prcp_variables] = df[prcp_variables].divide(100)
    if tenths_variables:
        df[tenths_variables] = df[tenths_variables].divide(10)

    return df

In [None]:
def format_monthly_variables(df, normal_period):
    '''
    Placeholder for the monthly counterpart of format_hourly_variables / format_daily_variables.

    No monthly formatting rules are implemented yet, so df is handed back unchanged. Returning the
    frame (rather than None) keeps `df = format_monthly_variables(df, normal_period)` safe to use.
    '''
    # TODO: implement the monthly formatting rules
    return df

In [147]:

# standardizing normal files
# NOTE: still a trial run -- the period/type are picked by hand and the loop stops after the first station file.
# The header file read here is written by the header-generation cell of this notebook, which has to be run first.
normal_period = '1981-2010'
# normal_period = '1991-2020'
# normal_type = 'normals-hourly'
normal_type = 'normals-daily'
# normal_type = 'normals-monthly'


# for normal_period in normal_periods:
#     for normal_type in normal_types:

headerFile = f'./txt_files/csv_headers/headers-{normal_period}-{normal_type}.txt'
reformatted_File = f'{climates[normal_period][normal_type]["location"]}test.csv'
with open(headerFile, 'r') as hfile:
    headers = [line.strip('\n') for line in hfile]   # one expected column name per line

normal_files = Path(f'{climates[normal_period][normal_type]["location"]}station_files/').glob('*.csv')
for normal_file in normal_files:
    df = pd.read_csv(normal_file)
    df = df.reindex(columns=headers) # standardize headers: adds missing columns(if any) and reorders columns based off headerFile
    df = df.replace(to_replace=[-9999.0, -7777.0, -6666.0, -4444.0, ' '], value= np.nan)
    df = format_date(df, normal_type)   # format_date takes (df, normal_type) only

    ## now to do formatting for individual variables based off documentation
    if normal_type == 'normals-hourly':
        df = format_hourly_variables(df, normal_period)
    elif normal_type == 'normals-daily':
        df = format_daily_variables(df, normal_period)
    elif normal_type == 'normals-monthly':
        # df = format_monthly_variables(df, normal_period)
        pass

    # station metadata to replace NaN in the rows format_date added for missing dates.
    # Row 0 may itself be one of those added rows, so take the first STATION value that is present.
    station_ids = df['STATION'].dropna()
    if not station_ids.empty:
        df = df.fillna(value={'STATION': station_ids.iloc[0]})
    df = df.reindex(columns=headers) # ensures columns in correct order after formatting file
    break


df.head(2)

Unnamed: 0,STATION,DATE,month,day,hour,DLY-TAVG-NORMAL,DLY-TAVG-NORMAL_ATTRIBUTES,DLY-TAVG-STDDEV,DLY-TAVG-STDDEV_ATTRIBUTES,DLY-TMAX-NORMAL,...,DLY-SNWD-75PCTL,DLY-SNWD-75PCTL_ATTRIBUTES,DLY-SNWD-PCTALL-GE001WI,DLY-SNWD-PCTALL-GE001WI_ATTRIBUTES,DLY-SNWD-PCTALL-GE003WI,DLY-SNWD-PCTALL-GE003WI_ATTRIBUTES,DLY-SNWD-PCTALL-GE005WI,DLY-SNWD-PCTALL-GE005WI_ATTRIBUTES,DLY-SNWD-PCTALL-GE010WI,DLY-SNWD-PCTALL-GE010WI_ATTRIBUTES
0,AQC00914000,Jan-01,1,1,0,,,,,,...,,,0.0,P,0.0,P,0.0,P,0.0,P
1,AQC00914000,Jan-02,1,2,0,,,,,,...,,,0.0,P,0.0,P,0.0,P,0.0,P


366

In [102]:
# Gather the header row of every station file so the column layouts can be compared.
# Both loops stop after their first pass, so only the first period/type pair is inspected.
normal_periods = ['1981-2010','1991-2020','2006-2020']
normal_types = ['normals-hourly','normals-daily', 'normals-monthly']

for normal_period in normal_periods:
    header_list = []
    for normal_type in normal_types:
        type_folder = climates[normal_period][normal_type]["location"]
        combined_file = f'{type_folder}{normal_type}-{normal_period}.csv'
        files_location = f'{type_folder}station_files/'
        files_toCombine = Path(files_location).glob('*.csv')
        # combine_Files(combined_file, files_toCombine)

        # one [file name, raw first line] pair per station file
        for station_csv in files_toCombine:
            with open(station_csv, 'r') as handle:
                first_line = handle.readline()
            header_list.append([station_csv.name, first_line])

        break
    break

In [9]:
def connect_db(db_Name):
    '''
    Opens a psycopg2 connection to the database db_Name using the settings in creds.

    If the first attempt raises OperationalError the database is assumed to be missing:
    create_db(db_Name) is called and the connection is attempted exactly one more time.

    Parameters
    ----------
    db_Name : name of the database to connect to

    Returns
    -------
    The open psycopg2 connection; the caller is responsible for closing it.

    Raises
    ------
    psycopg2.OperationalError if the connection still fails after the create attempt.
    '''
    def _open_connection():
        return psql.connect(
                user=creds.USER,
                password=creds.PASS,
                host=creds.HOST,
                port=creds.PORT,
                database=db_Name
                )

    try:
        conn = _open_connection()
    except psql.OperationalError:
        print(f'There is no database named {db_Name}...')
        create_db(db_Name)
        # Retry once only. OperationalError is not specific to a missing database, so
        # retrying without a limit would recurse forever on a failure that creating
        # the database cannot fix; a second failure is allowed to propagate.
        conn = _open_connection()

    print(f'Connected: {db_Name}')
    return conn
        
def create_db(db_Name):
    '''
    Creates the database db_Name through a connection to the 'postgres' database, using the settings in creds.

    The name is passed as a quoted SQL identifier, so it is used exactly as given (case preserved)
    and cannot alter the statement.

    Parameters
    ----------
    db_Name : name of the database to create

    Returns
    -------
    None; a status message is printed for both the "created" and the "already exists" outcome.
    '''
    from psycopg2 import sql as pg_sql   # local import: only this function composes SQL

    conn = psql.connect(
                user=creds.USER,
                password=creds.PASS,
                host=creds.HOST,
                port=creds.PORT,
                database='postgres'
                )
    try:
        conn.autocommit = True   # CREATE DATABASE cannot run inside a transaction block
        cursor = conn.cursor()
        statement = pg_sql.SQL('CREATE DATABASE {}').format(pg_sql.Identifier(db_Name))

        try:
            cursor.execute(statement)
        except psql.errors.lookup('42P04'): #psql error code for duplicatedatabase
            return print(f"....Database: {db_Name} already exists....")

        print(f'....Database: {db_Name} created....')
        return print(f"....Database: {db_Name} created successfully....")
    finally:
        conn.close()   # released on every path, including unexpected errors from execute

In [10]:
# Smoke test: open a connection to the project database (connect_db calls create_db if the first attempt fails).
a = connect_db('climate_normals_db')

Connected: climate_normals_db


In [11]:
# Release the connection held in `a`.
a.close()


In [12]:
# Calling create_db for a database that is already there only reports that it exists.
create_db('climate_normals_db')

....Database: climate_normals_db already exists....


In [6]:
import csv

# Writes one header file per (period, type): the baseline fields followed, for every variable listed in
# the matching variables file, by the variable itself and its companion columns.
# Relies on normal_periods / normal_types defined in an earlier cell.
baselines = [['STATION'], ['DATE'],['month'],['day'],['hour']]
for normal_period in normal_periods:
    for normal_type in normal_types:
        headerFile = f'./txt_files/csv_headers/headers-{normal_period}-{normal_type}.txt'
        varFile = f'./txt_files/variables/variables-{normal_period}-{normal_type}.csv'

        # Read the variable names first: opening headerFile with 'w' truncates it, so a missing or
        # unreadable varFile must fail before the existing header file is touched.
        with open(varFile, 'r') as infile:
            variables = [row[0] for row in csv.reader(infile) if row]   # blank lines carry no variable

        with open(headerFile, 'w' , newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerows(baselines)
            for variable in variables:
                writer.writerow([variable])
                if normal_period == '1981-2010':
                    writer.writerow([variable + '_ATTRIBUTES'])
                else:
                    writer.writerow(['meas_flag_' + variable])
                    writer.writerow(['comp_flag_' + variable])
                    writer.writerow(['years_' + variable])