In [1]:
import os
from pathlib import Path
import urllib.request as req
from bs4 import BeautifulSoup as bsoup
import tarfile

import numpy as np
import pandas as pd
# Notebook display options for pandas frames:
# no truncation of long cell contents,
pd.set_option('display.max_colwidth', None)
# left-justified column headers,
pd.set_option('colheader_justify', 'left')
# and a truncated view for frames longer than 20 rows.
pd.set_option('display.max_rows', 20)

import psycopg2 as psql
# creds supplies USER, PASS, HOST and PORT for the database connections below.
import creds

# Root folder for downloaded normals; get_tar_extract builds its target
# paths as {dataFolder}{normal_period}/{normal_type}/.
dataFolder = './data/station_normals/'

* 1981-2010
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1981-2010/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1981-2010/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1981-2010/archive/)
    
* 1991-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/1991-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/1991-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/1991-2020/archive/)
    
* 2006-2020
    * [hourly](https://www.ncei.noaa.gov/data/normals-hourly/2006-2020/archive/), [daily](https://www.ncei.noaa.gov/data/normals-daily/2006-2020/archive/), [monthly](https://www.ncei.noaa.gov/data/normals-monthly/2006-2020/archive/)

In [2]:
# Function to grab the tar files for each of the normal variations:
def get_tar_extract(main_url, normal_type, normal_period):
    """Download and unpack the station archive for one normals product.

    Parameters
    ----------
    main_url : str
        Base data URL ending in '/'. The archive directory is built as
        ``{main_url}{normal_type}/{normal_period}/archive/``.
    normal_type : str
        Product folder name, e.g. 'normals-hourly'.
    normal_period : str
        Period folder name, e.g. '1981-2010'.

    Returns
    -------
    str
        ``{dataFolder}{normal_period}/{normal_type}/``. The archive content
        is extracted into its ``station_files`` sub-folder.

    Raises
    ------
    RuntimeError
        If the archive index page has no link whose text contains 'station'.
    """
    # Local import: only needed to clean up after a failed extraction.
    import shutil

    normalFolder = f'{dataFolder}{normal_period}/{normal_type}/'
    station_dir = f'{normalFolder}station_files'

    # An existing station_files folder is treated as "already downloaded".
    if os.path.exists(station_dir):
        print(f'{normal_period} {normal_type} already downloaded...')
        return normalFolder

    base_url = f'{main_url}{normal_type}/{normal_period}/archive/'

    if normal_period == '1981-2010':
        # For this period the archive is addressed directly by product name.
        tar_url = f'{base_url}{normal_type}.tar.gz'
    else:
        # For the other periods the archive name is discovered from the
        # index page: the link whose text contains 'station'.
        tar_url = None
        with req.urlopen(base_url) as html_page:
            soup = bsoup(html_page, "html.parser")
        for link in soup.findAll('a'):
            if 'station' in link.text:
                # If several links match, the last one wins (as before).
                tar_url = f'{base_url}{link.text}'
        if tar_url is None:
            raise RuntimeError(f'No station archive link found at {base_url}')

    try:
        # Both the HTTP response and the tar stream are closed on exit.
        with req.urlopen(tar_url) as ftpstream:
            with tarfile.open(fileobj=ftpstream, mode='r|gz') as file:
                print(f'Getting {normal_period} normals...')
                print(f'*** {normal_type} extracting to {normalFolder}station_files ***')
                file.extractall(station_dir)
    except BaseException:
        # station_dir did not exist before this call (checked above), so a
        # partial extraction is removed; otherwise the next run would treat
        # the incomplete folder as "already downloaded".
        shutil.rmtree(station_dir, ignore_errors=True)
        raise

    print(f'*** {normal_type} extraction completed ***')
    return normalFolder


In [5]:
def combine_Files(combined_file, files_toCombine):
    """Concatenate CSV files into one file, keeping a single header row.

    The first line of every input is treated as its header. Only the first
    non-empty header is written; the remaining lines of every input follow
    in iteration order.

    Parameters
    ----------
    combined_file : str or path-like
        Output path. If it already exists nothing is written.
    files_toCombine : iterable of str or path-like
        Input files (any iterable, including a generator).

    Returns
    -------
    None
        A status line is printed in both cases.
    """
    if os.path.exists(combined_file):
        print(f'{combined_file} exists...')
        return

    header_written = False
    with open(combined_file, 'w') as outfile:
        for file in files_toCombine:
            with open(file, 'r') as infile:
                header = infile.readline()
                body = infile.read()

            # An empty input must not consume the "header already written" slot.
            if header and not header_written:
                outfile.write(header if header.endswith('\n') else f'{header}\n')
                header_written = True

            if body:
                outfile.write(body)
                # Without this, the last row of a file lacking a trailing
                # newline would fuse with the first row of the next file.
                if not body.endswith('\n'):
                    outfile.write('\n')

    print(f'{combined_file} created...')

In [133]:
# Fetch (or reuse) every climate-normals archive and build the master station list.

normal_periods = ['1981-2010', '1991-2020', '2006-2020']
normal_types = ['normals-hourly', 'normals-daily', 'normals-monthly']
main_url = 'https://www.ncei.noaa.gov/data/'

# climates[period][type] -> folder of the product and path of its station inventory.
climates = {}
file_list = []
for normal_period in normal_periods:
    climates[normal_period] = {}
    for normal_type in normal_types:
        location = get_tar_extract(main_url, normal_type, normal_period)
        inventory_file = f'{location}station-inventory.csv'
        climates[normal_period][normal_type] = dict(location=location, inventory_file=inventory_file)
        file_list.append(inventory_file)

mainInventory_file = './txt_files/all-stations.csv'
combine_Files(mainInventory_file, file_list)
del file_list

# Reload the combined inventory, drop exact duplicate rows, sort it and save it back.
# A station id can still appear more than once when its other fields differ.
df = (
    pd.read_csv(mainInventory_file)
    .drop_duplicates(keep='last')
    .sort_values(by=['state', 'name'])
    .reset_index(drop=True)
)
# Nullable integer dtype so missing wmo_id values do not force floats.
df['wmo_id'] = df['wmo_id'].astype('Int64')
df.to_csv(mainInventory_file, index=False)


1981-2010 normals-hourly already downloaded...
1981-2010 normals-daily already downloaded...
1981-2010 normals-monthly already downloaded...
1991-2020 normals-hourly already downloaded...
1991-2020 normals-daily already downloaded...
1991-2020 normals-monthly already downloaded...
2006-2020 normals-hourly already downloaded...
2006-2020 normals-daily already downloaded...
2006-2020 normals-monthly already downloaded...
./txt_files/all-stations.csv exists...


In [145]:
# Prototype: load a single 1981-2010 hourly station file for inspection.
normal_period = '1981-2010'
normal_type = 'normals-hourly'
headerFile = f'./txt_files/csv_headers/headers-{normal_period}-{normal_type}.txt'
reformatted_File = f'{climates[normal_period][normal_type]["location"]}test.csv'

# One column name per line of the header file.
with open(headerFile, 'r') as hfile:
    headers = [line.strip('\n') for line in hfile.readlines()]

normal_files = Path(f'{climates[normal_period][normal_type]["location"]}station_files/').glob('*.csv')
for normal_file in normal_files:
    df = pd.read_csv(normal_file)
    # Planned reformatting, not enabled yet:
    # if normal_period == '1981-2010':
    #     dates_dict = format_date(df.DATE, '%m-%dT%X')
    #     df = df.append(dates_dict, ignore_index=True)
    # df = df.reindex(columns=headers)
    break  # only the first station file is read while prototyping
df.head()


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,HLY-CLDH-NORMAL,HLY-CLDH-NORMAL_ATTRIBUTES,HLY-CLOD-PCTBKN,HLY-CLOD-PCTBKN_ATTRIBUTES,...,HLY-WIND-2NDPCT,HLY-WIND-2NDPCT_ATTRIBUTES,HLY-WIND-AVGSPD,HLY-WIND-AVGSPD_ATTRIBUTES,HLY-WIND-PCTCLM,HLY-WIND-PCTCLM_ATTRIBUTES,HLY-WIND-VCTDIR,HLY-WIND-VCTDIR_ATTRIBUTES,HLY-WIND-VCTSPD,HLY-WIND-VCTSPD_ATTRIBUTES
0,AQW00061705,01-01T00:00:00,-14.33056,-170.71361,3.7,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, AS AQ",154.0,P,383,C,...,169.0,S,94.0,S,86.0,S,37.0,S,33.0,S
1,AQW00061705,01-01T01:00:00,-14.33056,-170.71361,3.7,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, AS AQ",153.0,C,363,C,...,164.0,C,91.0,C,62.0,C,43.0,C,33.0,C
2,AQW00061705,01-01T02:00:00,-14.33056,-170.71361,3.7,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, AS AQ",151.0,C,380,C,...,173.0,C,89.0,C,74.0,C,49.0,C,34.0,C
3,AQW00061705,01-01T03:00:00,-14.33056,-170.71361,3.7,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, AS AQ",149.0,C,354,C,...,175.0,C,89.0,C,71.0,C,40.0,C,30.0,C
4,AQW00061705,01-01T04:00:00,-14.33056,-170.71361,3.7,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, AS AQ",147.0,C,382,C,...,181.0,C,90.0,C,78.0,C,46.0,C,32.0,C


In [149]:
# Count missing values per column of the frame currently bound to `df`.
df.isnull().sum()

STATION                        0
DATE                           0
LATITUDE                       0
LONGITUDE                      0
ELEVATION                      0
                              ..
HLY-WIND-PCTCLM_ATTRIBUTES    88
HLY-WIND-VCTDIR               88
HLY-WIND-VCTDIR_ATTRIBUTES    88
HLY-WIND-VCTSPD               88
HLY-WIND-VCTSPD_ATTRIBUTES    88
Length: 58, dtype: int64

In [92]:
def format_date(date_series, format_IN):
    """Parse date strings and split them into month, day and hour parts.

    Parameters
    ----------
    date_series : pandas.Series (or any sequence accepted by pandas.Series)
        Date values to parse.
    format_IN : str or None
        strptime-style format passed to ``pd.to_datetime``.

    Returns
    -------
    dict
        'DATE'  : Series of strings formatted as '%m-%d %H'
        'month' : Series of int64 month numbers
        'day'   : Series of int64 day-of-month numbers
        'hour'  : Series of int64 hours
    """
    raw = pd.Series(date_series)

    year_directives = ('%Y', '%y', '%G', '%c', '%x')
    has_year = format_IN is None or any(d in format_IN for d in year_directives)
    if not has_year and not pd.api.types.is_datetime64_any_dtype(raw):
        # Without a year pandas parses into 1900, which is not a leap year,
        # so "02-29" would raise. Pin a leap year; the year never reaches
        # the returned values.
        raw = '2000-' + raw.astype(str)
        format_IN = f'%Y-{format_IN}'

    temp = pd.to_datetime(raw, format=format_IN)

    # Vectorised accessors replace the per-row strftime calls; the casts keep
    # the int64 dtype the per-row int() conversion produced.
    return {
        'DATE': temp.dt.strftime('%m-%d %H'),
        'month': temp.dt.month.astype('int64'),
        'day': temp.dt.day.astype('int64'),
        'hour': temp.dt.hour.astype('int64'),
    }


In [121]:
# Demo: reindexing on columns reorders them and adds unknown names as NaN columns.
col = ['a', 'b', 'c']
cols = ['b', 'da', 'c', 'a']
a = pd.DataFrame([[1, 2, 3]], columns=col)
print(a)
b = a.reindex(cols, axis='columns')
print(b)

   a  b  c
0  1  2  3
   b  da  c  a
0  2 NaN  3  1


In [39]:
# Prototype: collect the first line of every station file
# (currently limited to the first period and the first normal type).
normal_periods = ['1981-2010', '1991-2020', '2006-2020']
normal_types = ['normals-hourly', 'normals-daily', 'normals-monthly']

for normal_period in normal_periods:
    header_list = []
    for normal_type in normal_types:
        type_folder = climates[normal_period][normal_type]["location"]
        combined_file = f'{type_folder}{normal_type}-{normal_period}.csv'
        files_location = f'{type_folder}station_files/'
        files_toCombine = Path(files_location).glob('*.csv')
        # combine_Files(combined_file, files_toCombine)  # merge step, currently disabled

        # Record [file name, first line] for each station file.
        for file in files_toCombine:
            with file.open('r') as infile:
                first_line = infile.readline()
            header_list.append([file.name, first_line])

        break  # stop after the first normal type
    break  # stop after the first normal period

In [6]:
# Open a connection to the project database.
# NOTE: connect_db is defined in a later cell of this notebook, so that cell must run first.
a = connect_db('climate_normals_db')

Connected: climate_normals_db


In [5]:
def connect_db(db_Name):
    """Connect to ``db_Name``, creating the database once if the first attempt fails.

    Connection settings (user, password, host, port) come from ``creds``.

    Parameters
    ----------
    db_Name : str
        Name of the database to connect to.

    Returns
    -------
    connection
        The open psycopg2 connection.

    Raises
    ------
    psql.OperationalError
        If the connection still fails after ``create_db`` has run. The
        retry is bounded so a persistent failure (wrong credentials, server
        down) cannot recurse forever.
    """
    for attempt in (1, 2):
        try:
            conn = psql.connect(
                    user=creds.USER,
                    password=creds.PASS,
                    host=creds.HOST,
                    port=creds.PORT,
                    database=db_Name
                    )
        except psql.OperationalError as e:
            if attempt == 2:
                # Creating the database did not help; surface the real error.
                raise
            # OperationalError covers more than a missing database, so show
            # the actual reason before trying to create it.
            print(f'Could not connect to {db_Name}: {e}')
            print(f'Trying to create database {db_Name}...')
            create_db(db_Name)
        else:
            print(f'Connected: {db_Name}')
            return conn
        
def create_db(db_Name):
    """Create the database ``db_Name`` unless it already exists.

    Connects to the 'postgres' database with the settings from ``creds``
    and issues CREATE DATABASE. A status line is printed in both cases.

    Parameters
    ----------
    db_Name : str
        Name of the database to create.

    Returns
    -------
    None
    """
    # Local import keeps this block self-contained; used to quote the name.
    from psycopg2 import sql

    conn = psql.connect(
                user=creds.USER,
                password=creds.PASS,
                host=creds.HOST,
                port=creds.PORT,
                database='postgres'
                )
    try:
        # CREATE DATABASE cannot run inside a transaction block.
        conn.autocommit = True
        # Quote the name as an identifier instead of pasting it into the SQL
        # text: this keeps its exact spelling and prevents injection.
        statement = sql.SQL('CREATE DATABASE {}').format(sql.Identifier(db_Name))
        with conn.cursor() as cursor:
            try:
                cursor.execute(statement)
            except psql.errors.lookup('42P04'):  # psql error code for duplicate database
                print(f"....Database: {db_Name} already exists....")
            else:
                print(f"....Database: {db_Name} created successfully....")
    finally:
        # Close the connection on every path, including unexpected errors.
        conn.close()

In [7]:
# Close the object bound to `a` (presumably the connection returned by connect_db — confirm cell order).
a.close()


In [8]:
# Make sure the project database exists; create_db prints a notice when it is already present.
create_db('climate_normals_db')

....Database: climate_normals_db already exists....


# 1981-2010 headers
# Column names follow the pattern RRR-EEEE-SSSSSS[-CCCCCCC], e.g. HLY-CLDH-NORMAL.
# The lookup keys below are the lowercase form of each code.
# RRR: reporting period
reporting_period = {
    'ann': 'annual',
    'djf': 'December, January, February',
    'dly': 'daily',
    'hly': 'hourly',
    'jja': 'June, July, August',
    'mam': 'March, April, May',
    'mly': 'monthly',
    'mtd': 'month-to-date',
    'rtp': 'return periods',
    'son': 'September, October, November',
    'ytd': 'year-to-date'
}
# EEEE: meteorological element (second part of a column name), keyed by lowercase code
met_elem = {
    'cldd': 'cooling degree days',
    'cldh': 'cooling degree hours',
    'clod': 'clouds',
    'dewp': 'dew point temperature',
    'dutr': 'diurnal temperature range',
    'hidx': 'heat index',
    'htdd': 'heating degree days',
    'htdh': 'heating degree hours',
    'prcp': 'precipitation',
    'pres': 'sea level pressure',
    'snow': 'snowfall',
    'snwd': 'snow depth',
    'tavg': 'daily mean temperature (average of tmax and tmin)',
    'temp': 'temperature',
    'tmax': 'daily maximum temperature',
    'tmin': 'daily minimum temperature',
    'wchl': 'wind chill',
    'wind': 'wind'
}

# SSSSSS: statistic (third part of a column name), keyed by lowercase code.
# In 'baseNN' the NN is a placeholder for the base value, as the description states.
statistic = {
    '10pctl': 'Climatological 10th percentile',
    '1stdir': 'Prevailing Wind Direction',
    '1stpct': 'Prevailing Wind Percentage',
    '2nddir': 'Secondary Wind Direction',
    '2ndpct': 'Secondary Wind Percentage',
    '25pctl': 'Climatological 25th percentile',
    '50pctl': 'Climatological 50th percentile',
    '75pctl': 'Climatological 75th percentile',
    '90pctl': 'Climatological 90th percentile',
    'avgnds': 'Average Number of Days (followed by a condition)',
    'avgspd': 'Average Wind Speed',
    'baseNN': 'Average of base NN (other than 65F) Heating or Cooling Degree Days',
    'normal': 'Climatological Average',
    'pctall': 'Probability of Occurrence (followed by a condition)',
    'pctbkn': 'Percent Broken (clouds)',
    'pctclm': 'Percent Calm (winds)',
    'pctclr': 'Percent Clear (clouds)',
    'pctfew': 'Percent Few (clouds)',
    'pctovc': 'Percent Overcast (clouds)',
    'pctsct': 'Percent Scattered (clouds)',
    'vctdir': 'Mean Wind Vector Direction',
    'vctspd': 'Mean Wind Vector Magnitude'
}

# -CCCCCCC: optional condition suffix (last part of a column name).
# NNN in each key is a placeholder; the allowed values are listed in the description.
condition = {
    'geNNNhi' : 'greater than or equal to NNN hundredths of inches NNN can be 001,010,050,100 (for precipitation)', 
    'geNNNti' : 'greater than or equal to NNN tenths of inches NNN can be 001,010,030,050,100 (for snowfall)',
    'geNNNwi' : 'greater than or equal to NNN whole inches NNN can be 001,003,005,010 (for snow depth)',
    'grthNNN' : 'greater than or equal to NNN whole degrees Fahrenheit NNN can be 040,050,060,070,080,090,100',
    'lsthNNN' : 'less than or equal to NNN whole degrees Fahrenheit NNN can be 000,010,020,032,040,050,060'
}