NOAA stores Free Hourly Weather Data at: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/

The most detailed data is in a very cumbersome format, but an easy-to-parse subset of the data can be found at: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/

In [None]:
# Names for the fixed-width fields, in file order. They were taken from
# ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/isd-lite-format.pdf,
# which describes the subset of data used in this notebook.
isd_fwf_cols = [
    'year',
    'month',
    'day',
    'hour',
    'air_temp_c',
    'dew_pt_temp_c',
    'sea_lvl_press_hectoPa',
    'wnd_dir_360',
    'wnd_spd_mtrpersec',
    'sky_condition',
    'precip_hrly',
    'precip_6hr_accum',
]

In [27]:
# Importing the python libraries that I use.
import pandas as pd
import numpy as np

In [36]:
# Importing the table defining the available data (isd-history.csv on the NOAA FTP server).
# There is a row for each station and it includes the begin and end date of available data.
# NOTE: this reads directly from the FTP URL, so it needs network access.
isd_stations_data = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv')
# Preview the first rows of the station table.
isd_stations_data.head()

In [45]:
# I want data for DC, so I've chosen to search for the local airport. Reagan National Airport (DCA).
# Note that all of the Station Names are uppercase, so the search term is uppercase too.
# na=False makes rows with a missing STATION NAME count as non-matches, which
# removes the need for a separate null check on the column.
DCA_search = isd_stations_data.loc[
    isd_stations_data['STATION NAME'].str.contains('REAGAN', na=False)
]

In [81]:
# BEGIN and END hold dates whose first four characters are the year.
# Take them from the first matching station and build the inclusive range
# of years to download.
start_year = str(DCA_search.BEGIN.iloc[0])[:4]
end_year = str(DCA_search.END.iloc[0])[:4]
year_range = range(int(start_year), 1 + int(end_year))
year_range

range(1936, 2020)

In [80]:
# Creating the station ID by which the ftp site is organized.
# It is the USAF and WBAN identifiers of the first matching station joined by a
# hyphen, e.g. '724050-13743'.
# ISD file names use a 6-character USAF and a 5-digit WBAN; zfill restores any
# leading zeros that are lost when pandas parses a column as integers.
station_id = (str(DCA_search.USAF.iloc[0]).zfill(6)
              + '-'
              + str(DCA_search.WBAN.iloc[0]).zfill(5))
station_id

'724050-13743'

In [94]:
# Function to loop through a given station ID for a given range of years.
def download_isd_lite(station_id, year_range):
    """Download and tidy NOAA ISD-Lite hourly data for one station.

    Parameters
    ----------
    station_id : str
        Station identifier as it appears in the ISD-Lite file names,
        i.e. '<USAF>-<WBAN>' (for example '724050-13743').
    year_range : iterable of int
        Years to fetch; one gzipped fixed-width file is requested per year.

    Returns
    -------
    pandas.DataFrame
        One row per observation with the columns named in the file-level
        ``isd_fwf_cols`` list plus a parsed ``date_time`` column.
        -9999 placeholders are replaced by NaN and the scaled columns are
        divided by 10. If no yearly file could be read, an empty frame with
        the same column names is returned.
    """
    base_url = 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/'

    # Collect the yearly frames in a list and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows on every pass.
    yearly_frames = []
    for year in year_range:
        url = '{0}{1}/{2}-{1}.gz'.format(base_url, year, station_id)
        # There can be gaps of missing years in the data (mostly from decades
        # ago), so a year that cannot be read is skipped on a best-effort basis.
        # Catching Exception rather than using a bare except keeps Ctrl-C
        # (KeyboardInterrupt) working during a long download.
        try:
            yearly_frames.append(pd.read_fwf(url, header=None))
        except Exception:
            continue

    # Nothing could be downloaded: return an empty, correctly labelled frame
    # instead of failing later with a column-length mismatch.
    if not yearly_frames:
        return pd.DataFrame(columns=list(isd_fwf_cols) + ['date_time'])

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    isd_df = pd.concat(yearly_frames, ignore_index=True)

    # Setting the column names
    isd_df.columns = isd_fwf_cols

    # NOAA populates missing values with -9999, but I've chosen to replace them with NaN's.
    isd_df = isd_df.replace(-9999, np.nan)

    # Some of the columns are scaled by a factor of 10 to eliminate decimal points,
    # which would complicate the fixed width format that NOAA has chosen to utilize
    scaled_columns = ['air_temp_c', 'dew_pt_temp_c', 'sea_lvl_press_hectoPa',
                      'wnd_spd_mtrpersec', 'precip_hrly', 'precip_6hr_accum']
    scaling_factor = 10
    # Resolving the scaling factor
    isd_df[scaled_columns] = isd_df[scaled_columns] / scaling_factor

    # Creating a date_time column from the various time-based columns NOAA provides.
    # First build a 'DD/MM/YYYY/HH' string per row (casting to int so values
    # such as 9.0 become '09'), then let pandas parse it with a matching format.
    date_text = (
        isd_df['day'].astype('int').astype('str').str.zfill(2) + '/'
        + isd_df['month'].astype('int').astype('str').str.zfill(2) + '/'
        + isd_df['year'].astype('int').astype('str') + '/'
        + isd_df['hour'].astype('int').astype('str').str.zfill(2)
    )
    isd_df['date_time'] = pd.to_datetime(date_text, format='%d/%m/%Y/%H')

    return isd_df

# Excel Users
You could create the FTP URLs in Excel and then manually click each link you create. If you have the station_id and the year for each file you want in separate cells, you can construct the URLs as follows:

=HYPERLINK("ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/"&year_cell&"/"&station_id_cell&"-"&year_cell&".gz")

You can then unzip each file and open it in Excel. Using Excel's "Text to Columns" feature with the "Original data type" option set to "Fixed width", Excel will correctly separate the data into columns. You can then manually add the column headers as desired, and manually aggregate the data for multiple years and/or stations as needed.

In [96]:
# Running the function for DCA for all years in year_range.
# One file per year is requested from the NOAA FTP server, so this needs network access.
isd_df = download_isd_lite(station_id, year_range)

In [99]:
# Inspecting the results: column dtypes, non-null counts and memory usage.
isd_df.info()

In [103]:
# Display the last rows of the frame.
isd_df.tail()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum
0,1936,9,1,11,157,135,-9999,0,0,-9999,-9999,-9999
1,1936,9,1,12,180,141,-9999,270,10,-9999,-9999,-9999
2,1936,9,1,13,174,124,-9999,0,0,-9999,-9999,-9999
3,1936,9,1,14,235,135,-9999,225,15,-9999,-9999,-9999
4,1936,9,1,15,252,130,-9999,180,21,-9999,-9999,-9999


In [104]:
# NOAA populates missing values with -9999; swap them for NaN in place so
# pandas treats them as missing.
isd_df.replace(to_replace=-9999, value=np.nan, inplace=True)

In [105]:
# Voila! Display the frame again to check that the -9999 placeholders now show as NaN.
isd_df.tail()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum
0,1936.0,9.0,1.0,11.0,157.0,135.0,,0.0,0.0,,,
1,1936.0,9.0,1.0,12.0,180.0,141.0,,270.0,10.0,,,
2,1936.0,9.0,1.0,13.0,174.0,124.0,,0.0,0.0,,,
3,1936.0,9.0,1.0,14.0,235.0,135.0,,225.0,15.0,,,
4,1936.0,9.0,1.0,15.0,252.0,130.0,,180.0,21.0,,,


In [106]:

# NOAA stores some columns multiplied by 10 to avoid decimal points in the
# fixed-width files; these are the affected columns and the factor to undo.
scaling_factor = 10
scaled_columns = [
    'air_temp_c',
    'dew_pt_temp_c',
    'sea_lvl_press_hectoPa',
    'wnd_spd_mtrpersec',
    'precip_hrly',
    'precip_6hr_accum',
]

In [107]:
# Removing the scaling factor: divide by the named scaling_factor instead of
# repeating the literal 10, so the factor is defined in exactly one place.
isd_df[scaled_columns] = isd_df[scaled_columns] / scaling_factor

In [108]:
# Display the first rows to check the rescaled values.
isd_df.head()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum
0,1936.0,9.0,1.0,11.0,15.7,13.5,,0.0,0.0,,,
1,1936.0,9.0,1.0,12.0,18.0,14.1,,270.0,1.0,,,
2,1936.0,9.0,1.0,13.0,17.4,12.4,,0.0,0.0,,,
3,1936.0,9.0,1.0,14.0,23.5,13.5,,225.0,1.5,,,
4,1936.0,9.0,1.0,15.0,25.2,13.0,,180.0,2.1,,,


In [121]:
# Assemble a 'DD/MM/YYYY/HH' string per row from the separate time columns.
# Casting to int first turns values such as 9.0 into '9' before zero-padding.
isd_df['date_time'] = (
    isd_df['day'].astype(int).astype(str).str.zfill(2) + '/'
    + isd_df['month'].astype(int).astype(str).str.zfill(2) + '/'
    + isd_df['year'].astype(int).astype(str) + '/'
    + isd_df['hour'].astype(int).astype(str).str.zfill(2)
)

In [122]:
# Display the last rows to check the newly built date_time strings.
isd_df.tail()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum,date_time
6243,2019.0,9.0,18.0,3.0,21.7,15.0,1020.1,130.0,3.6,,0.0,,18/09/2019/03
6244,2019.0,9.0,18.0,4.0,20.0,15.0,1020.0,140.0,2.1,,0.0,,18/09/2019/04
6245,2019.0,9.0,18.0,5.0,20.0,15.0,1020.3,80.0,1.5,,0.0,,18/09/2019/05
6246,2019.0,9.0,18.0,6.0,19.4,15.0,1020.5,80.0,2.1,,,,18/09/2019/06
6247,2019.0,9.0,18.0,7.0,18.9,15.0,1020.4,70.0,2.6,,,,18/09/2019/07


In [123]:
# Parse the date_time strings into pandas datetimes; the format string
# (day/month/year/hour) mirrors the order in which the strings were assembled.
isd_df['date_time'] = pd.to_datetime(isd_df['date_time'], format='%d/%m/%Y/%H')

In [124]:
# Display the last rows to check that date_time now holds parsed timestamps.
isd_df.tail()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum,date_time
6243,2019.0,9.0,18.0,3.0,21.7,15.0,1020.1,130.0,3.6,,0.0,,2019-09-18 03:00:00
6244,2019.0,9.0,18.0,4.0,20.0,15.0,1020.0,140.0,2.1,,0.0,,2019-09-18 04:00:00
6245,2019.0,9.0,18.0,5.0,20.0,15.0,1020.3,80.0,1.5,,0.0,,2019-09-18 05:00:00
6246,2019.0,9.0,18.0,6.0,19.4,15.0,1020.5,80.0,2.1,,,,2019-09-18 06:00:00
6247,2019.0,9.0,18.0,7.0,18.9,15.0,1020.4,70.0,2.6,,,,2019-09-18 07:00:00


In [125]:
# Replace the index with a fresh one, in place; drop=True discards the old
# index instead of keeping it as a column.
isd_df.reset_index(inplace=True, drop=True)

In [126]:
# Display the last rows to check the new index values.
isd_df.tail()

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum,date_time
555497,2019.0,9.0,18.0,3.0,21.7,15.0,1020.1,130.0,3.6,,0.0,,2019-09-18 03:00:00
555498,2019.0,9.0,18.0,4.0,20.0,15.0,1020.0,140.0,2.1,,0.0,,2019-09-18 04:00:00
555499,2019.0,9.0,18.0,5.0,20.0,15.0,1020.3,80.0,1.5,,0.0,,2019-09-18 05:00:00
555500,2019.0,9.0,18.0,6.0,19.4,15.0,1020.5,80.0,2.1,,,,2019-09-18 06:00:00
555501,2019.0,9.0,18.0,7.0,18.9,15.0,1020.4,70.0,2.6,,,,2019-09-18 07:00:00


In [128]:
# Summarise the frame: dtype and non-null count per column, plus memory usage.
isd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555502 entries, 0 to 555501
Data columns (total 13 columns):
year                     555502 non-null float64
month                    555502 non-null float64
day                      555502 non-null float64
hour                     555502 non-null float64
air_temp_c               555497 non-null float64
dew_pt_temp_c            554120 non-null float64
sea_lvl_press_hectoPa    494033 non-null float64
wnd_dir_360              553441 non-null float64
wnd_spd_mtrpersec        555449 non-null float64
sky_condition            453674 non-null float64
precip_hrly              459890 non-null float64
precip_6hr_accum         33787 non-null float64
date_time                555502 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(12)
memory usage: 55.1 MB


In [129]:
# Select the rows where the air temperature is missing - there are only a few.
isd_df[isd_df['air_temp_c'].isna()]

Unnamed: 0,year,month,day,hour,air_temp_c,dew_pt_temp_c,sea_lvl_press_hectoPa,wnd_dir_360,wnd_spd_mtrpersec,sky_condition,precip_hrly,precip_6hr_accum,date_time
278625,1988.0,1.0,4.0,8.0,,,,340.0,3.1,8.0,1.3,,1988-01-04 08:00:00
281732,1988.0,5.0,17.0,5.0,,,,999.0,99.9,,0.0,,1988-05-17 05:00:00
282002,1988.0,5.0,28.0,17.0,,11.7,1020.3,180.0,1.5,,0.0,0.0,1988-05-28 17:00:00
283239,1988.0,7.0,20.0,22.0,,,,310.0,10.8,8.0,9.9,,1988-07-20 22:00:00
283256,1988.0,7.0,21.0,17.0,,,,999.0,99.9,,0.0,,1988-07-21 17:00:00
