In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob as glob
import datetime as dt

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.io import shapereader
import cartopy.io.img_tiles as cimgt

In [2]:
df = pd.read_csv("GHCND_Illinois_Stations.csv", index_col=0)

illinois = df

station_names = illinois["STATION_ID"].tolist()

# station_names

In [3]:
len(station_names)

80

In [58]:
lons = illinois["LONGITUDE"].astype(float).values
lats = illinois["LATITUDE"].astype(float).values
names = illinois["STATION_NAME"].values

fig = plt.figure(figsize=(15, 10))
ax = plt.axes(projection=ccrs.PlateCarree())

ax.add_feature(cfeature.STATES.with_scale('10m'), edgecolor='black', facecolor='none')
ax.add_feature(cfeature.RIVERS.with_scale('10m'))
ax.add_feature(cfeature.LAKES.with_scale('10m'), alpha=0.5)

ax.scatter(lons, lats, s=20, marker='o', edgecolor='k', transform=ccrs.PlateCarree())

for lon, lat, name in zip(lons, lats, names):
    ax.text(lon + 0.05, lat + 0.05, name, fontsize=10,
            transform=ccrs.PlateCarree())

ax.set_extent([-92.8, -86.0, 36, 43.5], crs=ccrs.PlateCarree())

ax.set_title("Illinois Weather Stations that has data from 1940-2024")
plt.show()

KeyboardInterrupt: 

In [4]:
illinois["STATION_ID_MOD"] = (
    illinois["STATION_ID"]
      .str.replace("GHCND", "GHCNh", regex=False)
      .str.replace(":", "_",    regex=False)
)

In [5]:
station_ids = illinois["STATION_ID_MOD"].to_list()

In [61]:
station_ids

['GHCNh_ASN00008119',
 'GHCNh_USC00428465',
 'GHCNh_ASN00014910',
 'GHCNh_BR001065002',
 'GHCNh_CA005012545',
 'GHCNh_US1CAMA0005',
 'GHCNh_CA008401251',
 'GHCNh_US1TNRD0108',
 'GHCNh_USC00099506',
 'GHCNh_USC00503205',
 'GHCNh_US1FLLE0041',
 'GHCNh_US1OHMY0021',
 'GHCNh_USC00043928',
 'GHCNh_US1MIVB0004',
 'GHCNh_US1NCGN0007',
 'GHCNh_ASN00075026',
 'GHCNh_USC00044259',
 'GHCNh_IN004010501',
 'GHCNh_US1WAWC0036',
 'GHCNh_US1NMOT0040',
 'GHCNh_ASN00020042',
 'GHCNh_SWE00138090',
 'GHCNh_ASN00091155',
 'GHCNh_RSM00029612',
 'GHCNh_US1TNRD0020',
 'GHCNh_US1NMSN0054',
 'GHCNh_BR002754010',
 'GHCNh_US1COGF0054',
 'GHCNh_US1WYSW0014',
 'GHCNh_USR0000CROC',
 'GHCNh_USC00361581',
 'GHCNh_MXN00024043',
 'GHCNh_US1KSCQ0002',
 'GHCNh_USC00337925',
 'GHCNh_US1NCWK0155',
 'GHCNh_BF1SE000001',
 'GHCNh_FIE00144387',
 'GHCNh_MXN00012153',
 'GHCNh_US1TXTV0273',
 'GHCNh_US1MNDK0036',
 'GHCNh_GME00122746',
 'GHCNh_USC00270215',
 'GHCNh_SWE00139258',
 'GHCNh_BR000661000',
 'GHCNh_CA001030600',
 'GHCNh_SF

In [6]:
import requests
from io import StringIO
import sys

In [21]:
station_ids[0]

'GHCNh_USC00113879'

In [7]:
base_url = "https://www.ncei.noaa.gov/oa/global-historical-climatology-network/index.html#hourly/access/by-station"

data_by_station = {}
sorted_station_ids = []

for i, sid in enumerate(station_ids):
    url = f"{base_url}/{sid}_por.psv"
    try:
        r = requests.get(url)
        r.raise_for_status() 
    except requests.HTTPError as e:
        print(f"Failed to fetch {sid}: {e}")
        continue

    
    #df = pd.read_csv(StringIO(r.text), sep="|")
    #data_by_station[sid] = df
    sorted_station_ids.append(sid)

In [80]:
len(sorted_station_ids)

32

In [13]:
sorted_station_ids[15]

'GHCNh_USC00114317'

In [8]:
base_url = "https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-station"

data_by_station = []

for i, sid in enumerate(sorted_station_ids):
    url = f"{base_url}/{sid}_por.psv"
    try:
        r = requests.get(url)
        r.raise_for_status() 
    except requests.HTTPError as e:
        print(f"Failed to fetch {sid}: {e}")
        continue

    df = pd.read_csv(StringIO(r.text), sep="|", low_memory=False)
    data_by_station.append(df)
    print(f"success: {sid}")

Failed to fetch GHCNh_USC00116642: 404 Client Error: Not Found for url: https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-station/GHCNh_USC00116642_por.psv
success: GHCNh_USC00110583
success: GHCNh_USC00110082
Failed to fetch GHCNh_USC00118870: 404 Client Error: Not Found for url: https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-station/GHCNh_USC00118870_por.psv
success: GHCNh_USC00117014
success: GHCNh_USC00113879
success: GHCNh_USC00115079
success: GHCNh_USW00014834
success: GHCNh_USC00115983
Failed to fetch GHCNh_USC00117354: 404 Client Error: Not Found for url: https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-station/GHCNh_USC00117354_por.psv
Failed to fetch GHCNh_USC00116973: 404 Client Error: Not Found for url: https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-station/GHCNh_USC00116973_por.psv
Failed to fetch GHCNh_USC00117157: 404 Client Error

In [15]:
len(data_by_station)

32

In [10]:
data_by_station[0]['dew_point_temperature']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
576330   NaN
576331   NaN
576332   NaN
576333   NaN
576334   NaN
Name: dew_point_temperature, Length: 576335, dtype: float64

In [39]:
with open(f"GHCNh_USC00113879_por.psv", "w", encoding="utf-8") as f:
    f.write(r.text)

In [39]:
data_by_station[i]

Unnamed: 0,Station_ID,Station_name,Year,Month,Day,Hour,Minute,Latitude,Longitude,Elevation,...,precipitation_24_hour_Quality_Code,precipitation_24_hour_Report_Type,precipitation_24_hour_Source_Code,precipitation_24_hour_Source_Station_ID,remarks,remarks_Measurement_Code,remarks_Quality_Code,remarks_Report_Type,remarks_Source_Code,remarks_Source_Station_ID
0,USC00112140,DANVILLE,1951,5,2,19,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
1,USC00112140,DANVILLE,1951,5,2,20,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
2,USC00112140,DANVILLE,1951,5,2,21,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
3,USC00112140,DANVILLE,1951,5,2,22,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
4,USC00112140,DANVILLE,1951,5,2,23,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592558,USC00112140,DANVILLE,2022,1,1,2,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
592559,USC00112140,DANVILLE,2022,1,1,3,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
592560,USC00112140,DANVILLE,2022,1,1,4,0,40.1391,-87.6479,169.2,...,,,,,,,,,,
592561,USC00112140,DANVILLE,2022,1,1,5,0,40.1391,-87.6479,169.2,...,,,,,,,,,,


In [18]:
len(sorted_station_ids)

80

In [11]:
tmp_station = []

def has_over_500_non_nans(lst):
    # count how many entries are NOT np.nan
    n_non_nan = sum(1 for x in lst if not np.isnan(x))
    return n_non_nan > 500

for i, sid in enumerate(sorted_station_ids):
    try:
        if has_over_500_non_nans(data_by_station[i]["dew_point_temperature"]):
            tmp_station.append(data_by_station[i])
            print(sid)
        else:
            print(f"{sid} doesn't have temperature")
    except:
        print(f"{sid} does not exist")

GHCNh_USC00116642 doesn't have temperature
GHCNh_USC00110583 doesn't have temperature
GHCNh_USC00110082 doesn't have temperature
GHCNh_USC00118870 doesn't have temperature
GHCNh_USC00117014 doesn't have temperature
GHCNh_USC00113879
GHCNh_USC00115079 doesn't have temperature
GHCNh_USW00014834 doesn't have temperature
GHCNh_USC00115983 doesn't have temperature
GHCNh_USC00117354 doesn't have temperature
GHCNh_USC00116973 doesn't have temperature
GHCNh_USC00117157
GHCNh_USC00111700 doesn't have temperature
GHCNh_USC00112687 doesn't have temperature
GHCNh_USC00116765 doesn't have temperature
GHCNh_USC00114317 doesn't have temperature
GHCNh_USC00117391 doesn't have temperature
GHCNh_USC00111302 doesn't have temperature
GHCNh_USC00113940 doesn't have temperature
GHCNh_USC00111475 doesn't have temperature
GHCNh_USC00118630 doesn't have temperature
GHCNh_USC00116753 doesn't have temperature
GHCNh_USC00129112 doesn't have temperature
GHCNh_USC00113312 doesn't have temperature
GHCNh_USC00112931 