In [11]:
import pandas as pd
import requests
import os


class RadiosondeStationsHarvester:
    """Harvests IGRA stations by region and time interval, saving data availability to CSV.

    ``run(output_path)`` executes the whole pipeline: load the IGRA station
    list, keep stations inside the continent's bounding box, keep stations
    whose period of record overlaps ``[start_year, end_year]``, and write one
    row per remaining station to ``output_path``.
    """

    def __init__(self, continent, start_year, end_year):
        """Initialize with continent and time interval.

        Args:
            continent: Key into ``region_bounds`` ('Europe' or 'Africa').
            start_year: First year of the interval of interest (inclusive).
            end_year: Last year of the interval of interest (inclusive).
        """
        self.continent = continent
        self.start_year = start_year
        self.end_year = end_year
        self.station_list_url = "https://www.ncei.noaa.gov/data/integrated-global-radiosonde-archive/doc/igra2-station-list.txt"
        # Rectangular latitude/longitude boxes (degrees) used as a coarse
        # approximation of each supported continent.
        self.region_bounds = {
            'Europe': {'lat_min': 35, 'lat_max': 70, 'lon_min': -10, 'lon_max': 40},
            'Africa': {'lat_min': -35, 'lat_max': 37, 'lon_min': -17, 'lon_max': 51}
        }

    def load_igra_stations(self):
        """Load the fixed-width IGRA station list into ``self.stations_df``.

        Returns:
            The loaded DataFrame (the same object stored on the instance).
        """
        # colspecs are half-open [from, to) character offsets into each line.
        df = pd.read_fwf(self.station_list_url, header=None, names=[
            'station_id', 'latitude', 'longitude', 'elevation', 'state', 'name',
            'first_year', 'last_year', 'nobs'
        ], colspecs=[
            (0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71),
            (72, 76), (77, 81), (82, 88)
        ])
        self.stations_df = df
        return df

    def filter_by_region(self):
        """Filter stations by continent's latitude/longitude bounds.

        Raises:
            ValueError: If ``self.continent`` has no entry in ``region_bounds``.
        """
        bounds = self.region_bounds.get(self.continent)
        if bounds is None:
            raise ValueError(f"Continent {self.continent} not supported.")
        self.stations_df = self.stations_df[
            (self.stations_df['latitude'] >= bounds['lat_min']) &
            (self.stations_df['latitude'] <= bounds['lat_max']) &
            (self.stations_df['longitude'] >= bounds['lon_min']) &
            (self.stations_df['longitude'] <= bounds['lon_max'])
        ]

    def filter_by_time(self):
        """Filter stations with data in the specified year range.

        A station is kept when its ``[first_year, last_year]`` interval
        overlaps ``[start_year, end_year]``.
        """
        self.stations_df = self.stations_df[
            (self.stations_df['first_year'] <= self.end_year) &
            (self.stations_df['last_year'] >= self.start_year)
        ]

    def get_data_range(self):
        """Create output DataFrame with station data availability.

        Stores the result in ``self.output_df`` with ``first_year`` and
        ``last_year`` renamed to ``has_data_from`` and ``has_data_up_to``.
        """
        self.output_df = self.stations_df[['station_id', 'latitude', 'longitude', 'name', 'first_year', 'last_year']].rename(
            columns={'first_year': 'has_data_from',
                     'last_year': 'has_data_up_to'}
        )

    def save_to_csv(self, output_path):
        """Save output DataFrame to CSV, creating the parent directory if needed."""
        parent_dir = os.path.dirname(output_path)
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when one is named.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self.output_df.to_csv(output_path, index=False)

    def run(self, output_path):
        """Execute the harvesting process and save results."""
        self.load_igra_stations()
        self.filter_by_region()
        self.filter_by_time()
        self.get_data_range()
        self.save_to_csv(output_path)

In [12]:
if __name__ == "__main__":
    # NOTE(review): the output filename says 2000_2020 while the requested
    # interval is 2020-2025 -- confirm which of the two is intended.
    harvester = RadiosondeStationsHarvester(
        continent='Europe', start_year=2020, end_year=2025)
    harvester.run(output_path='../data/igra_stations_europe_2000_2020.csv')

In [14]:
%pip install siphon -q

Note: you may need to restart the kernel to use updated packages.


In [16]:
from datetime import datetime
from siphon.simplewebservice.igra2 import IGRAUpperAir

# Station ID for Tallinn-Harku (ENM00026038)
station_id = 'ENM00026038'

# Two sounding times to request: 1947-05-04 03:00 and 2022-06-10 06:00
# (hours presumably UTC -- confirm against the IGRA/siphon documentation).
dates = [
    datetime(1947, 5, 4, 3),
    datetime(2022, 6, 10, 6),
]

# (printed label, DataFrame column) pairs, in the order they are reported.
fields = [
    ("Pressure (hPa)", 'pressure'),
    ("Temperature (°C)", 'temperature'),
    ("Relative Humidity (%)", 'relative_humidity'),
    ("Wind Direction (degrees)", 'direction'),
    ("Wind Speed (m/s)", 'speed'),
    ("Height (m)", 'height'),
]

# Request each sounding and print its non-missing values column by column.
# Any failure (including "no dates match") is reported and the loop continues.
for date in dates:
    try:
        df, header = IGRAUpperAir.request_data(date, station_id)
        if df.empty:
            print(f"\nNo data found for {station_id} on {date}")
            continue
        print(f"\nData for {station_id} on {date}:")
        for label, column in fields:
            print(f"{label}: {df[column].dropna().tolist()}")
    except Exception as e:
        print(f"\nError fetching data for {station_id} on {date}: {e}")


Data for ENM00026038 on 1947-05-04 03:00:00:
Pressure (hPa): [1019.0, 700.0, 500.0]
Temperature (°C): [10.5, -4.0, -17.0]
Relative Humidity (%): [51.0]
Wind Direction (degrees): []
Wind Speed (m/s): []
Height (m): [44, 3020, 5580]

Error fetching data for ENM00026038 on 2022-06-10 06:00:00: No dates match selection. This selection has data from 1947-05-04 03:00:00 to 1947-05-04 03:00:00.


In [None]:
import pandas as pd
import os
from datetime import datetime
from siphon.simplewebservice.igra2 import IGRAUpperAir
import re


class RadiosondeStationsHarvester:
    """Filters IGRA stations by continent and records data availability ranges.

    ``run()`` loads the IGRA station list, keeps the stations inside the
    continent's bounding box, asks siphon for each station's data on
    ``static_date`` and writes one CSV row per station (including the reported
    availability range) to ``output_path``.
    """

    # Extracts the availability window from siphon's error text, e.g.
    # "... This selection has data from <start> to <end>."
    _RANGE_PATTERN = re.compile(
        r"data from (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) to (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")

    def __init__(self, continent, output_path, static_date=datetime(2026, 1, 1, 12)):
        """Store the configuration; no I/O happens here.

        Args:
            continent: Key into ``region_bounds`` ('Europe' or 'Africa').
            output_path: File the station records are written to.
            static_date: Date requested from every station in
                ``fetch_data_range``; the availability range is usually read
                from the error raised when this date has no data.
        """
        self.continent = continent
        self.output_path = output_path
        self.static_date = static_date
        self.station_list_url = "https://www.ncei.noaa.gov/data/integrated-global-radiosonde-archive/doc/igra2-station-list.txt"
        # Rectangular latitude/longitude boxes (degrees) per supported continent.
        self.region_bounds = {
            'Europe': {'lat_min': 35, 'lat_max': 70, 'lon_min': -10, 'lon_max': 40},
            'Africa': {'lat_min': -35, 'lat_max': 37, 'lon_min': -17, 'lon_max': 51}
        }

    def load_station_list(self):
        """Loads IGRA station list into a DataFrame (``self.stations_df``)."""
        # colspecs are half-open [from, to) character offsets; the two-letter
        # state field at (38, 40) is intentionally not read.
        colspecs = [(0, 11), (12, 20), (21, 30), (31, 37),
                    (41, 71), (72, 76), (77, 81), (82, 88)]
        names = ['station_id', 'latitude', 'longitude',
                 'elevation', 'name', 'first_year', 'last_year', 'nobs']
        # The station list has no header line, so no rows are skipped;
        # skipping one would silently drop the first station.
        self.stations_df = pd.read_fwf(
            self.station_list_url, colspecs=colspecs, names=names, header=None)

    def filter_by_region(self):
        """Filters stations by continent's geographical bounds.

        Raises:
            ValueError: If ``self.continent`` has no entry in ``region_bounds``.
        """
        bounds = self.region_bounds.get(self.continent)
        if bounds is None:
            raise ValueError(f"Continent {self.continent} not supported.")
        self.stations_df = self.stations_df[
            (self.stations_df['latitude'] >= bounds['lat_min']) &
            (self.stations_df['latitude'] <= bounds['lat_max']) &
            (self.stations_df['longitude'] >= bounds['lon_min']) &
            (self.stations_df['longitude'] <= bounds['lon_max'])
        ]

    def fetch_data_range(self, station_id):
        """Attempts to fetch data for a station on the static date and returns the data range.

        Returns:
            ``"<start> to <end>"`` when the range could be determined (both
            ends equal ``static_date`` if data exists for that exact date),
            ``"No data found"`` for an empty result, or
            ``"Error: Unknown date range"`` when the request failed without a
            parsable range in the error message.
        """
        try:
            df, _header = IGRAUpperAir.request_data(
                self.static_date, station_id)
            if not df.empty:
                stamp = self.static_date.strftime('%Y-%m-%d %H:%M:%S')
                return f"{stamp} to {stamp}"
            return "No data found"
        except Exception as e:
            # Deliberately broad: one failing station must not abort the whole
            # harvest, and the error text is where the range is reported.
            match = self._RANGE_PATTERN.search(str(e))
            if match:
                return f"{match.group(1)} to {match.group(2)}"
            return "Error: Unknown date range"

    def save_to_txt(self):
        """Saves station details with data range to a text file (CSV format).

        Rows are written through ``csv.writer`` so that station names
        containing commas are quoted instead of corrupting the column layout.
        """
        import csv

        parent_dir = os.path.dirname(self.output_path)
        # A bare filename has an empty dirname; os.makedirs('') would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        columns = ['station_id', 'name', 'latitude', 'longitude', 'elevation']
        with open(self.output_path, 'w', newline='') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow(columns + ['date_range'])
            for _, row in self.stations_df.iterrows():
                date_range = self.fetch_data_range(row['station_id'])
                writer.writerow([str(row[col]) for col in columns] + [date_range])
                # Each row costs a network request; flush so finished rows
                # survive an interrupted run.
                f.flush()

    def run(self):
        """Executes the harvesting process and saves results."""
        self.load_station_list()
        # Apply the continent filter the class is configured with; without it
        # every station in the archive would be queried and written.
        self.filter_by_region()
        self.save_to_txt()


if __name__ == "__main__":
    harvester = RadiosondeStationsHarvester(
        continent='Europe',
        output_path='./data/european_stations_2026.txt',
    )
    harvester.run()
    # Report the path the harvester was configured with rather than repeating it.
    print(f"Station data saved to '{harvester.output_path}'")

Station data saved to './data/european_stations_2026.txt'


In [1]:
import pandas as pd
import os


def filter_recent_stations(input_path, output_path, start_year=2010, end_year=2025):
    """Filters stations with data ranges ending between start_year and end_year.

    Args:
        input_path: CSV file with a ``date_range`` column holding
            ``"<start> to <end>"`` strings.
        output_path: Destination CSV; its parent directory is created if the
            path names one.
        start_year: Earliest accepted end year (inclusive).
        end_year: Latest accepted end year (inclusive).

    Rows whose ``date_range`` has no ``" to "`` separator or an unparsable end
    date are dropped. The output keeps exactly the input's columns.
    """
    stations = pd.read_csv(input_path)
    # Values without a second " to " piece become NaN, which to_datetime turns
    # into NaT; errors='coerce' does the same for malformed end dates.
    end_stamps = pd.to_datetime(
        stations['date_range'].str.split(' to ').str[1], errors='coerce')
    # between() is inclusive on both ends and False for missing years.
    in_window = end_stamps.dt.year.between(start_year, end_year)
    recent_stations = stations[in_window]

    parent_dir = os.path.dirname(output_path)
    # A bare filename has an empty dirname; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    recent_stations.to_csv(output_path, index=False)


if __name__ == "__main__":
    # Read the harvested station file and keep the stations whose record ends
    # within the function's default year window.
    input_path = './data/european_stations_2026.txt'
    output_path = './data/european_stations_2010_2025.csv'
    filter_recent_stations(input_path=input_path, output_path=output_path)
    print("Filtered stations saved to '{}'".format(output_path))

Filtered stations saved to './data/european_stations_2010_2025.csv'
