Read GHCN-Daily data from an Azure Blob Storage URL into a pandas DataFrame,\
convert the date column to a proper datetime type, and write it into the `ghcn_{year}` table
- Modify `start_year` and `end_year` to choose which years of data to fetch (one table is written per year)

In [None]:
import pandas as pd

# Column names for the header-less GHCN-Daily "by_year" CSV files, in file order.
new_columns = ['ID', 'Time', 'Element', 'Value', 'M-Flag', 'Q-Flag', 'S-Flag', 'OBS-Time']

# Read the text columns as strings instead of letting pandas guess their type.
# OBS-Time is a 4-character HHMM code (e.g. "0700"); numeric inference would
# turn it into 700.0 and lose the leading zero. Empty fields still become NaN.
text_columns = {name: str for name in ('ID', 'Element', 'M-Flag', 'Q-Flag', 'S-Flag', 'OBS-Time')}

# Inclusive range of years to ingest; one table is written per year.
start_year = 2022
end_year = 2022

for year in range(start_year, end_year + 1):

    # One CSV file per year in the GHCN blob container.
    URL = f'https://ghcn.blob.core.windows.net/ghcn/csv/daily/by_year/{year}.csv'

    df = pd.read_csv(URL, names=new_columns, dtype=text_columns)

    # Dates arrive as YYYYMMDD; convert them to real timestamps.
    df['Time'] = pd.to_datetime(df['Time'], format='%Y%m%d')

    # `spark` is the session object supplied by the notebook runtime.
    # overwriteSchema lets a re-run replace a table created earlier with a
    # different column type (e.g. numeric OBS-Time).
    # NOTE(review): this option is Delta-specific and presumably ignored by
    # other table formats - confirm against the workspace's default format.
    (
        spark.createDataFrame(df)
        .write.mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"ghcn.ghcn_{year}")
    )


In SQL queries, join the GHCN table with `station_metadata` and filter for the countries you are interested in
- Daily data for the year 2022 from weather stations in Central and South America

In [None]:
%sql
-- Every 2022 observation from stations located in the listed
-- Central and South American countries, oldest first.
SELECT
    g.Time,
    g.ID,
    s.Country,
    s.StationName,
    g.Element,
    g.Value
FROM ghcn.ghcn_2022 AS g
INNER JOIN ghcn.station_metadata AS s
    ON g.ID = s.ID
WHERE s.Country IN (
    -- Central America
    "Belize", "Costa Rica", "El Salvador", "Guatemala", "Honduras", "Mexico", "Nicaragua", "Panama",
    -- South America
    "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador", "French Guiana", "Guyana",
    "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela"
)
ORDER BY
    g.Time,
    g.ID

- Aggregated monthly precipitation data from weather stations in Central and South America

In [None]:
%sql
-- Mean PRCP reading per calendar month of 2022 across stations in the listed
-- Central and South American countries.
-- The stored value is divided by 10: per the GHCN-Daily documentation PRCP is
-- recorded in tenths of a millimetre, so the result is in mm.
-- GROUP BY already yields exactly one row per month, so DISTINCT is not needed.
SELECT month(g.Time) AS Month, round(mean(g.Value) / 10, 2) AS Precipitation
FROM ghcn.ghcn_2022 g
JOIN ghcn.station_metadata s
ON g.ID = s.ID
WHERE s.Country IN ("Belize", "Costa Rica", "El Salvador", "Guatemala", "Honduras", "Mexico", "Nicaragua", "Panama", "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador", "French Guiana", "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela")
  AND g.Element = 'PRCP'
GROUP BY Month
ORDER BY Month

- Aggregated daily average temperature data from weather stations in the US

In [None]:
%sql
-- Mean TAVG reading per day of 2022 across stations whose Country_code is 'US'.
-- The stored value is divided by 10: per the GHCN-Daily documentation TAVG is
-- recorded in tenths of a degree Celsius, so the result is in degrees C.
-- GROUP BY already yields exactly one row per date, so DISTINCT is not needed.
SELECT date(g.Time) AS Date, round(mean(g.Value) / 10, 2) AS AvgTemperature
FROM ghcn.ghcn_2022 g
JOIN ghcn.station_metadata s
ON g.ID = s.ID
WHERE s.Country_code = 'US' AND g.Element = 'TAVG'
GROUP BY Date
ORDER BY Date