#### 1. Add `start_year` and `end_year` parameters so the notebook can be run from job workflows

In [0]:
# Declare one text widget per year bound; both start out empty, so the values
# must be filled in (by hand or by the calling job) before the next cell runs.
for _widget_name in ("start_year", "end_year"):
    dbutils.widgets.text(_widget_name, "")

#### 2. Read GHCN-daily data from an Azure Blob Storage URL into a pandas DataFrame, convert the `Time` column to a datetime, and write the result to the `ghcn.ghcn_{year}` table


In [0]:
import pandas as pd

# Names assigned, in order, to the columns of each yearly CSV file.
new_columns = ['ID', 'Time', 'Element', 'Value', 'M-Flag', 'Q-Flag', 'S-Flag', 'OBS-Time']


def _read_year_widget(name):
    """Return the integer year entered in the notebook widget `name`.

    Args:
        name: Name of a text widget declared with `dbutils.widgets.text`.

    Returns:
        The widget value converted to `int`.

    Raises:
        ValueError: If the widget is empty or does not contain an integer.
            The message names the offending widget, which a bare `int("")`
            failure does not.
    """
    raw_value = dbutils.widgets.get(name)
    try:
        return int(raw_value)
    except ValueError:
        raise ValueError(
            f"Widget '{name}' must be set to an integer year, got {raw_value!r}"
        ) from None


# Parse both bounds once, up front, so bad input fails before any download starts.
start_year = _read_year_widget("start_year")
end_year = _read_year_widget("end_year")

# A reversed range would make the loop below a silent no-op and leave the
# per-year tables missing or stale for the SQL cells that follow.
if start_year > end_year:
    raise ValueError(
        f"start_year ({start_year}) must not be greater than end_year ({end_year})"
    )

# Load every year in the inclusive range [start_year, end_year].
for year in range(start_year, end_year + 1):

    URL = f'https://ghcn.blob.core.windows.net/ghcn/csv/daily/by_year/{year}.csv'

    # The whole yearly file is read into driver memory as a pandas DataFrame.
    df = pd.read_csv(URL, names=new_columns)

    # 'Time' is parsed with the YYYYMMDD pattern into a proper datetime column.
    df['Time'] = pd.to_datetime(df['Time'], format='%Y%m%d')

    # "overwrite" replaces any existing table for this year, so re-running is safe.
    spark.createDataFrame(df).write.mode("overwrite").saveAsTable(f"ghcn.ghcn_{year}")


#### 3. In SQL, join `ghcn.station_metadata` and `ghcn.ghcn_{year}` tables to query the results you are interested in
- Aggregated monthly precipitation data from weather stations in Central and South America

In [0]:
%sql
-- Mean PRCP value per calendar month for stations whose Region is
-- 'Central America' or 'South America', read from the table selected by the
-- end_year widget.
-- GROUP BY already yields one row per month, so no DISTINCT is needed.
-- NOTE(review): the division by 10 presumably converts stored tenths of a
-- unit into whole units -- confirm against the GHCN-daily format description.
SELECT month(g.Time) AS Month, round(mean(g.Value) / 10, 2) AS Precipitation
FROM ghcn.ghcn_${end_year} g
JOIN ghcn.station_metadata s
  ON g.ID = s.ID
WHERE s.Region IN ('Central America', 'South America')
  AND g.Element = 'PRCP'
GROUP BY Month
ORDER BY Month

Month,Precipitation
1,3.64
2,3.4
3,4.22
4,4.33
5,4.92
6,2.63
7,3.44
8,3.34
9,3.13
10,4.33


Databricks visualization. Run in Databricks to view.

- Aggregated daily average temperature data from weather stations in North America

In [0]:
%sql
-- Mean TAVG value per calendar day for stations whose Region is
-- 'North America', read from the table selected by the end_year widget.
-- GROUP BY already yields one row per date, so no DISTINCT is needed.
-- NOTE(review): the division by 10 presumably converts stored tenths of a
-- degree into degrees -- confirm against the GHCN-daily format description.
SELECT date(g.Time) AS Date, round(mean(g.Value) / 10, 2) AS AvgTemperature
FROM ghcn.ghcn_${end_year} g
JOIN ghcn.station_metadata s
  ON g.ID = s.ID
WHERE s.Region = 'North America'
  AND g.Element = 'TAVG'
GROUP BY Date
ORDER BY Date

Date,AvgTemperature
2023-01-01,-0.87
2023-01-02,-2.33
2023-01-03,-2.56
2023-01-04,-2.2
2023-01-05,-1.98
2023-01-06,-1.78
2023-01-07,-1.93
2023-01-08,-1.71
2023-01-09,-1.06
2023-01-10,-1.2


Databricks visualization. Run in Databricks to view.

- Average temperature by US state for July of the selected `end_year` (2023 in the run shown)

In [0]:
%sql
-- Mean TAVG value per State for stations with FIPS = 'US', restricted to
-- month 7 of the table selected by the end_year widget.
-- month() returns an integer, so it is compared with the integer literal 7
-- rather than relying on an implicit cast of the string '7'.
-- NOTE(review): the division by 10 presumably converts stored tenths of a
-- degree into degrees -- confirm against the GHCN-daily format description.
SELECT s.State, round(mean(g.Value) / 10, 2) AS AvgTemperature
FROM ghcn.ghcn_${end_year} g
JOIN ghcn.station_metadata s
  ON g.ID = s.ID
WHERE s.FIPS = 'US'
  AND g.Element = 'TAVG'
  AND month(g.Time) = 7
GROUP BY s.State
ORDER BY s.State

State,AvgTemperature
AK,14.75
AL,26.86
AR,27.05
AZ,25.72
CA,23.69
CO,16.63
CT,24.65
FL,28.51
GA,26.81
HI,22.65


Databricks visualization. Run in Databricks to view.

- Average temperature by country for January of the selected `end_year` (2023 in the run shown)

In [0]:
%sql
-- Mean TAVG value per `ISO-2` code across all stations, restricted to
-- month 1 of the table selected by the end_year widget.
-- month() returns an integer, so it is compared with the integer literal 1
-- rather than relying on an implicit cast of the string '1'.
-- There is no ORDER BY, so the row order of the result is not defined.
-- NOTE(review): the division by 10 presumably converts stored tenths of a
-- degree into degrees -- confirm against the GHCN-daily format description.
SELECT s.`ISO-2`, round(mean(g.Value) / 10, 2) AS AvgTemperature
FROM ghcn.ghcn_${end_year} g
JOIN ghcn.station_metadata s
  ON g.ID = s.ID
WHERE g.Element = 'TAVG'
  AND month(g.Time) = 1
GROUP BY s.`ISO-2`

ISO-2,AvgTemperature
MM,24.29
DZ,9.73
LT,0.49
CI,26.8
PM,0.83
SC,26.75
AZ,2.52
UA,0.66
RO,3.6
KI,28.15


Databricks visualization. Run in Databricks to view.