In [0]:
%run ./utilityFunctions

In [0]:
# Initialize Variables
import pandas as pd
from pyspark.sql.functions import create_map, col, explode, lit
from pyspark.sql.types import StructField, StructType, NullType, DoubleType, StringType
# Reference table of capital cities; a later cell reads the continent, country,
# capital name, latitude and longitude of every row from it.
capitalCityTable = spark.table("gold.capitalcitylist")
# Materialised as a pandas DataFrame so the rows can be iterated with iterrows().
capitalCityTablePd = capitalCityTable.toPandas()
# The three values below are passed unchanged to getWeatherData for every city.
units = "metric"
# NOTE(review): the API key is hard-coded in the notebook (redacted here);
# prefer loading it from a secret store or environment variable.
appIdStr = "<Redacted>"
# presumably the response sections the API should omit — TODO confirm against getWeatherData
excludeStr = "minutely,alerts"

def getCurrentWeather(df):
    """Project the query/city metadata columns together with every field of ``current``.

    Args:
        df: DataFrame holding the metadata columns listed below and a
            ``current`` column whose fields can be expanded with ``current.*``.

    Returns:
        The result of selecting the metadata columns followed by ``current.*``.
    """
    metadataColumns = (
        "time_data_queried_utc",
        "capital",
        "country",
        "continent",
        "timezone",
        "timezone_offset",
    )
    return df.select(*metadataColumns, "current.*")
  
    
def getDailyWeather(df):
    """Explode the ``daily`` column and flatten each exploded element.

    Args:
        df: DataFrame holding the metadata columns listed below and a
            ``daily`` column that ``explode`` can expand into rows.

    Returns:
        A DataFrame with the metadata columns followed by every field of the
        exploded ``daily`` element (``daily.*``).
    """
    keyColumns = (
        "time_data_queried_utc",
        "capital",
        "country",
        "continent",
        "timezone",
        "timezone_offset",
    )
    # Step 1: one output row per element of ``daily``, kept under the same name.
    perElement = df.select(*keyColumns, explode("daily").alias("daily"))
    # Step 2: promote the element's fields to top-level columns.
    return perElement.select(*keyColumns, "daily.*")


def getHourlyWeather(df):
    """Explode the ``hourly`` column and flatten each exploded element.

    Args:
        df: DataFrame holding the metadata columns listed below and an
            ``hourly`` column that ``explode`` can expand into rows.

    Returns:
        A DataFrame with the metadata columns followed by every field of the
        exploded ``hourly`` element (``hourly.*``).
    """
    sharedColumns = [
        "time_data_queried_utc",
        "capital",
        "country",
        "continent",
        "timezone",
        "timezone_offset",
    ]
    # Expand ``hourly`` into one row per element, keeping the column name.
    explodedHourly = explode("hourly").alias("hourly")
    rowsPerHour = df.select(*sharedColumns, explodedHourly)
    # Lift the element's fields up next to the shared columns.
    return rowsPerHour.select(*sharedColumns, "hourly.*")


def addColumnNotExists(df, columnName):
    """Ensure ``df`` has a column called ``columnName``.

    Args:
        df: DataFrame to check.
        columnName: Name of the column that must be present.

    Returns:
        ``df`` itself when the column already exists; otherwise a new
        DataFrame with ``columnName`` added as a ``lit(None)`` column.
    """
    if columnName in df.columns:
        return df
    return df.withColumn(columnName, lit(None))


def addPrecipColumnDailyNotExists(df, columnName):
    """Ensure ``df`` has ``columnName``, defaulting a missing column to 0.0.

    Args:
        df: DataFrame to check.
        columnName: Name of the column that must be present.

    Returns:
        ``df`` itself when the column already exists; otherwise a new
        DataFrame with ``columnName`` added as a ``lit(0.00)`` column.
    """
    if columnName in df.columns:
        return df
    defaultAmount = 0.00
    return df.withColumn(columnName, lit(defaultAmount))


In [0]:
# Get the weather data using the OpenWeatherAPI.
# One DataFrame is requested per capital city through getWeatherData and then
# tagged with the city's continent/country/capital and the shared query time.
time_data_queried_utc = getCurrentTimeUTC()
databaseName = "bronze"
dataframeList = []
for index, row in capitalCityTablePd.iterrows():
    # Literal columns to attach, in the order they are appended to the result.
    cityTags = {
        "continent": row['ContinentName'],
        "country": row['CountryName'],
        "capital": row['CapitalName'],
    }
    lat, lon = row['CapitalLatitude'], row['CapitalLongitude']
    df = getWeatherData(lat, lon, units, appIdStr, excludeStr)
    # The same timestamp is stamped on every city so all rows share one query time.
    cityTags["time_data_queried_utc"] = time_data_queried_utc
    for tagName, tagValue in cityTags.items():
        df = df.withColumn(tagName, lit(tagValue))
    dataframeList.append(df)


In [0]:
# Transform the data before saving it.
#
# Each per-city DataFrame in dataframeList is reshaped into three views
# (current, daily, hourly) and given a common, alphabetically sorted column
# layout. The per-city views are then stacked into one DataFrame per view and
# written with writeDfToDeltaTable.
CurrentTableName = "currentdata" + time_data_queried_utc
DailyTableName = "dailydata" + time_data_queried_utc
HourlyTableName = "hourlydata" + time_data_queried_utc


def _sortColumns(df):
    """Return ``df`` with its columns selected in alphabetical order."""
    return df.select(sorted(df.columns))


def _prepareCurrent(df):
    """Current view: drop ``snow``/``rain``, guarantee ``wind_gust``, sort columns."""
    currentDf = getCurrentWeather(df).drop("snow", "rain")
    currentDf = addColumnNotExists(currentDf, "wind_gust")
    return _sortColumns(currentDf)


def _prepareDaily(df):
    """Daily view: guarantee ``rain`` and ``snow`` (default 0.0), sort columns."""
    dailyDf = getDailyWeather(df)
    dailyDf = addPrecipColumnDailyNotExists(dailyDf, "rain")
    dailyDf = addPrecipColumnDailyNotExists(dailyDf, "snow")
    return _sortColumns(dailyDf)


def _prepareHourly(df):
    """Hourly view: drop ``snow``/``rain``, sort columns."""
    return _sortColumns(getHourlyWeather(df).drop("snow", "rain"))


def _stack(frames):
    """Union a non-empty list of DataFrames into one, matching columns by name.

    unionByName is used instead of the positional union: if two cities come
    back with different column sets of the same size, a positional union would
    silently put values under the wrong column, whereas unionByName raises.
    """
    stacked = frames[0]
    for frame in frames[1:]:
        stacked = stacked.unionByName(frame)
    return stacked


# Fail with a clear message instead of an IndexError when nothing was fetched.
if not dataframeList:
    raise ValueError("dataframeList is empty: no weather data was fetched, so there is nothing to write.")

initCurrentDf = _stack([_prepareCurrent(cityDf) for cityDf in dataframeList])
initDailyDf = _stack([_prepareDaily(cityDf) for cityDf in dataframeList])
initHourlyDf = _stack([_prepareHourly(cityDf) for cityDf in dataframeList])

# presumably the last two arguments are the write mode and the partition column —
# TODO confirm against writeDfToDeltaTable in utilityFunctions
writeDfToDeltaTable(initCurrentDf, databaseName, CurrentTableName, "overwrite", "time_data_queried_utc")
writeDfToDeltaTable(initDailyDf, databaseName, DailyTableName, "overwrite", "time_data_queried_utc")
writeDfToDeltaTable(initHourlyDf, databaseName, HourlyTableName, "overwrite", "time_data_queried_utc")

In [0]:
# List the tables registered in the "bronze" database (the database the
# previous cell writes to) and print the result.
listRawTables = spark.catalog.listTables("bronze")
print(listRawTables)