In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Example of a complete geocoding request URL:
# source_data = 'https://geocoding-api.open-meteo.com/v1/search?name=kovilpatti&count=10&language=en&format=json'

# The request URL split in two; in the example above the searched place name
# sits between these two parts.
source_base_url = 'https://geocoding-api.open-meteo.com/v1/search?name='
source_relative_url = '&count=10&language=en&format=json'

# Components of the abfss:// storage URI (container, account, folder).
sink_account_name = 'madhupavanadls'
sink_layer = 'bronze'
sink_folder = 'geo-data'

# NOTE: despite its name, this path is what the notebook reads its input from.
destination_location = (
    f'abfss://{sink_layer}@{sink_account_name}.dfs.core.windows.net/{sink_folder}'
)

### Read the data from source

In [0]:
# Load the JSON data stored under destination_location into a DataFrame.
geo_df = (
    spark.read
    .format('json')
    .load(destination_location)
)

In [0]:
# Preview at most 10 rows of the loaded DataFrame.
geo_df.limit(10).display()

In [0]:
# Keep only the fields of `results` that are needed downstream, giving each a
# descriptive column name. Each (source path, output name) pair below becomes
# one column, in the order listed.
# NOTE(review): `results` is presumably an array of structs, which would make
# every selected column an array -- confirm against the bronze schema.
limit_col_geo = geo_df.select(
    *[
        col(source_path).alias(output_name)
        for source_path, output_name in (
            ('results.admin1', 'state_name'),
            ('results.admin2', 'district_name'),
            ('results.country', 'country_name'),
            ('results.name', 'market_name'),
            ('results.latitude', 'latitude'),
            ('results.longitude', 'longitude'),
            ('results.population', 'population'),
        )
    ]
)
    

In [0]:
# Flatten the per-attribute arrays of limit_col_geo into one row per array
# position, so that element i of every array ends up on the same output row.
#
# The arrays are zipped element-wise and exploded ONCE. Exploding each array in
# its own DataFrame and re-joining on monotonically_increasing_id() is unsafe:
# those IDs are only guaranteed to be unique and monotonically increasing (they
# encode the partition ID), not consecutive or identical across independently
# evaluated DataFrames, so attributes could be paired with the wrong row or
# silently dropped by the inner joins. Zipping also avoids six joins.
#
# Note: arrays_zip pads shorter arrays with nulls instead of dropping rows.
geo_columns = [
    'state_name',
    'district_name',
    'country_name',
    'market_name',
    'latitude',
    'longitude',
    'population',
]

final_df = (
    limit_col_geo
    # One struct per array position, then one output row per struct.
    .select(explode(arrays_zip(*[col(name) for name in geo_columns])).alias('geo_entry'))
    .select('geo_entry.*')
    # Rename positionally so the output column names do not depend on how
    # arrays_zip names the struct fields.
    .toDF(*geo_columns)
)
# final_df.limit(10).display()
# Show final result


### Write the data into the silver layer

In [0]:
# Save final_df as the Delta table uc_prod.silver.geo_location_silver,
# overwriting existing contents; overwriteSchema lets the overwrite replace
# the table schema as well.
(
    final_df.write
    .format("delta")
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .saveAsTable("uc_prod.silver.geo_location_silver")
)


In [0]:
# %sql
# select * from uc_prod.silver.geo_location_silver as gls
# join uc_prod.silver.daily_pricing_silver as dps on gls.market_name = dps.market_name