In [22]:
# Years to process, stored as the text of a Python list literal; it is
# converted to a real list with ast.literal_eval further down.
# NOTE(review): presumably a parameter cell overridden by the calling pipeline — confirm.
years="[2001,2002,2003,2004,2005]"

In [23]:
from pyspark.sql.functions import current_timestamp, month, col, when
from delta.tables import *
import ast

In [24]:
# Parse the `years` string into the Python literal it spells (a list of years
# for the default value). literal_eval only accepts literals, unlike eval().
year_list=ast.literal_eval(years)

In [25]:
# Bronze layer: one source folder per entity. The processing functions read
# "<folder><year>/*.json" beneath these paths.
pathWSRFolder="abfss://bronze@datalakemotogp.dfs.core.windows.net/world-standing-riders/"
pathEventsFolder="abfss://bronze@datalakemotogp.dfs.core.windows.net/events/"
pathFRFolder="abfss://bronze@datalakemotogp.dfs.core.windows.net/full-results/"

# Silver layer: one Delta table location per entity. Each processing function
# merges into its path, or creates the table there on the first write.
pathWSRSilver="abfss://silver@datalakemotogp.dfs.core.windows.net/api-racing-mike/motogp/world-standing-riders/"
pathEventsSilver="abfss://silver@datalakemotogp.dfs.core.windows.net/api-racing-mike/motogp/events/"
pathFRSilver="abfss://silver@datalakemotogp.dfs.core.windows.net/api-racing-mike/motogp/full-results/"

## ***Common functions***

In [26]:
def replace_undefined_with_null(df):
    """Return ``df`` with the literal string 'undefined' replaced by NULL.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame with the same column names in the same order. In every
        string-typed column, values equal to 'undefined' become NULL; columns
        of any other type are passed through unchanged.
    """
    # Only string columns can hold the 'undefined' marker. Skipping the others
    # avoids comparing numeric or complex columns against a string literal.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    # Build all expressions first and project once: calling withColumn in a
    # loop adds one projection per column and inflates the query plan.
    cleaned = [
        when(col(name) == 'undefined', None).otherwise(col(name)).alias(name)
        if name in string_columns
        else col(name)
        for name in df.columns
    ]
    return df.select(*cleaned)

In [27]:
# Set the session option "spark.microsoft.delta.optimizeWrite.enabled" to "true".
# NOTE(review): presumably enables optimized Delta writes on the Microsoft Spark
# runtime for the writes performed below — confirm against the platform docs.
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

# ***World-Standing-Riders***

In [28]:
def process_wsr_silver(year):
    """Load one year of world-standing-riders JSON from bronze and upsert it into silver.

    Reads ``{pathWSRFolder}{year}/*.json``, keeps the standings columns,
    replaces 'undefined' values with NULL, stamps ``date_load`` and keeps one
    row per ``classification_id``. The batch is merged into the Delta table at
    ``pathWSRSilver`` on ``classification_id``. If that path is not a Delta
    table yet, the table is created from this batch, partitioned by ``year``.

    Args:
        year: Name of the sub-folder under ``pathWSRFolder`` to process.

    Raises:
        Exception: Any error other than the "not a Delta table" case is
            re-raised unchanged.
    """
    df = spark.read.json(f"{pathWSRFolder}{year}/*.json")
    dfWSR = df.select(
        "classification_id", "points", "position", "rider_full_name",
        "rider_country_iso", "constructor_name", "team_color", "text_color", "year",
    )
    dfWSR = replace_undefined_with_null(dfWSR)
    dfWSR = dfWSR.withColumn("date_load", current_timestamp())
    # dropDuplicates returns a new DataFrame, so the result must be assigned.
    # Otherwise duplicate keys reach the MERGE (which fails when several source
    # rows match one target row) and the initial write.
    dfWSR = dfWSR.dropDuplicates(["classification_id"])
    try:
        wsr_silver = DeltaTable.forPath(spark, pathWSRSilver)
        wsr_silver.alias("existing").merge(
            dfWSR.alias("new"),
            "existing.classification_id = new.classification_id",
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
    except Exception as e:
        if 'not a Delta table' in str(e):
            # First load: the silver table does not exist yet, so create it.
            dfWSR.write.format("delta").partitionBy("year").save(pathWSRSilver)
        else:
            raise

# ***Events***

In [29]:
def process_events_silver(year):
    """Load one year of events JSON from bronze and upsert it into silver.

    Reads ``{pathEventsFolder}{year}/*.json``, keeps the event columns,
    replaces 'undefined' values with NULL, adds ``date_load``, renames
    ``name``/``id`` to ``event_name``/``event_id`` and keeps one row per
    ``event_id``. The batch is merged into the Delta table at
    ``pathEventsSilver``. If that path is not a Delta table yet, the table is
    created from this batch, partitioned by ``year``.

    Args:
        year: Name of the sub-folder under ``pathEventsFolder`` to process.
    """
    source_columns = [
        "id", "name", "short_name", "sponsored_name",
        "date_start", "date_end", "country_iso", "country_name", "year",
    ]
    raw_events = spark.read.json(f"{pathEventsFolder}{year}/*.json")
    events = replace_undefined_with_null(raw_events.select(*source_columns))
    events = events.withColumn("date_load", current_timestamp())
    events = events.withColumnRenamed("name", "event_name")
    events = events.withColumnRenamed("id", "event_id")
    events = events.dropDuplicates(["event_id"])
    try:
        target = DeltaTable.forPath(spark, pathEventsSilver)
        upsert = target.alias("existing").merge(
            events.alias("new"),
            "existing.event_id=new.event_id",
        )
        upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
    except Exception as err:
        # Anything other than a missing silver table is a real failure.
        if 'not a Delta table' not in str(err):
            raise err
        # First load: create the silver table from this batch.
        events.write.format("delta").partitionBy("year").save(pathEventsSilver)

# ***Full-Results***

In [30]:
def process_full_results_silver(year):
    """Load one year of full-results JSON from bronze and upsert it into silver.

    Reads ``{pathFRFolder}{year}/*.json`` and the same year's events
    (``{pathEventsFolder}{year}/*.json``). 'undefined' values become NULL in
    both, then each result is left-joined to its event to pick up
    ``date_end``. That column is exposed as ``date`` and its month as
    ``month``; ``date_load`` is stamped and one row is kept per merge key. The
    batch is merged into the Delta table at ``pathFRSilver``. If that path is
    not a Delta table yet, the table is created from this batch, partitioned
    by ``year`` and ``month``.

    Args:
        year: Name of the sub-folder under both bronze folders to process.

    Raises:
        Exception: Any error other than the "not a Delta table" case is
            re-raised unchanged.
    """
    # One key list drives both the de-duplication and the merge condition.
    merge_keys = ["event_id", "result_id", "rider_id", "team_id", "circuit_id"]

    df = spark.read.json(f"{pathFRFolder}{year}/*.json")
    dfEvents = spark.read.json(f"{pathEventsFolder}{year}/*.json")
    dfEvents = dfEvents.withColumnRenamed("id", "event_id").select("event_id", "date_end")
    dfFR = df.select(
        "event_id", "result_id", "position", "points", "time", "gap_first", "gap_lap", "total_laps", "average_speed",
        "rider_id", "rider_full_name", "rider_country_iso", "rider_country_name", "rider_number", "years_old", "birth_date", "birth_city", "rider_type",
        "team_id", "team_name", "team_color", "sponsored_team", "constructor_name",
        "circuit_id", "circuit_name", "circuit_nation", "circuit_place", "circuit_country_iso", "circuit_country_name",
        "ground_condition", "humidity_condition", "weather_condition", "track_condition",
        "profile_picture_url", "portrait_picture_url", "number_picture_url", "picture_url", "bike_picture_url", "country_flag_url", "helmet_picture_url", "year",
    )
    dfFR = replace_undefined_with_null(dfFR)
    dfEvents = replace_undefined_with_null(dfEvents)
    dfFR = dfFR.join(dfEvents, on="event_id", how="left")
    dfFR = (
        dfFR.withColumn("month", month("date_end"))
        .withColumnRenamed("date_end", "date")
        .withColumn("date_load", current_timestamp())
    )
    dfFR = dfFR.dropDuplicates(merge_keys)

    # Key columns can be NULL once 'undefined' has been replaced. With plain
    # "=" a NULL key never matches, so such rows would be inserted again on
    # every run. "<=>" is null-safe equality and treats NULLs as equal, the
    # same way dropDuplicates does.
    merge_condition = " AND ".join(
        f"existing.{key} <=> new.{key}" for key in merge_keys
    )
    try:
        fr_silver = DeltaTable.forPath(spark, pathFRSilver)
        fr_silver.alias("existing").merge(
            dfFR.alias("new"), merge_condition
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
    except Exception as e:
        if 'not a Delta table' in str(e):
            # First load: the silver table does not exist yet, so create it.
            dfFR.write.format("delta").partitionBy("year", "month").save(pathFRSilver)
        else:
            raise

# ***Final loop to transform the 3 entities***


In [31]:
# Run the three silver loads for every requested year. Within a year the order
# is world-standing-riders, then events, then full-results.
silver_stages = (process_wsr_silver, process_events_silver, process_full_results_silver)
for year in year_list:
    for run_stage in silver_stages:
        run_stage(year)
        