### 1: Import Necessary Libraries

In [0]:
!pip install pycountry

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import when, col, udf
import pycountry

### 2: Mounting the ADLS (Storage Resource)

In [0]:
# Evaluate the `spark` session object so the notebook renders it as the cell result.
spark

In [0]:
# Secret scope whose key names are listed below.
scope_name = 'olympic-secret-kv'

try:
    secrets_list = dbutils.secrets.list(scope_name)
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(f"Error: {err}")
else:
    try:
        # Only the key names are printed.
        for entry in secrets_list:
            print(f"Secret Name: {entry.key}")
    except Exception as err:
        print(f"Error: {err}")


Secret Name: adls-access-key


In [0]:
storageAccountName = "olympicsdatasa"
# Read the access key from the secret scope listed in the previous cell
# ('olympic-secret-kv'); the scope name used here before was spelled 'olymp-secret-kv'.
storageAccountAccessKey = dbutils.secrets.get('olympic-secret-kv', 'adls-access-key')
container_name = "tokyo-olympics-data"
mount_point = f"/mnt/{container_name}"

# Mount only when the mount point is not registered yet, so the cell can be re-run.
if not any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    try:
        dbutils.fs.mount(
            source=f"wasbs://{container_name}@{storageAccountName}.blob.core.windows.net",
            mount_point=mount_point,
            extra_configs={
                f"fs.azure.account.key.{storageAccountName}.blob.core.windows.net": storageAccountAccessKey
            },
        )
        print(f"{container_name} mount succeeded!")
    except Exception as e:
        # The error is reported but not re-raised, so later cells still run.
        print("mount exception", e)
else:
    print("Already Mounted", mount_point)

Already Mounted /mnt/tokyo-olympics-data


In [0]:
# List the current mount points; the container mount should appear under /mnt/.
dbutils.fs.mounts()

[MountInfo(mountPoint='/databricks-datasets', source='databricks-datasets', encryptionType=''),
 MountInfo(mountPoint='/Volumes', source='UnityCatalogVolumes', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-tracking', source='databricks/mlflow-tracking', encryptionType=''),
 MountInfo(mountPoint='/mnt/tokyo-olympics-data', source='wasbs://tokyo-olympics-data@olympicsdatasa.blob.core.windows.net', encryptionType=''),
 MountInfo(mountPoint='/databricks-results', source='databricks-results', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-registry', source='databricks/mlflow-registry', encryptionType=''),
 MountInfo(mountPoint='/Volume', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/volumes', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/', source='DatabricksRoot', encryptionType=''),
 MountInfo(mountPoint='/volume', source='DbfsReserved', encryptionType='')]

In [0]:
# List the raw CSV files in the mounted container (the dbutils form of `%fs ls`).
display(dbutils.fs.ls("/mnt/tokyo-olympics-data/raw-data/"))

path,name,size,modificationTime
dbfs:/mnt/tokyo-olympics-data/raw-data/Athletes.csv,Athletes.csv,418514,1737435934000
dbfs:/mnt/tokyo-olympics-data/raw-data/Coaches.csv,Coaches.csv,16893,1737435948000
dbfs:/mnt/tokyo-olympics-data/raw-data/EntriesGender.csv,EntriesGender.csv,1126,1737435986000
dbfs:/mnt/tokyo-olympics-data/raw-data/Medals.csv,Medals.csv,2414,1737436001000
dbfs:/mnt/tokyo-olympics-data/raw-data/Teams.csv,Teams.csv,35303,1737436018000


### 3: Read the data

**Athletes**

In [0]:
# Load the raw Athletes CSV; the first row supplies the column names.
df_athlets = spark.read.format("csv").option("header", "true").load("dbfs:/mnt/tokyo-olympics-data/raw-data/Athletes.csv")
print('*' * 50)
print("Number of rows: ", df_athlets.count())
print('*' * 50)
print("Datatypes")
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
df_athlets.printSchema()
print('*' * 50)
df_athlets.show(5)

**************************************************
Number of rows:  11085
**************************************************
Datatypes
root
 |-- Name: string (nullable = true)
 |-- NOC: string (nullable = true)
 |-- Discipline: string (nullable = true)

None
**************************************************
+-----------------+------+-------------------+
|             Name|   NOC|         Discipline|
+-----------------+------+-------------------+
|  AALERUD Katrine|Norway|       Cycling Road|
|      ABAD Nestor| Spain|Artistic Gymnastics|
|ABAGNALE Giovanni| Italy|             Rowing|
|   ABALDE Alberto| Spain|         Basketball|
|    ABALDE Tamara| Spain|         Basketball|
+-----------------+------+-------------------+
only showing top 5 rows



**Coaches**

In [0]:
# Load the raw Coaches CSV; the first row supplies the column names.
df_coaches = spark.read.format("csv").option("header", "true").load("dbfs:/mnt/tokyo-olympics-data/raw-data/Coaches.csv")
print('*' * 50)
print("Number of rows: ", df_coaches.count())
print('*' * 50)
print("Datatypes")
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
df_coaches.printSchema()
print('*' * 50)
df_coaches.show(5)

**************************************************
Number of rows:  394
**************************************************
Datatypes
root
 |-- Name: string (nullable = true)
 |-- NOC: string (nullable = true)
 |-- Discipline: string (nullable = true)
 |-- Event: string (nullable = true)

None
**************************************************
+---------------+-------------+----------+-----+
|           Name|          NOC|Discipline|Event|
+---------------+-------------+----------+-----+
|ABDELMAGID Wael|        Egypt|  Football| NULL|
|      ABE Junya|        Japan|Volleyball| NULL|
|  ABE Katsuhiko|        Japan|Basketball| NULL|
|   ADAMA Cherif|Côte d'Ivoire|  Football| NULL|
|     AGEBA Yuya|        Japan|Volleyball| NULL|
+---------------+-------------+----------+-----+
only showing top 5 rows



**EntriesGender**

In [0]:
# Load the raw EntriesGender CSV; the first row supplies the column names.
df_entry_gender = spark.read.format("csv").option("header", "true").load("dbfs:/mnt/tokyo-olympics-data/raw-data/EntriesGender.csv")
print('*' * 50)
print("Number of rows: ", df_entry_gender.count())
print('*' * 50)
print("Datatypes")
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
df_entry_gender.printSchema()
print('*' * 50)
df_entry_gender.show(5)

**************************************************
Number of rows:  46
**************************************************
Datatypes
root
 |-- Discipline: string (nullable = true)
 |-- Female: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Total: string (nullable = true)

None
**************************************************
+-------------------+------+----+-----+
|         Discipline|Female|Male|Total|
+-------------------+------+----+-----+
|     3x3 Basketball|    32|  32|   64|
|            Archery|    64|  64|  128|
|Artistic Gymnastics|    98|  98|  196|
|  Artistic Swimming|   105|   0|  105|
|          Athletics|   969|1072| 2041|
+-------------------+------+----+-----+
only showing top 5 rows



**Medals**

In [0]:
# Load the raw Medals CSV; the first row supplies the column names.
df_medals = spark.read.format("csv").option("header", "true").load("dbfs:/mnt/tokyo-olympics-data/raw-data/Medals.csv")
print('*' * 50)
print("Number of rows: ", df_medals.count())
print('*' * 50)
print("Datatypes")
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
df_medals.printSchema()
print('*' * 50)
df_medals.show(5)

**************************************************
Number of rows:  93
**************************************************
Datatypes
root
 |-- Rank: string (nullable = true)
 |-- Team/NOC: string (nullable = true)
 |-- Gold: string (nullable = true)
 |-- Silver: string (nullable = true)
 |-- Bronze: string (nullable = true)
 |-- Total: string (nullable = true)
 |-- Rank by Total: string (nullable = true)

None
**************************************************
+----+--------------------+----+------+------+-----+-------------+
|Rank|            Team/NOC|Gold|Silver|Bronze|Total|Rank by Total|
+----+--------------------+----+------+------+-----+-------------+
|   1|United States of ...|  39|    41|    33|  113|            1|
|   2|People's Republic...|  38|    32|    18|   88|            2|
|   3|               Japan|  27|    14|    17|   58|            5|
|   4|       Great Britain|  22|    21|    22|   65|            4|
|   5|                 ROC|  20|    28|    23|   71|            3|


**Teams**

In [0]:
# Load the raw Teams CSV; the first row supplies the column names.
df_teams = spark.read.format("csv").option("header", "true").load("dbfs:/mnt/tokyo-olympics-data/raw-data/Teams.csv")
print('*' * 50)
print("Number of rows: ", df_teams.count())
print('*' * 50)
print("Datatypes")
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
df_teams.printSchema()
print('*' * 50)
df_teams.show(5)

**************************************************
Number of rows:  743
**************************************************
Datatypes
root
 |-- Name: string (nullable = true)
 |-- Discipline: string (nullable = true)
 |-- NOC: string (nullable = true)
 |-- Event: string (nullable = true)

None
**************************************************
+-------+--------------+--------------------+-----+
|   Name|    Discipline|                 NOC|Event|
+-------+--------------+--------------------+-----+
|Belgium|3x3 Basketball|             Belgium|  Men|
|  China|3x3 Basketball|People's Republic...|  Men|
|  China|3x3 Basketball|People's Republic...|Women|
| France|3x3 Basketball|              France|Women|
|  Italy|3x3 Basketball|               Italy|Women|
+-------+--------------+--------------------+-----+
only showing top 5 rows



### 4: Transformations

- <b>1: Change `Column Names`</b>

In [0]:
# Rename the raw CSV headers to descriptive snake_case names.
# Old names are spelled exactly as in the CSV headers, so no rename depends on
# case-insensitive column resolution.

# Athletes
df_athlets = (
    df_athlets
    .withColumnRenamed("Name", "athlete_name")
    .withColumnRenamed("NOC", "country_name")
    .withColumnRenamed("Discipline", "sport")
)


# Coaches
df_coaches = (
    df_coaches
    .withColumnRenamed("Name", "coach_name")
    .withColumnRenamed("NOC", "country_name")
    .withColumnRenamed("Discipline", "sport")
    .withColumnRenamed("Event", "event_name")
)


# EntriesGender
df_entry_gender = (
    df_entry_gender
    # Was "discipline"; the CSV header is "Discipline".
    .withColumnRenamed("Discipline", "sport")
    .withColumnRenamed("Female", "female_count")
    .withColumnRenamed("Male", "male_count")
    .withColumnRenamed("Total", "total_count")
)


# Medals
df_medals = (
    df_medals
    .withColumnRenamed("Rank", "rank_number")
    .withColumnRenamed("Team/NOC", "country_name")
    .withColumnRenamed("Gold", "gold_medals")
    .withColumnRenamed("Silver", "silver_medals")
    .withColumnRenamed("Bronze", "bronze_medals")
    .withColumnRenamed("Total", "total_medals")
    .withColumnRenamed("Rank by Total", "rank_by_total_medals")
)


# Teams
df_teams = (
    df_teams
    .withColumnRenamed("Name", "team")
    .withColumnRenamed("NOC", "country_name")
    .withColumnRenamed("Discipline", "sport")
    .withColumnRenamed("Event", "event_name")
)

In [0]:
# Preview 5 athlete rows to check the renamed columns.
df_athlets.show(5)

+-----------------+------------+-------------------+
|     athlete_name|country_name|              sport|
+-----------------+------------+-------------------+
|  AALERUD Katrine|      Norway|       Cycling Road|
|      ABAD Nestor|       Spain|Artistic Gymnastics|
|ABAGNALE Giovanni|       Italy|             Rowing|
|   ABALDE Alberto|       Spain|         Basketball|
|    ABALDE Tamara|       Spain|         Basketball|
+-----------------+------------+-------------------+
only showing top 5 rows



In [0]:
# Preview 5 coach rows to check the renamed columns.
df_coaches.show(5)

+---------------+-------------+----------+----------+
|     coach_name| country_name|     sport|event_name|
+---------------+-------------+----------+----------+
|ABDELMAGID Wael|        Egypt|  Football|      NULL|
|      ABE Junya|        Japan|Volleyball|      NULL|
|  ABE Katsuhiko|        Japan|Basketball|      NULL|
|   ADAMA Cherif|Côte d'Ivoire|  Football|      NULL|
|     AGEBA Yuya|        Japan|Volleyball|      NULL|
+---------------+-------------+----------+----------+
only showing top 5 rows



In [0]:
# Preview 5 entries-by-gender rows to check the renamed columns.
df_entry_gender.show(5)

+-------------------+------------+----------+-----------+
|              sport|female_count|male_count|total_count|
+-------------------+------------+----------+-----------+
|     3x3 Basketball|          32|        32|         64|
|            Archery|          64|        64|        128|
|Artistic Gymnastics|          98|        98|        196|
|  Artistic Swimming|         105|         0|        105|
|          Athletics|         969|      1072|       2041|
+-------------------+------------+----------+-----------+
only showing top 5 rows



In [0]:
# Preview 10 medal rows to check the renamed columns.
df_medals.show(10)

+-----------+--------------------+-----------+-------------+-------------+------------+--------------------+
|rank_number|        country_name|gold_medals|silver_medals|bronze_medals|total_medals|rank_by_total_medals|
+-----------+--------------------+-----------+-------------+-------------+------------+--------------------+
|          1|United States of ...|         39|           41|           33|         113|                   1|
|          2|People's Republic...|         38|           32|           18|          88|                   2|
|          3|               Japan|         27|           14|           17|          58|                   5|
|          4|       Great Britain|         22|           21|           22|          65|                   4|
|          5|                 ROC|         20|           28|           23|          71|                   3|
|          6|           Australia|         17|            7|           22|          46|                   6|
|          7|      

In [0]:
# Preview 5 team rows to check the renamed columns.
df_teams.show(5)

+-------+--------------+--------------------+----------+
|   team|         sport|        country_name|event_name|
+-------+--------------+--------------------+----------+
|Belgium|3x3 Basketball|             Belgium|       Men|
|  China|3x3 Basketball|People's Republic...|       Men|
|  China|3x3 Basketball|People's Republic...|     Women|
| France|3x3 Basketball|              France|     Women|
|  Italy|3x3 Basketball|               Italy|     Women|
+-------+--------------+--------------------+----------+
only showing top 5 rows



- <b>2:  Handle `NULL or Missing` Values </b>

In [0]:
# Inspect 10 coach rows; "event_name" is NULL for some of them (see the output).
df_coaches.show(10)

+--------------------+--------------------+-----------------+----------+
|          coach_name|        country_name|            sport|event_name|
+--------------------+--------------------+-----------------+----------+
|     ABDELMAGID Wael|               Egypt|         Football|      NULL|
|           ABE Junya|               Japan|       Volleyball|      NULL|
|       ABE Katsuhiko|               Japan|       Basketball|      NULL|
|        ADAMA Cherif|       Côte d'Ivoire|         Football|      NULL|
|          AGEBA Yuya|               Japan|       Volleyball|      NULL|
|AIKMAN Siegfried ...|               Japan|           Hockey|       Men|
|       AL SAADI Kais|             Germany|           Hockey|       Men|
|       ALAMEDA Lonni|              Canada|Baseball/Softball|  Softball|
|     ALEKNO Vladimir|Islamic Republic ...|       Volleyball|       Men|
|     ALEKSEEV Alexey|                 ROC|         Handball|     Women|
+--------------------+--------------------+--------

In [0]:
from pyspark.sql.functions import when, col


# Keep existing "event_name" values and label rows where it is NULL as "Not Specified".
df_coaches = df_coaches.withColumn(
    "event_name",
    when(col("event_name").isNotNull(), col("event_name")).otherwise("Not Specified"),
)

In [0]:
# Show the same preview again to check that NULL event names were replaced.
df_coaches.show(10)

+--------------------+--------------------+-----------------+-------------+
|          coach_name|        country_name|            sport|   event_name|
+--------------------+--------------------+-----------------+-------------+
|     ABDELMAGID Wael|               Egypt|         Football|Not Specified|
|           ABE Junya|               Japan|       Volleyball|Not Specified|
|       ABE Katsuhiko|               Japan|       Basketball|Not Specified|
|        ADAMA Cherif|       Côte d'Ivoire|         Football|Not Specified|
|          AGEBA Yuya|               Japan|       Volleyball|Not Specified|
|AIKMAN Siegfried ...|               Japan|           Hockey|          Men|
|       AL SAADI Kais|             Germany|           Hockey|          Men|
|       ALAMEDA Lonni|              Canada|Baseball/Softball|     Softball|
|     ALEKNO Vladimir|Islamic Republic ...|       Volleyball|          Men|
|     ALEKSEEV Alexey|                 ROC|         Handball|        Women|
+-----------

- <b>3: Remove any `duplicate` values </b>

In [0]:
# Keep one copy of every fully identical row in each DataFrame.
df_athlets = df_athlets.distinct()
df_coaches = df_coaches.distinct()
df_entry_gender = df_entry_gender.distinct()
df_medals = df_medals.distinct()
df_teams = df_teams.distinct()

- <b>4: Change `data types`</b>

In [0]:
df_entry_gender.show(5)
print('*' * 50)
# printSchema() prints the schema itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
df_entry_gender.printSchema()


+--------------------+------------+----------+-----------+
|               sport|female_count|male_count|total_count|
+--------------------+------------+----------+-----------+
|       Skateboarding|          40|        40|         80|
|        Canoe Slalom|          41|        41|         82|
|Cycling Mountain ...|          38|        38|         76|
|      Sport Climbing|          20|        20|         40|
|             Fencing|         107|       108|        215|
+--------------------+------------+----------+-----------+
only showing top 5 rows

**************************************************
root
 |-- sport: string (nullable = true)
 |-- female_count: string (nullable = true)
 |-- male_count: string (nullable = true)
 |-- total_count: string (nullable = true)

None


In [0]:
df_medals.show(3)
print('*' * 50)
# printSchema() prints the schema itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
df_medals.printSchema()


+-----------+------------+-----------+-------------+-------------+------------+--------------------+
|rank_number|country_name|gold_medals|silver_medals|bronze_medals|total_medals|rank_by_total_medals|
+-----------+------------+-----------+-------------+-------------+------------+--------------------+
|         48|       India|          1|            2|            4|           7|                  33|
|         46|   Venezuela|          1|            3|            0|           4|                  47|
|         53|     Austria|          1|            1|            5|           7|                  33|
+-----------+------------+-----------+-------------+-------------+------------+--------------------+
only showing top 3 rows

**************************************************
root
 |-- rank_number: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- gold_medals: string (nullable = true)
 |-- silver_medals: string (nullable = true)
 |-- bronze_medals: string (nullable 

In [0]:
from pyspark.sql.types import IntegerType

# Entry Gender: the count columns were read from CSV as strings; cast them to integers.
for column_name in ["female_count", "male_count", "total_count"]:
    df_entry_gender = df_entry_gender.withColumn(
        column_name, df_entry_gender[column_name].cast(IntegerType())
    )

# Medals: rank and medal-count columns are cast the same way.
columns_to_convert = ["rank_number", "gold_medals", "silver_medals", "bronze_medals", "total_medals", "rank_by_total_medals"]

# The loop variable is deliberately not named `col`: that would overwrite the
# imported pyspark.sql.functions.col and break any later cell that calls col(...).
for column_name in columns_to_convert:
    df_medals = df_medals.withColumn(column_name, df_medals[column_name].cast(IntegerType()))

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly rather than inside print(), which echoed "None".
df_entry_gender.printSchema()
print('*' * 50)
df_medals.printSchema()

root
 |-- sport: string (nullable = true)
 |-- female_count: integer (nullable = true)
 |-- male_count: integer (nullable = true)
 |-- total_count: integer (nullable = true)

None
**************************************************
root
 |-- rank_number: integer (nullable = true)
 |-- country_name: string (nullable = true)
 |-- gold_medals: integer (nullable = true)
 |-- silver_medals: integer (nullable = true)
 |-- bronze_medals: integer (nullable = true)
 |-- total_medals: integer (nullable = true)
 |-- rank_by_total_medals: integer (nullable = true)

None


- <b>5: Add `Country Codes`</b>

In [0]:
# Repeats the pycountry install already done in the first cell of the notebook.
!pip install pycountry

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Names that are looked up under a different spelling:
# name in `countries` -> name handed to the country-code lookup.
rename_countries = {
    "Chinese Taipei": "Taiwan",
    "Great Britain": "United Kingdom",
    "Republic of Korea": "South Korea",
    "Turkey": "Türkiye",
    "Hong Kong, China": "Hong Kong",
}

# Country names to resolve to country codes.
# NOTE(review): presumably the distinct country names of the teams data - confirm if the source data changes.
countries = [
    "Belgium", "People's Republic of China", "France", "Italy", "Japan", "Latvia",
    "Mongolia", "Netherlands", "Poland", "ROC", "Romania", "Serbia",
    "United States of America", "Australia", "Bangladesh", "Belarus", "Brazil", "Canada",
    "Chinese Taipei", "Colombia", "Egypt", "Germany", "Great Britain", "India",
    "Indonesia", "Kazakhstan", "Malaysia", "Mexico", "Republic of Moldova", "Republic of Korea",
    "Spain", "Tunisia", "Turkey", "Ukraine", "Vietnam", "Switzerland",
    "Austria", "Greece", "Israel", "Liechtenstein", "South Africa", "Bahamas",
    "Botswana", "Cuba", "Czech Republic", "Denmark", "Dominican Republic", "Ecuador",
    "Ghana", "Ireland", "Jamaica", "Nigeria", "Trinidad and Tobago", "Argentina",
    "Islamic Republic of Iran", "Puerto Rico", "Slovenia", "Chile", "Kenya", "Morocco",
    "Norway", "Qatar", "Hong Kong, China", "Lithuania", "New Zealand", "Estonia",
    "Hungary", "Côte d'Ivoire", "Honduras", "Saudi Arabia", "Sweden", "Zambia",
    "Angola", "Bahrain", "Montenegro", "Portugal", "Azerbaijan", "Bulgaria",
    "Uzbekistan", "Fiji", "Croatia", "Singapore", "Slovakia", "Venezuela",
]


In [0]:
import pycountry

def get_country_code(country_name):
    """Return the ``alpha_3`` code of the pycountry match for ``country_name``.

    Returns None when ``pycountry.countries.lookup`` raises ``LookupError``.
    """
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except LookupError:
        return None
    
# Build {country name -> code}. "ROC" is hard-coded to "RUS"; names listed in
# rename_countries are looked up under their replacement name, all others as-is.
country_code_mapping = {
    name: "RUS" if name == "ROC" else get_country_code(rename_countries.get(name, name))
    for name in countries
}

# Print every name/code pair, one per line.
for name, code in country_code_mapping.items():
    print(f"{name}: {code}")

Belgium: BEL
People's Republic of China: CHN
France: FRA
Italy: ITA
Japan: JPN
Latvia: LVA
Mongolia: MNG
Netherlands: NLD
Poland: POL
ROC: RUS
Romania: ROU
Serbia: SRB
United States of America: USA
Australia: AUS
Bangladesh: BGD
Belarus: BLR
Brazil: BRA
Canada: CAN
Chinese Taipei: TWN
Colombia: COL
Egypt: EGY
Germany: DEU
Great Britain: GBR
India: IND
Indonesia: IDN
Kazakhstan: KAZ
Malaysia: MYS
Mexico: MEX
Republic of Moldova: MDA
Republic of Korea: KOR
Spain: ESP
Tunisia: TUN
Turkey: TUR
Ukraine: UKR
Vietnam: VNM
Switzerland: CHE
Austria: AUT
Greece: GRC
Israel: ISR
Liechtenstein: LIE
South Africa: ZAF
Bahamas: BHS
Botswana: BWA
Cuba: CUB
Czech Republic: CZE
Denmark: DNK
Dominican Republic: DOM
Ecuador: ECU
Ghana: GHA
Ireland: IRL
Jamaica: JAM
Nigeria: NGA
Trinidad and Tobago: TTO
Argentina: ARG
Islamic Republic of Iran: IRN
Puerto Rico: PRI
Slovenia: SVN
Chile: CHL
Kenya: KEN
Morocco: MAR
Norway: NOR
Qatar: QAT
Hong Kong, China: HKG
Lithuania: LTU
New Zealand: NZL
Estonia: EST
Hunga

In [0]:
# Preview 5 team rows before the country code column is added.
df_teams.show(5)

+-------------+-------------------+--------------------+------------+
|         team|              sport|        country_name|  event_name|
+-------------+-------------------+--------------------+------------+
|Great Britain|Artistic Gymnastics|       Great Britain|Women's Team|
|      Nigeria|         Basketball|             Nigeria|       Women|
|  Netherlands|           Handball|         Netherlands|       Women|
|        Japan|         Volleyball|               Japan|         Men|
|United States|            Archery|United States of ...|Women's Team|
+-------------+-------------------+--------------------+------------+
only showing top 5 rows



In [0]:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def map_country_to_code(country_name):
    """Return the entry of ``country_code_mapping`` for ``country_name``, or None if there is none."""
    return country_code_mapping.get(country_name)


# Wrap the lookup as a Spark UDF that returns a string column.
map_country_to_code_udf = udf(map_country_to_code, returnType=StringType())

# Add "country_code", computed from each row's "country_name".
df_teams = df_teams.withColumn("country_code", map_country_to_code_udf("country_name"))

In [0]:
# Preview 5 team rows to check the new "country_code" column.
df_teams.show(5)

+-------------+-------------------+--------------------+------------+------------+
|         team|              sport|        country_name|  event_name|country_code|
+-------------+-------------------+--------------------+------------+------------+
|Great Britain|Artistic Gymnastics|       Great Britain|Women's Team|         GBR|
|      Nigeria|         Basketball|             Nigeria|       Women|         NGA|
|  Netherlands|           Handball|         Netherlands|       Women|         NLD|
|        Japan|         Volleyball|               Japan|         Men|         JPN|
|United States|            Archery|United States of ...|Women's Team|         USA|
+-------------+-------------------+--------------------+------------+------------+
only showing top 5 rows



### 5: Write Transformed Data Back to ADLS (Storage Account)

In [0]:
# Write each transformed DataFrame as Parquet, replacing any existing output.
# The paths are absolute ("dbfs:/mnt/...") like the read paths, so the output
# location does not depend on how a relative "mnt/..." path is resolved.
df_athlets.write.mode("overwrite").parquet("dbfs:/mnt/tokyo-olympics-data/transformed_data/athletes")
df_coaches.write.mode("overwrite").parquet("dbfs:/mnt/tokyo-olympics-data/transformed_data/coaches")
df_entry_gender.write.mode("overwrite").parquet("dbfs:/mnt/tokyo-olympics-data/transformed_data/entry_gender")
df_medals.write.mode("overwrite").parquet("dbfs:/mnt/tokyo-olympics-data/transformed_data/medals")
df_teams.write.mode("overwrite").parquet("dbfs:/mnt/tokyo-olympics-data/transformed_data/teams")