In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Obtain the SparkSession for this notebook. getOrCreate() hands back the session
# that is already running when one exists rather than starting a second one.
session_builder = SparkSession.builder.appName("Fact_Table")
spark = session_builder.getOrCreate()

25/03/25 07:36:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Bronze-layer athlete event results, stored as Parquet on HDFS.
hdfs_bronze_path = "hdfs:///data/bronze/Olympic_Athlete_Event_Results_parquet"
df = spark.read.format("parquet").load(hdfs_bronze_path)

                                                                                

In [4]:
from IPython.display import display, HTML

# Inject a CSS rule so <pre> output keeps its whitespace and is not wrapped,
# which keeps wide DataFrame.show() tables on a single line per row.
pre_no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(pre_no_wrap_css))

In [5]:
# Echo the active SparkSession object as this cell's output.
spark

In [6]:
# Preview the first five rows of the bronze results frame.
df.show(n=5)

                                                                                

+--------------------+----------+-----------+---------+---------------+---------+---------------+----------+---------+-----+-----------+
|             edition|edition_id|country_noc|    sport|          event|result_id|        athlete|athlete_id|      pos|medal|isTeamSport|
+--------------------+----------+-----------+---------+---------------+---------+---------------+----------+---------+-----+-----------+
|1908 Summer Olympics|         5|        ANZ|Athletics|100 metres, Men|    56265|Ernest Hutcheon|     64710|      DNS| NULL|      False|
|1908 Summer Olympics|         5|        ANZ|Athletics|400 metres, Men|    56313|   Henry Murray|     64756|      DNS| NULL|      False|
|1908 Summer Olympics|         5|        ANZ|Athletics|800 metres, Men|    56338|  Harvey Sutton|     64808|3 h8 r1/2| NULL|      False|
|1908 Summer Olympics|         5|        ANZ|Athletics|800 metres, Men|    56338|    Guy Haskins|    922519|      DNS| NULL|      False|
|1908 Summer Olympics|         5|        

In [7]:
# Print the column names and types of the bronze results frame.
df.printSchema()

root
 |-- edition: string (nullable = true)
 |-- edition_id: string (nullable = true)
 |-- country_noc: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- event: string (nullable = true)
 |-- result_id: string (nullable = true)
 |-- athlete: string (nullable = true)
 |-- athlete_id: string (nullable = true)
 |-- pos: string (nullable = true)
 |-- medal: string (nullable = true)
 |-- isTeamSport: string (nullable = true)



In [6]:
# Bronze-layer medal tally, stored as Parquet on HDFS.
Olympic_Games_Medal_Tally_path = "hdfs:///data/bronze/Olympic_Games_Medal_Tally_parquet"
df2 = spark.read.format("parquet").load(Olympic_Games_Medal_Tally_path)

In [14]:
# Print the column names and types of the medal tally frame.
df2.printSchema()

root
 |-- edition: string (nullable = true)
 |-- edition_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_noc: string (nullable = true)
 |-- gold: string (nullable = true)
 |-- silver: string (nullable = true)
 |-- bronze: string (nullable = true)
 |-- total: string (nullable = true)



In [7]:
# Athlete-level columns taken from the event results frame.
athlete_results = df.select(
    "edition_id", "country_noc", "result_id", "athlete_id", "pos", "medal"
)

# Medal tally columns. The two join keys are renamed so they do not collide with
# the identically named columns coming from the results frame.
medal_tally = df2.select(
    col("edition_id").alias("ed_id"),
    col("country_noc").alias("noc"),
    "gold", "silver", "bronze", "total",
)

# Match each result to the tally row of the same country and edition.
# NOTE(review): an inner join keeps only results that have a matching tally row;
# confirm that dropping unmatched country/edition pairs is intended.
join_condition = (col("country_noc") == col("noc")) & (col("edition_id") == col("ed_id"))
selected_df = athlete_results.join(medal_tally, on=join_condition, how="inner")

selected_df.show()

                                                                                

+----------+-----------+---------+----------+----------+-----+-----+---+----+------+------+-----+
|edition_id|country_noc|result_id|athlete_id|       pos|medal|ed_id|noc|gold|silver|bronze|total|
+----------+-----------+---------+----------+----------+-----+-----+---+----+------+------+-----+
|         5|        ANZ|    56265|     64710|       DNS| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56313|     64756|       DNS| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56338|     64808| 3 h8 r1/2| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56338|    922519|       DNS| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56338|     64735|       DNS| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56338|     64756|       DNS| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        ANZ|    56349|     64735| 5 h2 r1/2| NULL|    5|ANZ|   1|     2|     2|    5|
|         5|        

In [13]:
from pyspark.sql.functions import col

# Source column -> presentation label, listed in output order. The helper join
# keys (ed_id, noc) are not listed, so they are dropped from the result.
fact_column_labels = {
    "edition_id": "Edition ID",
    "country_noc": "Country NOC",
    "result_id": "Result ID",
    "athlete_id": "Athlete ID",
    "pos": "Position",
    "medal": "Medal Athlete",
    "gold": "Gold Medals",
    "silver": "Silver Medals",
    "bronze": "Bronze Medals",
    "total": "Total Medals",
}

renamed_df = selected_df.select(
    [col(source).alias(label) for source, label in fact_column_labels.items()]
)

# Replace NULL values in the athlete medal column with the literal 'No Medal'.
df_final = renamed_df.fillna({"Medal Athlete": "No Medal"})

df_final.show()

+----------+-----------+---------+----------+----------+-------------+-----------+-------------+-------------+------------+
|Edition ID|Country NOC|Result ID|Athlete ID|  Position|Medal Athlete|Gold Medals|Silver Medals|Bronze Medals|Total Medals|
+----------+-----------+---------+----------+----------+-------------+-----------+-------------+-------------+------------+
|         5|        ANZ|    56265|     64710|       DNS|     No Medal|          1|            2|            2|           5|
|         5|        ANZ|    56313|     64756|       DNS|     No Medal|          1|            2|            2|           5|
|         5|        ANZ|    56338|     64808| 3 h8 r1/2|     No Medal|          1|            2|            2|           5|
|         5|        ANZ|    56338|    922519|       DNS|     No Medal|          1|            2|            2|           5|
|         5|        ANZ|    56338|     64735|       DNS|     No Medal|          1|            2|            2|           5|
|       

In [14]:
# Number of rows in the fact table frame.
df_final.count()

278932

In [15]:
# Import Spark's aggregate under an explicit name. A bare `sum` resolves to the
# Spark function only because of the wildcard import at the top of the notebook;
# without it the name is Python's builtin, which cannot aggregate Column expressions.
from pyspark.sql.functions import col, sum as spark_sum

# One output column per fact-table column, holding the number of NULLs found in it:
# isNull() yields a boolean, the cast turns it into 0/1, and the sum counts the 1s.
null_counts = df_final.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df_final.columns]
)

# Show the result
null_counts.show()

+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+
|Edition ID|Country NOC|Result ID|Athlete ID|Position|Medal Athlete|Gold Medals|Silver Medals|Bronze Medals|Total Medals|
+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+
|         0|          0|        0|         0|       0|            0|          0|            0|            0|           0|
+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+



In [16]:
# Persist the curated fact table (`df_final`), not the raw bronze results frame `df`,
# so that the join, column renaming and NULL handling are what ends up in storage.
# mode("overwrite") replaces any dataset already present at the target path.
# NOTE(review): "sliver" in the path looks like a typo for "silver"; it is left
# unchanged because downstream readers may depend on this location — confirm.
# NOTE(review): df_final's column labels contain spaces; Parquet writes are expected
# to accept these on Spark 3.3+ — confirm against the cluster's Spark version.
df_final.write.mode("overwrite").parquet("hdfs:///data/sliver/Fact_Table_parquet")

                                                                                