In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Obtain the Spark session for this job, reusing an already running one if present.
spark = (
    SparkSession.builder
    .appName("OlympicResults_Dim")
    .getOrCreate()
)

25/04/13 06:01:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Location of the Olympic results Parquet dataset in the bronze area of HDFS.
hdfs_bronze_path ="hdfs:///data/bronze/Olympic_Results_parquet"
# Load the dataset; later cells refer to this frame as `df`.
df = spark.read.parquet(hdfs_bronze_path)



In [4]:
# Row count of the results frame before any filtering.
df.count()

                                                                                

20782

In [5]:
# Keep only rows whose result_id is made up entirely of digits.
# NOTE(review): in the recorded run this reduced the frame from 20782 to 7394 rows —
# confirm that losing this many rows is expected.
df = df.filter(
    col("result_id").rlike("^[0-9]+$")
)

In [6]:
# Row count after the numeric result_id filter.
df.count()

                                                                                

7394

In [7]:
from IPython.display import display, HTML
# Inject CSS so <pre> output keeps its whitespace as-is instead of wrapping,
# which keeps wide show() tables aligned in the notebook.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [8]:
# Display the active SparkSession object as the cell output.
spark

In [9]:
# Preview the first 10 rows of the filtered results.
df.show(10)

+---------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|result_id|         event_title|             edition|edition_id|               sport|           sport_url|         result_date|     result_location| result_participants|       result_format|       result_detail|  result_description|
+---------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    30359|Super-Heavyweight...|2004 Summer Olympics|        26|       Weightlifting|/editions/26/spor...|25 August 2004 — ...|Olympiako Gymnast...|17 from 15 countries|Total of best lif...|                  na|Not so much a com...|
|     1626|Giant Slalom, Women1|1998 Winter Olympics|        46|    

In [10]:
# Print column names and types of the results frame.
df.printSchema()

root
 |-- result_id: string (nullable = true)
 |-- event_title: string (nullable = true)
 |-- edition: string (nullable = true)
 |-- edition_id: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- sport_url: string (nullable = true)
 |-- result_date: string (nullable = true)
 |-- result_location: string (nullable = true)
 |-- result_participants: string (nullable = true)
 |-- result_format: string (nullable = true)
 |-- result_detail: string (nullable = true)
 |-- result_description: string (nullable = true)



In [11]:
# Re-point the path variable at the athlete/event results dataset
# (this overwrites the path that was used to load `df`).
hdfs_bronze_path ="hdfs:///data/bronze/Olympic_Athlete_Event_Results_parquet"
# Load it as `df2`; it is the source of the isTeamSport and event columns used below.
df2 = spark.read.parquet(hdfs_bronze_path)

In [12]:
# Print column names and types of the athlete/event frame.
df2.printSchema()

root
 |-- edition: string (nullable = true)
 |-- edition_id: string (nullable = true)
 |-- country_noc: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- event: string (nullable = true)
 |-- result_id: string (nullable = true)
 |-- athlete: string (nullable = true)
 |-- athlete_id: string (nullable = true)
 |-- pos: string (nullable = true)
 |-- medal: string (nullable = true)
 |-- isTeamSport: string (nullable = true)



In [13]:
# One row per distinct (result_id, isTeamSport, event) combination.
# result_id is exposed as df_result_id so it cannot be confused with df's own
# result_id column when the two frames are joined.
df2_renamed = (
    df2
    .select(
        col("result_id").alias("df_result_id"),
        "isTeamSport",
        "event",
    )
    .distinct()
)

In [14]:
# List every df_result_id that still appears on more than one row, i.e. ids
# with more than one distinct (isTeamSport, event) combination.
(
    df2_renamed
    .groupBy("df_result_id")
    .count()
    .filter("count > 1")
    .show()
)


[Stage 9:>                                                          (0 + 4) / 4]

+------------+-----+
|df_result_id|count|
+------------+-----+
|    19019671|    2|
+------------+-----+



                                                                                

In [15]:
# Reduce the lookup to a single row per df_result_id so the left join cannot multiply result rows.
# NOTE(review): for an id with conflicting isTeamSport/event values (one was reported by the
# duplicate check) the surviving row is not chosen explicitly — confirm which one should win.
df2_renamed = df2_renamed.dropDuplicates(["df_result_id"])

In [16]:

# Attach isTeamSport and event to each result row by matching result ids.
# A left join keeps result rows even when no athlete/event record matches.
df_final = df.join(
    df2_renamed,
    on=df["result_id"] == df2_renamed["df_result_id"],
    how="left",
)

In [17]:
# Row count after the join; it should equal the count of `df` if the join added no rows.
df_final.count()

7394

In [18]:
# Count missing values per column: each isNull flag is cast to 0/1 and summed.
# `sum` resolves to the Spark aggregate pulled in by the star import of
# pyspark.sql.functions, not to Python's builtin.
null_counts = df_final.select(
    [
        sum(col(name).isNull().cast("int")).alias(name)
        for name in df_final.columns
    ]
)

# One output row: the number of nulls found in every column.
null_counts.show()

[Stage 21:>                                                         (0 + 3) / 3]

+---------+-----------+-------+----------+-----+---------+-----------+---------------+-------------------+-------------+-------------+------------------+------------+-----------+-----+
|result_id|event_title|edition|edition_id|sport|sport_url|result_date|result_location|result_participants|result_format|result_detail|result_description|df_result_id|isTeamSport|event|
+---------+-----------+-------+----------+-----+---------+-----------+---------------+-------------------+-------------+-------------+------------------+------------+-----------+-----+
|        0|          0|      0|         0|    0|        0|          0|              1|                  0|            0|            0|                 0|           0|          0|    0|
+---------+-----------+-------+----------+-----+---------+-----------+---------------+-------------------+-------------+-------------+------------------+------------+-----------+-----+



                                                                                

In [19]:
# Remove the edition columns and the join-key copy brought in from df2_renamed.
df_final = df_final.drop(
    "edition",
    "edition_id",
    "df_result_id",
)

In [20]:
# Discard rows without a result_location (the null-count check reported one such row).
df_final = df_final.dropna(subset=["result_location"])

In [21]:
# Confirm which columns remain after the drops.
df_final.printSchema()

root
 |-- result_id: string (nullable = true)
 |-- event_title: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- sport_url: string (nullable = true)
 |-- result_date: string (nullable = true)
 |-- result_location: string (nullable = true)
 |-- result_participants: string (nullable = true)
 |-- result_format: string (nullable = true)
 |-- result_detail: string (nullable = true)
 |-- result_description: string (nullable = true)
 |-- isTeamSport: string (nullable = true)
 |-- event: string (nullable = true)



In [22]:
# Inspect 20 rows with full, untruncated cell values.
df_final.show(20,truncate=False)

[Stage 27:>                                                         (0 + 1) / 1]

+---------+----------------------------------------------------+--------------------+-----------------------+-------------------------------------+---------------------------------------------------------------------------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [23]:
# Keep a second reference to the current frame.
# NOTE(review): looks like a manual checkpoint for re-running later cells — confirm before removing.
dd=df_final

In [24]:
# Point df_final back at the frame saved in `dd`.
df_final=dd

In [25]:
# Matches "<1-2 digit day> <word> <4 digit year>", e.g. "25 August 2004".
date_pattern = r"(\d{1,2} \w+ \d{4})"

# Replace the free-text result_date with a real date: take the first substring
# that matches the pattern, then parse it as day / full month name / year.
df_final = df_final.withColumn(
    "result_date",
    to_date(
        regexp_extract(col("result_date"), date_pattern, 1),
        "d MMMM yyyy",
    ),
)

In [26]:
# Build the final column set: each entry is (source column, target type, output name).
# result_format, result_detail and result_description are not carried over.
df_final = df_final.select(
    *[
        col(source).cast(dtype).alias(label)
        for source, dtype, label in [
            ("result_id", "int", "Result_ID"),
            ("event_title", "string", "Event Title"),
            ("event", "string", "Event Name"),
            ("sport", "string", "Sport"),
            ("sport_url", "string", "Sport URL"),
            ("result_date", "date", "Result Date"),
            ("result_location", "string", "Result Location"),
            ("result_participants", "string", "Result Participants"),
            ("isTeamSport", "boolean", "Is Team Sport"),
        ]
    ]
)

# Verify the resulting types and look at a few rows.
df_final.printSchema()
df_final.show(5)

root
 |-- Result_ID: integer (nullable = true)
 |-- Event Title: string (nullable = true)
 |-- Event Name: string (nullable = true)
 |-- Sport: string (nullable = true)
 |-- Sport URL: string (nullable = true)
 |-- Result Date: date (nullable = true)
 |-- Result Location: string (nullable = true)
 |-- Result Participants: string (nullable = true)
 |-- Is Team Sport: boolean (nullable = true)



                                                                                

+---------+--------------------+--------------------+-------------+--------------------+-----------+--------------------+--------------------+-------------+
|Result_ID|         Event Title|          Event Name|        Sport|           Sport URL|Result Date|     Result Location| Result Participants|Is Team Sport|
+---------+--------------------+--------------------+-------------+--------------------+-----------+--------------------+--------------------+-------------+
|    30359|Super-Heavyweight...|Super-Heavyweight...|Weightlifting|/editions/26/spor...| 2004-08-25|Olympiako Gymnast...|17 from 15 countries|        false|
|     1626|Giant Slalom, Women1| Giant Slalom, Women| Snowboarding|/editions/46/spor...| 1998-02-09|Mt. Yakebitai, Sh...|31 from 14 countries|        false|
|       76|        Singles, Men|        Singles, Men|         Luge|/editions/40/spor...| 1976-02-07|Kunsteis-Bob- und...|43 from 15 countries|        false|
|      962|   1,500 metres, Men|   1,500 metres, Men|Speed

In [27]:
# Discard rows whose "Result Date" is null, i.e. where no date could be extracted and parsed.
df_final = df_final.dropna(subset=["Result Date"])

In [28]:
# Sanity check: list any rows that still have a null "Result Date" (expected to be none).
df_final.filter(col("Result Date").isNull()).show()

                                                                                

+---------+-----------+----------+-----+---------+-----------+---------------+-------------------+-------------+
|Result_ID|Event Title|Event Name|Sport|Sport URL|Result Date|Result Location|Result Participants|Is Team Sport|
+---------+-----------+----------+-----+---------+-----------+---------------+-------------------+-------------+
+---------+-----------+----------+-----+---------+-----------+---------------+-------------------+-------------+



In [30]:
# Persist the cleaned result dimension to the silver area, replacing the output of any previous run.
# The frame written is `df_final` (joined, date-parsed, typed, renamed and null-filtered).
# Writing `df` instead would store the bronze rows with only the numeric-id filter applied
# and silently discard every transformation in this notebook.
# NOTE(review): older Spark releases reject Parquet column names that contain spaces
# (e.g. "Event Title") — confirm the cluster's Spark version accepts them.
df_final.write.mode("overwrite").parquet("hdfs:///data/silver/OlympicResults_parquet")

                                                                                