In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Athlete_Dim")
    .getOrCreate()
)

In [25]:
# Bronze-layer input: the athlete bio dataset stored as Parquet on HDFS.
hdfs_bronze_path = "hdfs:///data/bronze/Olympic_Athlete_Bio_parquet"
df = spark.read.format("parquet").load(hdfs_bronze_path)

In [26]:
# Preview the first three rows of the freshly loaded frame.
df.show(n=3)

+----------+--------------+----+----------------+------+------+------------------+-----------+--------------------+-------------+
|athlete_id|          name| sex|            born|height|weight|           country|country_noc|         description|special_notes|
+----------+--------------+----+----------------+------+------+------------------+-----------+--------------------+-------------+
|    109014|Lee Jeong-Seon|Male|15 February 1979|   177|    80| Republic of Korea|        KOR|                NULL|         NULL|
|    110132|Viktor Pfeifer|Male|     16 May 1987|   182|    62|           Austria|        AUT|                NULL|         NULL|
|     31750|    Klaus Weiß|Male|20 November 1944|   192|    92|      East Germany|        GDR|Goalie Klaus Weiß...|         NULL|
+----------+--------------+----+----------------+------+------+------------------+-----------+--------------------+-------------+
only showing top 3 rows



In [27]:
# Print the column names, types and nullability of the loaded frame.
df.printSchema()

root
 |-- athlete_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- born: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_noc: string (nullable = true)
 |-- description: string (nullable = true)
 |-- special_notes: string (nullable = true)



In [28]:
# The identifier and body measurements are read as strings; convert each of
# them to an integer column, keeping the column names unchanged.
for numeric_column in ("athlete_id", "height", "weight"):
    df = df.withColumn(numeric_column, col(numeric_column).cast("int"))

In [29]:
# Encode sex as an integer flag: 1 for "Male", 0 for everything else.
# Note that "everything else" includes null and any unexpected label, because
# only an exact match on "Male" takes the first branch.
is_male = col("sex") == "Male"
df = df.withColumn("sex", when(is_male, 1).otherwise(0))

In [30]:
# Number of rows in the frame at this point (count() is an action and runs a Spark job).
df.count()

155861

In [35]:
# Convert the Spark frame to pandas and report the number of nulls per column.
# NOTE(review): toPandas() materialises every row on the driver just to count
# nulls; consider a Spark-side aggregation if the dataset grows.
# NOTE(review): this cell carries execution count 35 although it sits above the
# cells numbered 32-34, so a top-to-bottom run evaluates it before the column
# drop and imputation below - confirm the intended position.
sathlete = df.toPandas()
sathlete.isna().sum()

athlete_id    0
name          0
sex           0
born          0
height        0
weight        0
country       0
dtype: int64

In [32]:
# Columns that are not carried into the athlete dimension.
unused_columns = ["country_noc", "description", "special_notes"]
df = df.drop(*unused_columns)

In [33]:
# Discard rows that have no birth date value.
df = df.dropna(subset=["born"])

In [34]:
# Mean-impute missing height and weight.
#
# Both means are computed in ONE aggregation (a single Spark job instead of two
# separate collect() calls). They are rounded to whole numbers inside Spark
# because height and weight are integer columns: fillna() casts the replacement
# value to the column's type, which would otherwise silently truncate the
# fractional part of the mean.
# `round` below is pyspark.sql.functions.round (brought in by the star import at
# the top of the notebook), not the Python builtin.
mean_row = df.select(
    round(avg(col("height"))).cast("int").alias("avg_height"),
    round(avg(col("weight"))).cast("int").alias("avg_weight"),
).first()
avg_height = mean_row["avg_height"]
avg_weight = mean_row["avg_weight"]

# avg() yields None when a column has no non-null values at all; leave such a
# column untouched rather than handing fillna() a null replacement.
fill_values = {
    column_name: mean_value
    for column_name, mean_value in (("height", avg_height), ("weight", avg_weight))
    if mean_value is not None
}
if fill_values:
    df = df.fillna(fill_values)

In [36]:
# Parse the textual birth date (e.g. "15 February 1979") into a date column.
#
# The earlier null filter on `born` runs on the raw strings, before parsing.
# With ANSI mode off, to_date() returns null for any string that does not match
# the pattern (for example a partial date), so parsing can reintroduce nulls.
# Re-apply the filter here so no row with a missing birth date reaches the output.
parsed_born = to_date(col("born"), "d MMMM yyyy")
df = df.withColumn("born", parsed_born).na.drop(subset=["born"])

In [37]:
# Preview the first three rows after cleaning and type conversion.
df.show(n=3)

+----------+--------------+---+----------+------+------+------------------+
|athlete_id|          name|sex|      born|height|weight|           country|
+----------+--------------+---+----------+------+------+------------------+
|    109014|Lee Jeong-Seon|  1|1979-02-15|   177|    80| Republic of Korea|
|    110132|Viktor Pfeifer|  1|1987-05-16|   182|    62|           Austria|
|     31750|    Klaus Weiß|  1|1944-11-20|   192|    92|      East Germany|
+----------+--------------+---+----------+------+------+------------------+
only showing top 3 rows



In [38]:
# Final projection for the athlete dimension: (source column, output name),
# listed in output order.
output_columns = [
    ("athlete_id", "Athlete ID"),
    ("name", "Name"),
    ("sex", "Sex"),
    ("born", "Born"),
    ("height", "Height"),
    ("weight", "Weight"),
    ("country", "Nationality"),
]
df_final = df.select(
    [col(source_name).alias(output_name) for source_name, output_name in output_columns]
)

In [39]:
# Persist the athlete dimension as Parquet, replacing any previous output.
# NOTE(review): "sliver" in the target path may be a typo for "silver" - confirm
# against the directory that downstream readers actually use before renaming.
athlete_output_path = "hdfs:///data/sliver/Athlete_parquet"
df_final.write.parquet(athlete_output_path, mode="overwrite")

                                                                                