In [58]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_replace, upper, initcap

In [59]:
# Create the Spark session (getOrCreate returns an existing session if there is one)
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [60]:
# Load the Olympic results from the staging CSV file on HDFS.
# The first row is treated as the header and column types are inferred.
results_df = spark.read.csv(
    path="hdfs://namenode:9000/olympic_game/staging/olympic_results.csv",
    header=True,
    inferSchema=True,
)

In [61]:
# Initial exploration: print the schema of the raw data as loaded
# (before any cleaning is applied).
print("Schema before cleaning:")
results_df.printSchema()

Schema before cleaning:
root
 |-- discipline_title: string (nullable = true)
 |-- event_title: string (nullable = true)
 |-- slug_game: string (nullable = true)
 |-- participant_type: string (nullable = true)
 |-- medal_type: string (nullable = true)
 |-- athletes: string (nullable = true)
 |-- rank_equal: string (nullable = true)
 |-- rank_position: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- country_3_letter_code: string (nullable = true)
 |-- athlete_url: string (nullable = true)
 |-- athlete_full_name: string (nullable = true)
 |-- value_unit: string (nullable = true)
 |-- value_type: string (nullable = true)



In [62]:
# Preview the first 5 raw rows, without truncating long cell values
results_df.show(n=5, truncate=False)

+----------------+-------------+------------+----------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+-------------+------------+---------------------+-----------+-----------------+----------+----------+
|discipline_title|event_title  |slug_game   |participant_type|medal_type|athletes                                                                                                                                                  |rank_equal|rank_position|country_name |country_code|country_3_letter_code|athlete_url|athlete_full_name|value_unit|value_type|
+----------------+-------------+------------+----------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------+-------------+------------+----------

In [63]:
# Data cleaning
# Drop the columns that are not needed
columns_to_drop = [
    "value_unit",
    "value_type",
    "athlete_url",
    "country_3_letter_code",
    "country_code",
]
results_cleaned_df = results_df.drop(*columns_to_drop)

In [64]:
# Split the 'athletes' column into two separate columns, stripping the URLs.
# The column is referenced by name (a plain string) so this cell does not
# depend on `col`, which the notebook's import cell does not bring into scope.
# NOTE(review): only the first two athlete slugs are extracted; rows listing
# more than two athletes lose the remaining ones — confirm this is intended.
athlete_url_prefix = "https://olympics.com/en/athletes/"

# Splitting on the URL prefix leaves each athlete slug at the start of
# elements 1, 2, ...; the slug itself ends at the next single quote.
athlete_url_parts = split("athletes", athlete_url_prefix)

results_cleaned_df = (
    results_cleaned_df
    .withColumn("athlete_1", split(athlete_url_parts[1], "'")[0])
    .withColumn("athlete_2", split(athlete_url_parts[2], "'")[0])
)

In [65]:
# The raw 'athletes' column is no longer needed once it has been split; remove it
obsolete_column = "athletes"
results_cleaned_df = results_cleaned_df.drop(obsolete_column)

In [66]:
# Display the cleaned data (first 5 rows, values not truncated)
print("Cleaned data:")
results_cleaned_df.show(n=5, truncate=False)

Cleaned data:
+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+-----------------+--------------------+------------------+
|discipline_title|event_title  |slug_game   |participant_type|medal_type|rank_equal|rank_position|country_name |athlete_full_name|athlete_1           |athlete_2         |
+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+-----------------+--------------------+------------------+
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |GOLD      |False     |1            |Italy        |NULL             |stefania-constantini|amos-mosaner      |
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |SILVER    |False     |2            |Norway       |NULL             |kristin-skaslien    |magnus-nedregotten|
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |BRONZE    |False     |3            |Sweden       |NULL             |a

In [67]:
# Upper-case the athlete names and replace hyphens with spaces.
# Both athlete columns go through the exact same transformation so that
# their formatting stays consistent.
for athlete_column in ("athlete_1", "athlete_2"):
    results_cleaned_df = results_cleaned_df.withColumn(
        athlete_column,
        upper(regexp_replace(athlete_column, "-", " ")),
    )

In [47]:
# Display the cleaned data after the athlete names have been normalised
print("Cleaned data:")
results_cleaned_df.show(n=5, truncate=False)

Cleaned data:
+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+-----------------+--------------------+------------------+
|discipline_title|event_title  |slug_game   |participant_type|medal_type|rank_equal|rank_position|country_name |athlete_full_name|athlete_1           |athlete_2         |
+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+-----------------+--------------------+------------------+
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |GOLD      |False     |1            |Italy        |NULL             |STEFANIA CONSTANTINI|AMOS MOSANER      |
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |SILVER    |False     |2            |Norway       |NULL             |KRISTIN SKASLIEN    |MAGNUS NEDREGOTTEN|
|Curling         |Mixed Doubles|beijing-2022|GameTeam        |BRONZE    |False     |3            |Sweden       |NULL             |A