In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.getOrCreate()

In [82]:
datahouse = spark.read.csv("hdfs://namenode:9000/data/staging", inferSchema=True,header=True)

In [83]:
# Vérifier la taille de votre DataFrame
data_size = datahouse.count()

# Afficher la taille de votre DataFrame
print("Taille de la data:", data_size)


Taille de la data: 189960


In [84]:
datahouse.show(5)

+--------------------+--------------------+--------------------+----------------+------------------+--------------+----+
|         athlete_url|   athlete_full_name|games_participations|      first_game|athlete_year_birth|athlete_medals| bio|
+--------------------+--------------------+--------------------+----------------+------------------+--------------+----+
|https://olympics....|Cooper WOODS-TOPA...|                   1|    Beijing 2022|              2000|          NULL|NULL|
|https://olympics....|      Felix ELOFSSON|                   2|PyeongChang 2018|              1995|          NULL|NULL|
|https://olympics....|       Dylan WALCZYK|                   1|    Beijing 2022|              1993|          NULL|NULL|
|https://olympics....|       Olli PENTTALA|                   1|    Beijing 2022|              1995|          NULL|NULL|
|https://olympics....|    Dmitriy REIKHERD|                   1|    Beijing 2022|              1989|          NULL|NULL|
+--------------------+----------

In [85]:
from pyspark.sql.functions import sum as spark_sum, when, col

# Créer une expression de somme pour chaque colonne
sum_expressions = [spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in datahouse.columns]

# Calculer le nombre total de valeurs manquantes dans chaque colonne
missing_counts = datahouse.agg(*sum_expressions)

# Afficher le nombre de valeurs manquantes dans chaque colonne
missing_counts.show()


+-----------+-----------------+--------------------+----------+------------------+--------------+------+
|athlete_url|athlete_full_name|games_participations|first_game|athlete_year_birth|athlete_medals|   bio|
+-----------+-----------------+--------------------+----------+------------------+--------------+------+
|          0|            94041|               96547|     99234|            104133|        164540|166953|
+-----------+-----------------+--------------------+----------+------------------+--------------+------+



In [86]:
# Supprimer les colonnes 'athlete_medals' et 'bio'
datahouse = datahouse.drop('athlete_medals', 'bio')

# Afficher le schéma mis à jour
datahouse.printSchema()

# Afficher les premières lignes pour vérification
datahouse.show()


root
 |-- athlete_url: string (nullable = true)
 |-- athlete_full_name: string (nullable = true)
 |-- games_participations: string (nullable = true)
 |-- first_game: string (nullable = true)
 |-- athlete_year_birth: string (nullable = true)

+--------------------+--------------------+--------------------+----------------+------------------+
|         athlete_url|   athlete_full_name|games_participations|      first_game|athlete_year_birth|
+--------------------+--------------------+--------------------+----------------+------------------+
|https://olympics....|Cooper WOODS-TOPA...|                   1|    Beijing 2022|              2000|
|https://olympics....|      Felix ELOFSSON|                   2|PyeongChang 2018|              1995|
|https://olympics....|       Dylan WALCZYK|                   1|    Beijing 2022|              1993|
|https://olympics....|       Olli PENTTALA|                   1|    Beijing 2022|              1995|
|https://olympics....|    Dmitriy REIKHERD|        

In [87]:
# Supprimer les lignes contenant des valeurs manquantes
datahouse = datahouse.na.drop()

# Afficher les premières lignes pour vérification
datahouse.show()


+--------------------+--------------------+--------------------+--------------------+------------------+
|         athlete_url|   athlete_full_name|games_participations|          first_game|athlete_year_birth|
+--------------------+--------------------+--------------------+--------------------+------------------+
|https://olympics....|Cooper WOODS-TOPA...|                   1|        Beijing 2022|              2000|
|https://olympics....|      Felix ELOFSSON|                   2|    PyeongChang 2018|              1995|
|https://olympics....|       Dylan WALCZYK|                   1|        Beijing 2022|              1993|
|https://olympics....|       Olli PENTTALA|                   1|        Beijing 2022|              1995|
|https://olympics....|    Dmitriy REIKHERD|                   1|        Beijing 2022|              1989|
|https://olympics....|         Matt GRAHAM|                   3|          Sochi 2014|              1994|
|https://olympics....|     Ikuma HORISHIMA|            

In [88]:
# Vérifier la taille de votre DataFrame après suppression des valeurs manquantes
data_size_after_cleaning = datahouse.count()

# Afficher la taille de votre DataFrame après suppression des valeurs manquantes
print("Taille de la data après suppression des valeurs manquantes:", data_size_after_cleaning)


Taille de la data après suppression des valeurs manquantes: 85808


In [89]:
from pyspark.sql.functions import col

# Formater la colonne athlete_url en tant que string
datahouse = datahouse.withColumn("athlete_url", col("athlete_url").cast("string"))


In [90]:
datahouse = datahouse.withColumn("athlete_full_name", col("athlete_full_name").cast("string"))


In [91]:
datahouse = datahouse.withColumn("games_participations", col("games_participations").cast("integer"))


In [92]:
from pyspark.sql.functions import col, split

# Diviser la colonne first_game en deux colonnes distinctes : place et année
datahouse = datahouse.withColumn("place", split(col("first_game"), " ")[0])
datahouse = datahouse.withColumn("year", split(col("first_game"), " ")[1])

# Afficher le DataFrame pour vérification
datahouse.show()


+--------------------+--------------------+--------------------+--------------------+------------------+-----------+--------+
|         athlete_url|   athlete_full_name|games_participations|          first_game|athlete_year_birth|      place|    year|
+--------------------+--------------------+--------------------+--------------------+------------------+-----------+--------+
|https://olympics....|Cooper WOODS-TOPA...|                   1|        Beijing 2022|              2000|    Beijing|    2022|
|https://olympics....|      Felix ELOFSSON|                   2|    PyeongChang 2018|              1995|PyeongChang|    2018|
|https://olympics....|       Dylan WALCZYK|                   1|        Beijing 2022|              1993|    Beijing|    2022|
|https://olympics....|       Olli PENTTALA|                   1|        Beijing 2022|              1995|    Beijing|    2022|
|https://olympics....|    Dmitriy REIKHERD|                   1|        Beijing 2022|              1989|    Beijing|  

In [93]:
datahouse = datahouse.withColumn("athlete_year_birth", col("athlete_year_birth").cast("integer"))


In [94]:
from pyspark.sql.functions import col, regexp_extract, when, to_date

# Filtrer les lignes où la colonne athlete_year_birth ne contient pas d'année valide
datahouse = datahouse.withColumn("is_valid_year", regexp_extract(col("athlete_year_birth"), "\\d{4}", 0).isNotNull())

# Remplacer les valeurs inappropriées par null
datahouse = datahouse.withColumn("athlete_year_birth", when(col("is_valid_year"), col("athlete_year_birth")).otherwise(None))

# Convertir la colonne en date
#datahouse = datahouse.withColumn("athlete_year_birth", to_date("athlete_year_birth", "yyyy"))

# Supprimer la colonne temporaire
datahouse = datahouse.drop("is_valid_year")

# Afficher le DataFrame pour vérification
datahouse.show()


+--------------------+--------------------+--------------------+--------------------+------------------+-----------+--------+
|         athlete_url|   athlete_full_name|games_participations|          first_game|athlete_year_birth|      place|    year|
+--------------------+--------------------+--------------------+--------------------+------------------+-----------+--------+
|https://olympics....|Cooper WOODS-TOPA...|                   1|        Beijing 2022|              2000|    Beijing|    2022|
|https://olympics....|      Felix ELOFSSON|                   2|    PyeongChang 2018|              1995|PyeongChang|    2018|
|https://olympics....|       Dylan WALCZYK|                   1|        Beijing 2022|              1993|    Beijing|    2022|
|https://olympics....|       Olli PENTTALA|                   1|        Beijing 2022|              1995|    Beijing|    2022|
|https://olympics....|    Dmitriy REIKHERD|                   1|        Beijing 2022|              1989|    Beijing|  

In [95]:
datahouse = datahouse.na.drop()
datahouse.show()

+--------------------+--------------------+--------------------+----------------+------------------+-----------+----+
|         athlete_url|   athlete_full_name|games_participations|      first_game|athlete_year_birth|      place|year|
+--------------------+--------------------+--------------------+----------------+------------------+-----------+----+
|https://olympics....|Cooper WOODS-TOPA...|                   1|    Beijing 2022|              2000|    Beijing|2022|
|https://olympics....|      Felix ELOFSSON|                   2|PyeongChang 2018|              1995|PyeongChang|2018|
|https://olympics....|       Dylan WALCZYK|                   1|    Beijing 2022|              1993|    Beijing|2022|
|https://olympics....|       Olli PENTTALA|                   1|    Beijing 2022|              1995|    Beijing|2022|
|https://olympics....|    Dmitriy REIKHERD|                   1|    Beijing 2022|              1989|    Beijing|2022|
|https://olympics....|         Matt GRAHAM|             

In [97]:
from pyspark.sql.functions import regexp_extract, when

# Définir une expression régulière pour extraire les années au format yyyy
year_regex = r'\b\d{4}\b'

# Appliquer l'expression régulière pour extraire les années de la colonne 'year'
cleaned_year = when(datahouse['year'].rlike(year_regex), datahouse['year']).otherwise(None)

# Ajouter une nouvelle colonne 'cleaned_year' avec les années nettoyées
datahouse = datahouse.withColumn('cleaned_year', cleaned_year)

# Supprimer la colonne 'year' originale
datahouse = datahouse.drop('year')

# Renommer la colonne nettoyée en 'year'
datahouse = datahouse.withColumnRenamed('cleaned_year', 'year')

# Afficher le DataFrame avec la colonne 'year' nettoyée
datahouse.show()


+--------------------+--------------------+--------------------+----------------+------------------+-----------+----+
|         athlete_url|   athlete_full_name|games_participations|      first_game|athlete_year_birth|      place|year|
+--------------------+--------------------+--------------------+----------------+------------------+-----------+----+
|https://olympics....|Cooper WOODS-TOPA...|                   1|    Beijing 2022|              2000|    Beijing|2022|
|https://olympics....|      Felix ELOFSSON|                   2|PyeongChang 2018|              1995|PyeongChang|2018|
|https://olympics....|       Dylan WALCZYK|                   1|    Beijing 2022|              1993|    Beijing|2022|
|https://olympics....|       Olli PENTTALA|                   1|    Beijing 2022|              1995|    Beijing|2022|
|https://olympics....|    Dmitriy REIKHERD|                   1|    Beijing 2022|              1989|    Beijing|2022|
|https://olympics....|         Matt GRAHAM|             

In [98]:
# Supprimer les doublons du DataFrame
datahouse = datahouse.dropDuplicates()

# Afficher le DataFrame sans doublons
datahouse.show()


+--------------------+-------------------+--------------------+----------------+------------------+-----------+----+
|         athlete_url|  athlete_full_name|games_participations|      first_game|athlete_year_birth|      place|year|
+--------------------+-------------------+--------------------+----------------+------------------+-----------+----+
|https://olympics....|  Arianna VALCEPINA|                   1|    Beijing 2022|              1994|    Beijing|2022|
|https://olympics....|         Tess COADY|                   1|    Beijing 2022|              2000|    Beijing|2022|
|https://olympics....|       Lisa SCHULTE|                   1|    Beijing 2022|              2000|    Beijing|2022|
|https://olympics....|         Julia KERN|                   1|    Beijing 2022|              1997|    Beijing|2022|
|https://olympics....|     Gus SCHUMACHER|                   1|    Beijing 2022|              2000|    Beijing|2022|
|https://olympics....|   Delphine CLAUDEL|                   2|P

In [102]:
# Vérifier la taille de votre DataFrame après suppression des valeurs manquantes
data_size_after_duplication = datahouse.count()

# Afficher la taille de votre DataFrame après suppression des valeurs manquantes
print("Taille de la data après suppression des valeurs dupliqué:", data_size_after_duplication)

Taille de la data après suppression des valeurs dupliqué: 73456


In [103]:
# Écrire les données dans un seul fichier CSV
datahouse.coalesce(1).write.csv("work/model/olympic_medals_clean.csv", header=True)
