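# Silver To Gold

This notebook reads the `speakers` and `gitex` Parquet datasets from the silver layer, separates the speaker biographies from the main speakers table, removes rows that carry no data, and writes the results to the gold layer as Delta tables.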

## speakers transformation

In [0]:
# Load the speakers Parquet file from the silver layer of the data lake.
speakers = spark.read.parquet(
    "abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/silver/speakers/speakers.snappy.parquet"
)


In [0]:
# Inspect the column names of the freshly loaded speakers frame.
speakers.columns

Out[4]: ['speaker_name',
 'speaker_Designation',
 'company_represented',
 'speaker_country_origin',
 'Sepeaker_Bio_1',
 'Sepeaker_Bio_2',
 'Sepeaker_Bio_3',
 'Sepeaker_Bio_4',
 'Sepeaker_Bio_5']

In [0]:
from pyspark.sql import functions as F

# Project the speaker name together with its five biography fragments.
bio_view_columns = [
    "speaker_name",
    "Sepeaker_Bio_1",
    "Sepeaker_Bio_2",
    "Sepeaker_Bio_3",
    "Sepeaker_Bio_4",
    "Sepeaker_Bio_5",
]
speakers_bio = speakers.select(*bio_view_columns)

# A row is kept as soon as one of the projected columns is non-null, so only
# rows that are null in every column are removed.
has_any_value = F.col(bio_view_columns[0]).isNotNull()
for column_name in bio_view_columns[1:]:
    has_any_value = has_any_value | F.col(column_name).isNotNull()

# NOTE(review): this name is spelled "filtred"; the rest of the visible
# notebook never reads it and builds `speakers_bio_filtered` separately.
# Confirm whether this frame is still needed.
speakers_bio_filtred = speakers_bio.filter(has_any_value)



In [0]:
# Confirm that the projection kept only the name and the five biography columns.
speakers_bio.columns

Out[6]: ['speaker_name',
 'Sepeaker_Bio_1',
 'Sepeaker_Bio_2',
 'Sepeaker_Bio_3',
 'Sepeaker_Bio_4',
 'Sepeaker_Bio_5']

In [0]:
# Preview the first 10 rows of the unfiltered biography projection.
speakers_bio.show(10)

+--------------------+--------------+--------------+--------------+--------------+--------------+
|        speaker_name|Sepeaker_Bio_1|Sepeaker_Bio_2|Sepeaker_Bio_3|Sepeaker_Bio_4|Sepeaker_Bio_5|
+--------------------+--------------+--------------+--------------+--------------+--------------+
|   H.E Ghita Mezzour|          null|          null|          null|          null|          null|
|H.E Khalid Aït Taleb|          null|          null|          null|          null|          null|
|H.E Dr. Tunji Alausa|         Dr.\n|          null|          null|          null|          null|
| Nigerian Ministe...|          null|          null|          null|          null|          null|
| emphasizing Univ...|          null|          null|          null|          null|          null|
| he's a U.S. boar...|          null|          null|          null|          null|          null|
| known for pionee...|          null|          null|          null|          null|          null|
| focusing on kidn..

In [0]:

# Count the rows of the unfiltered biography projection.
nombre_lignes = speakers_bio.count()

# Message (French): "The total number of rows in the table is: <count>".
print(f"Le nombre total de lignes dans la table est : {nombre_lignes}")

Le nombre total de lignes dans la table est : 325


In [0]:
from pyspark.sql.functions import col
from functools import reduce 

# Column that identifies the speaker; every other column of `speakers_bio`
# is treated as a biography fragment.
specific_column = "speaker_name"
bio_columns = [c for c in speakers_bio.columns if c != specific_column]

# Rows to discard: those where every biography column is null.
# This single predicate covers both "name present but no biography" and
# "row entirely null": isNull() always yields true/false (never null), so
# whether the name itself is null does not change the outcome.
final_condition = reduce(lambda x, y: x & y, (col(c).isNull() for c in bio_columns))

# Keep only the rows that carry at least one biography fragment.
speakers_bio_filtered = speakers_bio.filter(~final_condition)


In [0]:
# Preview up to 30 rows of the filtered biography frame to check the result.
speakers_bio_filtered.show(30)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        speaker_name|      Sepeaker_Bio_1|      Sepeaker_Bio_2|      Sepeaker_Bio_3|      Sepeaker_Bio_4|      Sepeaker_Bio_5|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|H.E Dr. Tunji Alausa|               Dr.\n|                null|                null|                null|                null|
|       Saïd Ibrahimi|Saïd Ibrahimi was...|                null|                null|                null|                null|
|     Hicham el Habti|Hicham El Habti i...|Hicham El Habti h...|He joined OCP Gro...|                null|                null|
|       Sacha Michaud|Sacha Michaud is ...|                null|                null|                null|                null|
|        Akin Abayomi|Professor Akin Ab...|                null|                null|                nul

In [0]:

# Count the rows that remain in the filtered biography frame.
nombre_lignes = speakers_bio_filtered.count()

# Message (French): "The total number of rows in the table is: <count>".
print(f"Le nombre total de lignes dans la table est : {nombre_lignes}")

Le nombre total de lignes dans la table est : 44


In [0]:
# Remove the five biography columns from the main speakers frame.
bio_columns_to_drop = [
    "Sepeaker_Bio_1",
    "Sepeaker_Bio_2",
    "Sepeaker_Bio_3",
    "Sepeaker_Bio_4",
    "Sepeaker_Bio_5",
]
speakers = speakers.drop(*bio_columns_to_drop)

In [0]:
# Verify which columns remain on the speakers frame after the drop.
speakers.columns

Out[13]: ['speaker_name',
 'speaker_Designation',
 'company_represented',
 'speaker_country_origin']

In [0]:
# Write the speakers frame to the gold layer in Delta format, overwriting
# whatever is already stored at the target path.
(
    speakers.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/gold/speakers")
)

In [0]:
# Write the filtered biography frame to the gold layer in Delta format,
# overwriting whatever is already stored at the target path.
(
    speakers_bio_filtered.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/gold/speakers_bio_filtered")
)

## gitex transformation

In [0]:
# Load the gitex Parquet file from the silver layer of the data lake.
gitex = spark.read.parquet(
    "abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/silver/gitex/gitex.snappy.parquet"
)


In [0]:
# Inspect the column names of the freshly loaded gitex frame.
gitex.columns

Out[17]: ['exhibitor_Name',
 'origin_country',
 'exhibitor_description',
 'Primary_sector',
 'Additional_sectors_2',
 'Additional_sectors_3',
 'Additional_sectors_4',
 'Additional_sectors_5',
 'Additional_sectors_6',
 'Additional_sectors_7',
 'Additional_sectors_8',
 'Additional_sectors_9',
 'Additional_sectors_10',
 'Stand',
 'Hall']

In [0]:
# One null test and one non-null test per gitex column.
null_checks = [col(name).isNull() for name in gitex.columns]
not_null_checks = [col(name).isNotNull() for name in gitex.columns]

# True only for rows in which every column is null.
all_null_condition = reduce(lambda acc, check: acc & check, null_checks)

# True for rows in which at least one column holds a value.
any_not_null_condition = reduce(lambda acc, check: acc | check, not_null_checks)

all_null_count = gitex.filter(all_null_condition).count()
any_not_null_count = gitex.filter(any_not_null_condition).count()

# Display the results
print(f"Nombre de lignes où toutes les colonnes sont nulles : {all_null_count}")
print(f"Nombre de lignes où au moins une colonne est non nulle : {any_not_null_count}")

Nombre de lignes où toutes les colonnes sont nulles : 132
Nombre de lignes où au moins une colonne est non nulle : 1777


In [0]:
# Discard the rows in which every column is null.
gitex = gitex.na.drop(how="all")

In [0]:
# Re-run the null census on the current gitex frame to confirm that no
# fully empty row is left.
null_checks = [col(name).isNull() for name in gitex.columns]
not_null_checks = [col(name).isNotNull() for name in gitex.columns]

# True only for rows in which every column is null.
all_null_condition = reduce(lambda acc, check: acc & check, null_checks)

# True for rows in which at least one column holds a value.
any_not_null_condition = reduce(lambda acc, check: acc | check, not_null_checks)

all_null_count = gitex.filter(all_null_condition).count()
any_not_null_count = gitex.filter(any_not_null_condition).count()

# Display the results
print(f"Nombre de lignes où toutes les colonnes sont nulles : {all_null_count}")
print(f"Nombre de lignes où au moins une colonne est non nulle : {any_not_null_count}")

Nombre de lignes où toutes les colonnes sont nulles : 0
Nombre de lignes où au moins une colonne est non nulle : 1777


In [0]:
# Write the cleaned gitex frame to the gold layer in Delta format,
# overwriting whatever is already stored at the target path.
(
    gitex.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/gold/gitex")
)