# Bronze To Silver

In [0]:

dbutils.fs.ls("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/")

Out[1]: [FileInfo(path='abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/bronze/', name='bronze/', size=0, modificationTime=1731752687000),
 FileInfo(path='abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/gold/', name='gold/', size=0, modificationTime=1731939735000),
 FileInfo(path='abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/silver/', name='silver/', size=0, modificationTime=1731752696000)]

In [0]:
spark

## gitex file transformation

In [0]:
gitex = spark.read.format("csv").option("header", "true").load("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/bronze/gitex.csv")

In [0]:
gitex.columns

Out[4]: ['group src',
 'heading',
 'web',
 'web 2',
 'list-group-item-text',
 'sector_block',
 'sector_block 2',
 'sector_block 3',
 'sector_block 4',
 'sector_block 5',
 'sector_block 6',
 'sector_block 7',
 'sector_block 8',
 'sector_block 9',
 'sector_block 10',
 'sector_block_outer',
 'sector_block_outer href',
 'btn href',
 'group src 2']

In [0]:
gitex.select("group src").show(n=10, truncate = False)

+----------------------------------------------------------------------------------------------------------------------+
|group src                                                                                                             |
+----------------------------------------------------------------------------------------------------------------------+
|https://exhibitor-manual-004.s3.ap-south-1.amazonaws.com/Production/exb_doc/2011/14296/thumb_2011_14296_14139_2556.png|
|https://exhibitor-manual-004.s3.ap-south-1.amazonaws.com/Production/exb_doc/2011/14443/thumb_2011_14443_14286_8407.png|
|212 Founders’ ambition is to exercise a structuring impact on the entrepreneuria ..."                                 |
|https://exhibitor-manual-004.s3.ap-south-1.amazonaws.com/Production/exb_doc/2011/14297/thumb_2011_14297_14140_6313.png|
|https://exhibitor-manual-004.s3.ap-south-1.amazonaws.com/Production/exb_doc/2011/15566/2011_15566_15392_1267.png      |
|https://exhibitor-manual-004.s3

In [0]:
gitex = gitex.withColumnRenamed("heading", "exhibitor_Name")


In [0]:
gitex.select("web").show(n=30, truncate = False)

+-------------------------------------------+
|web                                        |
+-------------------------------------------+
|Stand No- 14A-16, Hall 14                  |
|Stand No- 9A-10, Hall 9                    |
|Incubator/Accelerator/Investment           |
|Stand No- 12C-61, Hall 12                  |
|Stand No- 9D-23, Hall 9                    |
|Stand No- 2D-12, Hall 2                    |
|Stand No- 1A-12, Hall 1                    |
|Stand No- 4B-25, Hall 4                    |
|Stand No- 2D-22, Hall 2                    |
| IOT and Mobile apps development solutions.|
|Software Services                          |
|Stand No- 8L-04, Hall 8                    |
|Stand No- 14D-60A, 14D-60B, Hall 13        |
|Stand No- 16D-50, Hall 16                  |
|Stand No- 9D-24, Hall 9                    |
| and machine learning. We help you ..."    |
|Stand No- 8M-06, Hall 8                    |
|Stand No- 4E-4, Hall 4                     |
|Stand No- Sponsor, Sponsor       

In [0]:
from pyspark.sql.functions import regexp_extract, when

gitex = gitex.withColumn("Stand", regexp_extract("web", r"Stand No- ([^,]+)", 1)) \
       .withColumn("Hall", regexp_extract("web", r"Hall (\d+)", 1))

gitex = gitex.withColumn("Stand", when(gitex["Stand"] == "", None).otherwise(gitex["Stand"])) \
       .withColumn("Hall", when(gitex["Hall"] == "", None).otherwise(gitex["Hall"]))



In [0]:
gitex.columns

Out[9]: ['group src',
 'exhibitor_Name',
 'web',
 'web 2',
 'list-group-item-text',
 'sector_block',
 'sector_block 2',
 'sector_block 3',
 'sector_block 4',
 'sector_block 5',
 'sector_block 6',
 'sector_block 7',
 'sector_block 8',
 'sector_block 9',
 'sector_block 10',
 'sector_block_outer',
 'sector_block_outer href',
 'btn href',
 'group src 2',
 'Stand',
 'Hall']

In [0]:
gitex = gitex.withColumnRenamed("web 2", "origin_country")

In [0]:
gitex = gitex.withColumnRenamed("list-group-item-text", "exhibitor_description")

In [0]:
gitex = gitex.withColumnRenamed("sector_block", "Primary_sector") \
             .withColumnRenamed("sector_block 2","Additional_sectors_2") \
               .withColumnRenamed("sector_block 3","Additional_sectors_3") \
                 .withColumnRenamed("sector_block 4","Additional_sectors_4") \
                   .withColumnRenamed("sector_block 5","Additional_sectors_5") \
                     .withColumnRenamed("sector_block 6","Additional_sectors_6") \
                       .withColumnRenamed("sector_block 7","Additional_sectors_7") \
                         .withColumnRenamed("sector_block 8","Additional_sectors_8") \
                           .withColumnRenamed("sector_block 9","Additional_sectors_9") \
                             .withColumnRenamed("sector_block 10","Additional_sectors_10") 

In [0]:
gitex.select("sector_block_outer").show(30, truncate = False)

+--------------------------------------------------------------------------------------------------+
|sector_block_outer                                                                                |
+--------------------------------------------------------------------------------------------------+
|Read More                                                                                         |
|null                                                                                              |
|null                                                                                              |
|null                                                                                              |
|null                                                                                              |
|Read More                                                                                         |
|null                                                                                      

In [0]:
from pyspark.sql.functions import col
null_count = gitex.filter(col("sector_block_outer href").isNull()).count()
non_null_count = gitex.filter(col("sector_block_outer href").isNotNull()).count()

print(f"Nombre de valeurs nulles dans 'sector_block_outer href': {null_count}")
print(f"Nombre de valeurs non nulles dans 'sector_block_outer href': {non_null_count}")

Nombre de valeurs nulles dans 'sector_block_outer href': 1454
Nombre de valeurs non nulles dans 'sector_block_outer href': 455


In [0]:
gitex.columns

Out[15]: ['group src',
 'exhibitor_Name',
 'web',
 'origin_country',
 'exhibitor_description',
 'Primary_sector',
 'Additional_sectors_2',
 'Additional_sectors_3',
 'Additional_sectors_4',
 'Additional_sectors_5',
 'Additional_sectors_6',
 'Additional_sectors_7',
 'Additional_sectors_8',
 'Additional_sectors_9',
 'Additional_sectors_10',
 'sector_block_outer',
 'sector_block_outer href',
 'btn href',
 'group src 2',
 'Stand',
 'Hall']

In [0]:

gitex = gitex.drop("group src", "web", "sector_block_outer", "sector_block_outer href", "group src 2", "btn href")


In [0]:
gitex.columns

Out[17]: ['exhibitor_Name',
 'origin_country',
 'exhibitor_description',
 'Primary_sector',
 'Additional_sectors_2',
 'Additional_sectors_3',
 'Additional_sectors_4',
 'Additional_sectors_5',
 'Additional_sectors_6',
 'Additional_sectors_7',
 'Additional_sectors_8',
 'Additional_sectors_9',
 'Additional_sectors_10',
 'Stand',
 'Hall']

In [0]:
gitex.select("Additional_sectors_3").show(30, truncate =False)

+--------------------------------+
|Additional_sectors_3            |
+--------------------------------+
|Cyber security                  |
|null                            |
|null                            |
|null                            |
|null                            |
|Big Data & Analytics            |
|null                            |
|null                            |
|null                            |
|null                            |
|Smart Contracts                 |
|Coding and Development          |
|Computer Vision                 |
|Coding and Development          |
|null                            |
|Future Mobility & Transportation|
|Blockchain                      |
|null                            |
|Energy Tech                     |
|Cyber security                  |
|null                            |
|null                            |
|null                            |
|Broadband Services              |
|Chatbots / Virtual Assistant    |
|null               

In [0]:
gitex.write.format("delta").mode("overwrite").save("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/silver/gitex")

## speakers file trasformation 

In [0]:
speakers = spark.read.format("csv").option("header", "true").load("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/bronze/speakers.csv")


In [0]:
speakers.columns

Out[21]: ['PopupLeftSide src',
 'SpeakersLeftInfo',
 'popdesignation',
 'popcompanyName',
 'popcountry',
 'Sepeaker_Bio',
 'Sepeaker_Bio 2',
 'Sepeaker_Bio 3',
 'Sepeaker_Bio 4',
 'Sepeaker_Bio 5']

In [0]:
speakers = speakers.withColumnRenamed("SpeakersLeftInfo","speaker_name") \
                   .withColumnRenamed("popdesignation", "speaker_Designation") \
                     .withColumnRenamed("popcompanyName", "company_represented") \
                       .withColumnRenamed("popcountry", "speaker_country_origin") \
                         .withColumnRenamed("Sepeaker_Bio", "Sepeaker_Bio_1") \
                           .withColumnRenamed("Sepeaker_Bio 2", "Sepeaker_Bio_2") \
                             .withColumnRenamed("Sepeaker_Bio 3", "Sepeaker_Bio_3") \
                               .withColumnRenamed("Sepeaker_Bio 4", "Sepeaker_Bio_4") \
                                 .withColumnRenamed("Sepeaker_Bio 5", "Sepeaker_Bio_5") 

                       
                       
                     


In [0]:
speakers = speakers.drop("PopupLeftSide src")

In [0]:
speakers.columns

Out[24]: ['speaker_name',
 'speaker_Designation',
 'company_represented',
 'speaker_country_origin',
 'Sepeaker_Bio_1',
 'Sepeaker_Bio_2',
 'Sepeaker_Bio_3',
 'Sepeaker_Bio_4',
 'Sepeaker_Bio_5']

In [0]:
speakers.select("Sepeaker_Bio_1").show(50)

+--------------------+
|      Sepeaker_Bio_1|
+--------------------+
|                null|
|                null|
|               Dr.\n|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|Saïd Ibrahimi was...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|Hicham El Habti i...|
|                null|
|Sacha Michaud is ...|
|                null|
|                null|
|Professor Akin Ab...|
|          

In [0]:
from pyspark.sql import functions as F

duplicates = speakers.groupBy("Sepeaker_Bio_1") \
                  .agg(F.count("*").alias("count")) \
                  .filter(F.col("count") > 1)

duplicates.show()


+--------------+-----+
|Sepeaker_Bio_1|count|
+--------------+-----+
|          null|  281|
+--------------+-----+



In [0]:

from pyspark.sql.functions import col
null_count = speakers.filter(col("Sepeaker_Bio_5").isNull()).count()
non_null_count = speakers.filter(col("Sepeaker_Bio_5").isNotNull()).count()

print(f"Nombre de valeurs nulles : {null_count}")
print(f"Nombre de valeurs non nulles : {non_null_count}")

Nombre de valeurs nulles : 317
Nombre de valeurs non nulles : 8


In [0]:
nombre_lignes = speakers.count()

print(f"Le nombre total de lignes dans la table est : {nombre_lignes}")


Le nombre total de lignes dans la table est : 325


In [0]:
speakers.write.format("delta").mode("overwrite").save("abfss://gitexdatacontainner@gitexdata.dfs.core.windows.net/silver/speakers")