In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, rand, when,current_timestamp, to_timestamp,lit, concat_ws, split, size, expr,regexp_replace, sha2,to_date,col

# Create the SparkSession for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("PySparkESG")
    .getOrCreate()
)

# BRONZE LAYER

In [2]:
# Bronze: country metadata, read with the first CSV row as the header and marked for caching.
countries = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGCountry.csv")
    .cache()
)

In [3]:
# Bronze: country/series table, read with the first CSV row as the header and marked for caching.
countries_series = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGCountry-series.csv")
    .cache()
)

In [4]:
# Bronze: ESG indicator values (later cells read one column per year from this table),
# read with the first CSV row as the header and marked for caching.
esg_data = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGCSV.csv")
    .cache()
)

In [5]:
# Bronze: footnotes table, read with the first CSV row as the header and marked for caching.
footnote = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGfootnote.csv")
    .cache()
)

In [6]:
# Bronze: series (indicator) definitions, read with the first CSV row as the header
# and marked for caching.
series = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGSeries.csv")
    .cache()
)

In [7]:
# Bronze: series-time table, read with the first CSV row as the header and marked for caching.
series_time = (
    spark.read
    .option("header", "true")
    .csv("ESG_Data/ESGSeries-time.csv")
    .cache()
)

# SILVER LAYER

Countries

In [None]:
# Country attributes keyed by CountryCode.
#
# The previous version joined to `countries_series` and then selected only
# `countries_series["*"]`, which discarded Region / Income Group / Special Notes
# even though the next cell fills nulls in "Special Notes". The join itself is
# performed in the next cell, so this frame keeps just the country attributes.
ecacountries = (
    countries
    # Regional filter is intentionally disabled; re-enable to restrict the model:
    # .where(col("Region") == "Europe & Central Asia")
    .select("Country Code", "Region", "Income Group", "Special Notes", "Table Name")
    .withColumnRenamed("Country Code", "CountryCode")
)

# Country/series rows joined to the attributes above in the next cell.
# This name was previously only present as a commented-out line, so the next
# cell raised NameError on a fresh kernel. `countries_series` is already cached
# where it is loaded, so no additional .cache() is needed here.
ecacountriesSeries = countries_series


All countries info DataFrame

In [9]:
# Replacement expression for "Special Notes": keep the value when present,
# otherwise substitute the literal "N/A".
special_notes_or_na = (
    when(col("Special Notes").isNull(), "N/A")
    .otherwise(col("Special Notes"))
)

# Silver: country attributes combined with the country/series rows.
CleanedCountriesInfo = (
    ecacountries
    # Explicit inner join on the shared key column name.
    .join(ecacountriesSeries, on="CountryCode", how="inner")
    # Remove the second frame's CountryCode if it is still present after the join.
    # NOTE(review): joining with on="CountryCode" presumably already yields a single
    # key column, making this a safeguard only - confirm.
    .drop(ecacountriesSeries["CountryCode"])
    .withColumn("Special Notes", special_notes_or_na)
    # Collapse exact duplicate rows, then mark the result for caching.
    .distinct()
    .cache()
)


Series

In [10]:

# Join condition: a footnote belongs to a series when their series codes match.
series_matches_footnote = footnote["SeriesCode"] == series["Series Code"]

# Silver: series definitions paired with their footnotes (inner join, so series
# without any footnote are not kept).
CleanedSeriesInfo = (
    series
    .join(footnote, series_matches_footnote, how="inner")
    .select(
        series["Series Code"],
        series["Topic"],
        series["Indicator Name"],
        series["Source"],
        footnote["CountryCode"],
        # Footnote columns are renamed so their origin stays obvious downstream.
        footnote["Year"].alias("footnoteYear"),
        footnote["DESCRIPTION"].alias("footnoteDescription"),
    )
    .cache()
)

Esg Data

In [11]:
# Attach the series/footnote columns to the ESG table, matching on "Indicator Name".
# NOTE(review): this rebinds `esg_data` to the joined result, so re-running this
# cell joins the already-joined frame again. Run it once per kernel, or bind the
# result to a new name and update the following cell accordingly.
# NOTE(review): CleanedSeriesInfo holds one row per series/footnote pair, so this
# join may repeat each ESG row once per matching footnote - confirm that is intended.
esg_data = CleanedSeriesInfo.join(esg_data, on="Indicator Name", how="inner")

In [12]:
# Unpivot the per-year columns of `esg_data` into long (Year, Value) rows.
#
# The stack() row count and the list of year columns are both derived from
# `esg_years`, so the two can no longer drift apart when the range is edited
# (they were previously hard-coded separately as `2023 - 1990 + 1` and
# `range(1990, 2024)`).
esg_years = range(1990, 2024)  # year columns 1990..2023 inclusive

# stack() takes alternating (label, column) pairs: the quoted year becomes the
# Year value and the back-ticked year is the source column holding the Value.
stack_pairs = ", ".join(f"'{year}', `{year}`" for year in esg_years)

CleanedEsgData = (
    esg_data
    .selectExpr(
        "`Country Name`",
        "`Country Code`",
        "`Indicator Name`",
        "`Indicator Code`",
        f"stack({len(esg_years)}, {stack_pairs}) as (Year, Value)",
    )
    .withColumn("Year", col("Year").cast("int"))
    # NOTE(review): "float" is Spark's 32-bit type; consider "double" if large
    # indicator values need more precision - left unchanged here.
    .withColumn("Value", col("Value").cast("float"))
    # Normalise column names to identifier style (no spaces).
    .withColumnRenamed("Country Code", "CountryCode")
    .withColumnRenamed("Country Name", "CountryName")
    .withColumnRenamed("Indicator Name", "IndicatorName")
    .withColumnRenamed("Indicator Code", "IndicatorCode")
    # "Series Code" is not among the selected columns, so the former rename of
    # it to "SeriesCode" was a no-op and has been removed.
    .cache()
)

In [None]:
# Silver: long-format ESG observations enriched with the cleaned country info.
# Inner join, so observations whose CountryCode has no country info are not kept.
LastCleanedDf = CleanedEsgData.join(
    CleanedCountriesInfo,
    on="CountryCode",
    how="inner",
)

# Preview the first rows of the joined result.
LastCleanedDf.show()

# GOLD LAYER

### In this section I'm going to create a dimensional model for this use case in order to better organize my data

In [None]:
# Preview the long-format ESG data without truncating long cell values
# (n=20 is show()'s default row count, written out explicitly).
CleanedEsgData.show(n=20, truncate=False)

In [None]:
# Fact table: the measured values with the keys needed to join to the dimensions.
# The original cell was an unfinished assignment (`FactMetrics =`), which is a
# SyntaxError. The projection below uses only columns that CleanedEsgData
# defines (CountryCode, IndicatorCode, Year, Value).
# NOTE(review): the intended grain/columns of the fact table were never written
# down - confirm this selection before building on it.
FactMetrics = CleanedEsgData.select(
    "CountryCode",
    "IndicatorCode",
    "Year",
    "Value",
)

Country Dimension

In [None]:
# Country dimension: one row per distinct combination of country code, name,
# region, income group and special notes.
#
# The original cell referenced `CleanedRenewableEnergy` and `CleanedEca`, which
# are not defined anywhere in this notebook, and columns `IncomeGroup` /
# `SpecialNotes` that are actually named "Income Group" / "Special Notes".
# It now reads from the silver frames that do exist.

# Country code -> display name, taken from the ESG observations.
country_names = CleanedEsgData.select("CountryCode", "CountryName").distinct()

# Country code -> descriptive attributes, renamed to space-free column names.
country_attrs = (
    CleanedCountriesInfo
    .select(
        col("CountryCode"),
        col("Region"),
        col("Income Group").alias("IncomeGroup"),
        col("Special Notes").alias("SpecialNotes"),
    )
    .distinct()
)

# De-duplicating each side before joining keeps the join small; the result has
# the same rows as joining first and calling distinct() afterwards.
DimCountry = country_names.join(country_attrs, on="CountryCode", how="inner")
DimCountry.show(truncate=False)

SeriesCode (Primary Key)
Indicator Name
Topic
Source
Long definition

In [None]:
# Indicator dimension with the columns listed in the notes above this cell:
# SeriesCode (primary key), Indicator Name, Topic, Source, Long definition.
#
# The original cell selected from `CleanedSeries`, which is not defined anywhere
# in this notebook. The bronze `series` frame is used instead because
# CleanedSeriesInfo carries footnote rows rather than one row per indicator.
# NOTE(review): assumes ESGSeries.csv has a "Long definition" column - confirm
# with series.printSchema().
DimIndicators = (
    series
    .select(
        col("Series Code").alias("SeriesCode"),
        col("Indicator Name"),
        col("Topic"),
        col("Source"),
        col("Long definition"),
    )
    .distinct()
)
DimIndicators.show()
                 

In [None]:
# Inspection only: print the column names and types of the bronze `countries` frame.
countries.printSchema()