In [0]:
from pyspark.sql.functions import col, when, count, isnan, lag
from pyspark.sql.window import Window

In [0]:
# Mount the raw-data blob container into DBFS.
# The mount point is "/mnt/data-gcg" because that is the path every later cell
# lists and reads from; mounting elsewhere leaves those cells pointing at nothing.
mount_point = "/mnt/data-gcg"

# dbutils.fs.mount raises if the directory is already mounted, so only mount
# when the mount point is absent. This keeps the cell safe to re-run.
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())

if not already_mounted:
    # NOTE(review): the account key should be read from a secret scope
    # (dbutils.secrets.get) rather than pasted into the notebook as a literal.
    dbutils.fs.mount(source = "wasbs://<container-name>@<storage-acc>.blob.core.windows.net/raw-data",
                     mount_point = mount_point,
                     extra_configs = {"<config-key>": "<Account-key>"})

True

In [0]:
# List what is available under the mount point before reading anything from it.
mounted_entries = dbutils.fs.ls("/mnt/data-gcg")
mounted_entries

[FileInfo(path='dbfs:/mnt/data-gcg/gcg-rawdata-1900-2023', name='gcg-rawdata-1900-2023', size=65907681, modificationTime=1737563896000)]

In [0]:
# Display the active Spark session object as a quick sanity check.
# `spark` is not created in any visible cell; presumably it is provided by the
# Databricks runtime — confirm if this notebook is run elsewhere.
spark

In [0]:
# Read the mounted CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
raw_data_path = "/mnt/data-gcg/gcg-rawdata-1900-2023"
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
rawData = csv_reader.load(raw_data_path)

In [0]:
# Preview the first 20 rows (the defaults of show(), written out explicitly).
rawData.show(n=20, truncate=True)

+----+-------------------------------+-----------------------+-------------------+-----------------------------+
|Year|Global Average Temperature (°C)|CO2 Concentration (ppm)|Sea Level Rise (mm)|Arctic Ice Area (million km²)|
+----+-------------------------------+-----------------------+-------------------+-----------------------------+
|1948|                          13.17|                 397.04|             116.25|                         5.97|
|1996|                           13.1|                 313.17|             277.92|                         9.66|
|2015|                          14.67|                 311.95|             290.32|                          8.4|
|1966|                          14.79|                 304.25|             189.71|                        11.83|
|1992|                          13.15|                 354.52|              14.84|                        11.23|
|1907|                          13.67|                  288.5|              22.37|              

In [0]:
# Print the column names, inferred types and nullability of the loaded data.
rawData.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Global Average Temperature (°C): double (nullable = true)
 |-- CO2 Concentration (ppm): double (nullable = true)
 |-- Sea Level Rise (mm): double (nullable = true)
 |-- Arctic Ice Area (million km²): double (nullable = true)



In [0]:
# Data-quality check: for every column, count the rows whose value is either
# null or NaN. The result is a single row with one count per column.
missing_count_exprs = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in rawData.columns
]
missing_counts = rawData.select(missing_count_exprs)
missing_counts.show()


+----+-------------------------------+-----------------------+-------------------+-----------------------------+
|Year|Global Average Temperature (°C)|CO2 Concentration (ppm)|Sea Level Rise (mm)|Arctic Ice Area (million km²)|
+----+-------------------------------+-----------------------+-------------------+-----------------------------+
|   0|                              0|                      0|                  0|                            0|
+----+-------------------------------+-----------------------+-------------------+-----------------------------+



In [0]:
# Same rows as rawData, sorted by ascending Year.
dataYear = rawData.sort("Year")

In [0]:
# Year over year change in CO2 concentration
YOYCo2 = dataYear.withColumn("YOYCo2 change", col("CO2 Concentration (ppm)")- lag("CO2 Concentration (ppm)", 1,0).over(Window.orderBy("Year")))
YOYCo2.show(5)

+----+-------------------------------+-----------------------+-------------------+-----------------------------+------------------+
|Year|Global Average Temperature (°C)|CO2 Concentration (ppm)|Sea Level Rise (mm)|Arctic Ice Area (million km²)|     YOYCo2 change|
+----+-------------------------------+-----------------------+-------------------+-----------------------------+------------------+
|1900|                          14.09|                 411.08|             257.38|                         5.59|            411.08|
|1900|                          15.46|                 298.83|             298.92|                         6.94|           -112.25|
|1900|                           15.6|                 386.69|             261.45|                         3.41| 87.86000000000001|
|1900|                          15.39|                 402.74|             268.05|                        13.48| 16.05000000000001|
|1900|                          15.97|                 318.59|             1

In [0]:
# Print the schema of YOYCo2, e.g. to check the type of the added
# "YOYCo2 change" column.
YOYCo2.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Global Average Temperature (°C): double (nullable = true)
 |-- CO2 Concentration (ppm): double (nullable = true)
 |-- Sea Level Rise (mm): double (nullable = true)
 |-- Arctic Ice Area (million km²): double (nullable = true)
 |-- YOYCo2 change: double (nullable = true)



In [0]:
# Replace the change column with its integer form. The cast drops the
# fractional part, truncating toward zero (87.86 -> 87, -112.25 -> -112).
co2_change_as_int = YOYCo2["YOYCo2 change"].astype("int")
YOYCo2 = YOYCo2.withColumn("YOYCo2 change", co2_change_as_int)
YOYCo2.show(n=5)

+----+-------------------------------+-----------------------+-------------------+-----------------------------+-------------+
|Year|Global Average Temperature (°C)|CO2 Concentration (ppm)|Sea Level Rise (mm)|Arctic Ice Area (million km²)|YOYCo2 change|
+----+-------------------------------+-----------------------+-------------------+-----------------------------+-------------+
|1900|                          14.09|                 411.08|             257.38|                         5.59|         NULL|
|1900|                          15.46|                 298.83|             298.92|                         6.94|         -112|
|1900|                           15.6|                 386.69|             261.45|                         3.41|           87|
|1900|                          15.39|                 402.74|             268.05|                        13.48|           16|
|1900|                          15.97|                 318.59|             169.79|                         6.15