In [None]:
%sh
# Downloads the 2020 passenger-car CO2 emission records from the EEA discodata
# SQL endpoint and stores the "results" array as a JSON file on the driver node.

# -y answers the confirmation prompt; the cell has no terminal to type into
apt-get install -y jq
# We switch to the root directory of our driver node
cd /
# We create a directory called eea_input_data (unless it already exists from an
# earlier run) and move into it
mkdir -p eea_input_data
cd eea_input_data/
# We create a set of variables for the data URL (all values are URL-encoded)
BASE_URL="https://discodata.eea.europa.eu"
TABLE="CO2Emission.latest.co2cars"
# NOTE(review): the ORDER BY term is single-quoted while COLUMNS double-quotes the
# same name - confirm the endpoint treats it as a column reference, not a literal
ORDER="ORDER%20BY%20%27Enedc%20(g%2Fkm)%27%20DESC"
# Only the year 2020 is requested by this cell
CONDITION="year%20%3D%202020"
# All columns plus a derived one: "Enedc (g/km)" cast to float and multiplied by 1.1,
# exposed as "Enedc (g/km) V2"
COLUMNS="*%2C%20cast(%22Enedc%20(g%2Fkm)%22%20as%20float)%20*%201.1%20as%20%22Enedc%20(g%2Fkm)%20V2%22"
SQL_QUERY="SELECT%20$COLUMNS%20FROM%20$TABLE%20WHERE%20$CONDITION%20$ORDER"
FULL_URL="$BASE_URL/sql?query=$SQL_QUERY&p=1&nrOfHits=100000"
# The URL is quoted so the shell neither glob-expands its * and ? characters nor
# splits it into words; -f makes curl fail on HTTP errors instead of piping an
# error page into jq, and -sS hides the progress meter but still prints errors
curl -fsS "$FULL_URL" | jq '.results' > co2_emissions_passenger_cars_2020.json

In [None]:
# Show the entries found directly under /datalake as a rendered table
display(
    dbutils.fs.ls("/datalake")
)

3. Copying a File from Another File System

In [None]:
# Destination on DBFS: the year=2020 folder of the raw CO2 passenger-car emissions area
target_path = "dbfs:/datalake/raw/co2_passenger_cars_emissions/year=2020/your_file.json"

# Origin of the JSON file (placeholder under a mount point; replace with the real location)
source_path = "/mnt/your_mount_name/path/to/your/source_file.json"

# Copy the file from the source location to DBFS with the Databricks file-system utility
dbutils.fs.cp(source_path, target_path)

print(f"File copied successfully to: {target_path}")

In [1]:
%md
## 4. Data Quality Operations (Project 1, Milestone 3)

UsageError: Line magic function `%md` not found.


In [None]:
 ## 6. Take a look at the data

In [None]:
# Append the Data to the Existing Delta Table

In [None]:
# Storage location of the curated CO2-emissions Delta table
# (a metastore table name could be used instead if the table is registered)
delta_table_path = "dbfs:/datalake/curated/eea_curated/co2_emissions"

# Add the rows of new_data_df to the table without touching the existing data
(
    new_data_df.write
    .format("delta")
    .mode("append")
    .save(delta_table_path)
)

In [None]:
# Delta Lake enforces strict schema validation on write operations to ensure data integrity.
# If a DataFrame being written to a Delta table contains columns not present in the table's schema,
# or if there's a data type mismatch, the operation will fail with a schema mismatch exception.
# This behavior protects against accidental schema changes that could lead to data quality issues.
# In cases where schema evolution is desired, Delta Lake provides mechanisms to handle additions
# of new columns or changes in data types, but such changes need to be explicitly allowed.


In [None]:
# Drop column

In [None]:
# Rename Columns in the DataFrame

In [None]:
# 'df' is expected to be the DataFrame loaded earlier in the notebook.
# Step 1: move the current emissions column out of the way.
updated_df = df.withColumnRenamed("Enedc_g/km", "Enedc_g/km_deprecated")
# Step 2: promote the V2 column so it takes over the original column name.
updated_df = updated_df.withColumnRenamed("Enedc_g/km_V2", "Enedc_g/km")

In [None]:
# Delta Lake provides robust schema validation to prevent accidental schema changes that could lead to data inconsistencies.
# In this case, by renaming 'Enedc_g/km' to 'Enedc_g/km_deprecated' and 'Enedc_g/km_V2' to 'Enedc_g/km', we align the DataFrame's
# schema with our intention to update the emissions data. Using 'option("mergeSchema", "true")' during the write operation,
# we allow Delta Lake to evolve the schema of the existing table to incorporate these changes, ensuring that the correct
# emissions data ('Enedc_g/km') is used for the year 2020. This showcases Delta Lake's capability to handle schema evolution
# in a controlled manner, ensuring data integrity while accommodating necessary changes to the data structure.


In [None]:
from pyspark.sql.functions import col

# Replace 'Enedc_g/km' with an integer-typed version of itself
updated_df = updated_df.withColumn(
    "Enedc_g/km",
    col("Enedc_g/km").cast("int"),
)

In [None]:
# Target the same curated Delta table that the earlier plain append wrote to.
# The previous placeholder path pointed at a different location, so the write
# would not have evolved the existing table's schema.
delta_table_path = "dbfs:/datalake/curated/eea_curated/co2_emissions"

# Append the data with schema evolution enabled: mergeSchema allows columns that
# exist in updated_df but not yet in the table to be added during the write.
(
    updated_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(delta_table_path)
)

# Or, if using a Delta table name:
# (updated_df.write.format("delta")
#         .mode("append")
#         .option("mergeSchema", "true")
#         .saveAsTable("eea_curated.co2_emissions"))