## Create databases

In [None]:
# List the databases (schemas) that currently exist, before creating any.
spark.sql("SHOW DATABASES").show()

+------------------+
|      databaseName|
+------------------+
|           default|
|       eea_curated|
|information_schema|
|        wdi_curate|
+------------------+



In [None]:
# List the available catalogs; the databases below are created in `data_demo`.
spark.sql("SHOW CATALOGS").show()

+--------------+
|       catalog|
+--------------+
|     data_demo|
|hive_metastore|
|       samples|
|        system|
+--------------+



In [None]:
%sql
-- Create the two curated databases in the data_demo catalog.
-- IF NOT EXISTS makes the cell safe to re-run.
CREATE DATABASE IF NOT EXISTS data_demo.wdi_curate;
CREATE DATABASE IF NOT EXISTS data_demo.eea_curated;

In [None]:
# Re-list the databases to confirm wdi_curate and eea_curated are present.
spark.sql("SHOW DATABASES").show()

+------------------+
|      databaseName|
+------------------+
|           default|
|       eea_curated|
|information_schema|
|        wdi_curate|
+------------------+



In [None]:
# Make data_demo.wdi_curate the current schema for subsequent unqualified names.
spark.sql("USE data_demo.wdi_curate")


DataFrame[]

In [None]:
%sql

-- Check which schema is currently in use.
SELECT current_schema();

current_database()
wdi_curate


## World Development Indicators data

In [None]:
# Load the raw WDI data CSV, taking column names from the first row.
# NOTE(review): the next cell reads this same file again, so one of the two reads is redundant.
df_world_data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/datalake/raw/world_development_indicators/date=20240313/WDIData.csv")
)


In [None]:
# Raw-zone folder holding the WDI snapshot that is loaded below.
WDI_RAW_PATH = "dbfs:/datalake/raw/world_development_indicators/date=20240313"


def _read_wdi_csv(file_name, multiline=False):
    """Read one CSV file of the WDI snapshot, taking column names from the header row.

    Args:
        file_name: File name inside `WDI_RAW_PATH`, e.g. "WDIData.csv".
        multiline: When True, quoted fields may contain line breaks and a
            doubled quote ("") is treated as an escaped quote. This turns on
            Spark's `multiLine` CSV mode.

    Returns:
        The DataFrame produced by `spark.read ... .load(...)`; every column is
        read as a string because no schema is given or inferred.
    """
    reader = spark.read.format("csv").option("header", "true")
    if multiline:
        reader = (
            reader
            .option("multiLine", "true")
            .option("quote", '"')
            .option("escape", '"')
        )
    return reader.load(f"{WDI_RAW_PATH}/{file_name}")


df_world_data = _read_wdi_csv("WDIData.csv")
# The country and series files carry long free-text columns (notes, definitions).
# Read line-by-line, the series file gives 4454 rows of which only 2399 are
# distinct, which points to records being split on embedded line breaks.
# NOTE(review): confirm the row counts after this change match the number of
# countries / indicators expected in the snapshot.
df_world_country = _read_wdi_csv("WDICountry.csv", multiline=True)
df_world_series = _read_wdi_csv("WDISeries.csv", multiline=True)

In [None]:
def rename_columns(df):
    """Return `df` with every space in its column names replaced by an underscore.

    All columns are renamed in a single `toDF` call, so the query plan gains one
    projection instead of one per column as with chained `withColumnRenamed`.

    Args:
        df: DataFrame whose column names may contain spaces.

    Returns:
        A new DataFrame with the same columns in the same order, renamed.
        Column names without spaces are left unchanged.
    """
    renamed = [name.replace(" ", "_") for name in df.columns]
    return df.toDF(*renamed)

In [None]:
# Replace spaces in the column names of all three WDI DataFrames.
df_world_data, df_world_country, df_world_series = (
    rename_columns(frame)
    for frame in (df_world_data, df_world_country, df_world_series)
)

In [None]:
# Spot-check the first five country column names after renaming.
df_world_country.columns[:5]

['Country_Code', 'Short_Name', 'Table_Name', 'Long_Name', '2-alpha_code']

In [None]:
# Inspect all series column names after renaming.
# NOTE(review): the trailing `_c20` in the output has no header name; presumably the header row ends with a trailing comma; confirm.
df_world_series.columns

['Series_Code',
 'Topic',
 'Indicator_Name',
 'Short_definition',
 'Long_definition',
 'Unit_of_measure',
 'Periodicity',
 'Base_Period',
 'Other_notes',
 'Aggregation_method',
 'Limitations_and_exceptions',
 'Notes_from_original_source',
 'General_comments',
 'Source',
 'Statistical_concept_and_methodology',
 'Development_relevance',
 'Related_source_links',
 'Other_web_links',
 'Related_indicators',
 'License_Type',
 '_c20']

In [None]:
# Remove rows in which every column is null from all three WDI DataFrames.
df_world_data, df_world_country, df_world_series = (
    frame.na.drop(how='all')
    for frame in (df_world_data, df_world_country, df_world_series)
)



In [None]:
# Row counts per DataFrame. This cell runs after the dropna(how='all') cell and
# before duplicates are removed, so the message is labelled accordingly.
dataframes = {
  'WDI_Data'   : df_world_data,
  'WDI_Country': df_world_country,
  'WDI_Series' : df_world_series
}
for df_name, df in dataframes.items():
  print(f'Number of records of dataframe {df_name} after dropping all-null rows (before dropping duplicates): {df.count()}')
 

Number of records of dataframe WDI_Data before dropping nulls: 395276
Number of records of dataframe WDI_Country before dropping nulls: 275
Number of records of dataframe WDI_Series before dropping nulls: 4454


In [None]:

# Remove exact duplicate rows from each WDI DataFrame.
df_world_data = df_world_data.dropDuplicates()
df_world_country = df_world_country.dropDuplicates()
df_world_series = df_world_series.dropDuplicates()

# Rebuild the lookup so it points at the de-duplicated DataFrames.
dataframes = {
  'WDI_Data'   : df_world_data,
  'WDI_Country': df_world_country,
  'WDI_Series' : df_world_series
}
for df_name, df in dataframes.items():
  # The step measured here is de-duplication, so the message says so.
  print(f'Number of records of dataframe {df_name} after dropping duplicates: {df.count()}')

Number of records of dataframe WDI_Data after dropping nulls: 395276
Number of records of dataframe WDI_Country after dropping nulls: 275
Number of records of dataframe WDI_Series after dropping nulls: 2399


In [None]:
from pyspark.sql.functions import length, col

# Keep only rows whose Country_Code is exactly three characters long
# (rows with a null Country_Code are dropped as well).
has_three_char_code = length(col("Country_Code")) == 3
df_world_data = df_world_data.filter(has_three_char_code)
df_world_country = df_world_country.filter(has_three_char_code)

# Discard series rows whose Series_Code contains a space.
# NOTE(review): presumably these are fragments of mis-parsed records rather than real codes; confirm.
df_world_series = df_world_series.filter(~col("Series_Code").contains(" "))

In [None]:
partition ='20240313'
dbfs_base_path   = 'dbfs:/datalake/curated/wdi'
output_partition = f'year={partition[:4]}/month={partition[4:6]}/day={partition[6:]}/'
save_path='dbfs:/datalake/curated/wdi/data/year=2024/month=03/day=13/'

(
  df_world_data
  .coalesce(1)
  .write
   .mode("overwrite")
  .format('parquet')
  .option('path', f'{dbfs_base_path}/data/{output_partition}')
  .save()
)


In [None]:
# List the available catalogs again before creating tables.
spark.sql("SHOW CATALOGS").show()

+--------------+
|       catalog|
+--------------+
|     data_demo|
|hive_metastore|
|       samples|
|        system|
+--------------+



In [None]:
# List the databases again before creating tables in wdi_curate.
spark.sql("SHOW DATABASES").show()

+------------------+
|      databaseName|
+------------------+
|           default|
|       eea_curated|
|information_schema|
|        wdi_curate|
+------------------+



In [None]:
# DataFrames to publish, keyed by the table name each one gets in data_demo.wdi_curate.
data_wdi_curate = {
  'data'   : df_world_data,
  'country': df_world_country,
  'series' : df_world_series
}
# S3 bucket and prefix under which the table files are written.
s3_bucket = "databricks-workspace-stack-10fab-bucket"
s3_path = "unity-catalog/1803637943354536"

# Unqualified table names in the CREATE TABLE statements resolve against this schema.
spark.sql("USE data_demo.wdi_curate")
for name, df in data_wdi_curate.items():

    # Write the DataFrame to S3 in Parquet format, replacing any previous files.
    df_temp_path = f"s3a://{s3_bucket}/{s3_path}/temp_{name}"
    df.write.mode("overwrite").parquet(df_temp_path)

    # Register the Parquet files as a table at that location; an existing
    # table of the same name is left as it is.
    spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {name}
    USING PARQUET
    LOCATION '{df_temp_path}'
    """)


In [None]:
# Raw-zone folder with the CO2 passenger-car emissions JSON files.
input_path = "/datalake/raw/co2_passenger_cars_emissions"
# multiLine lets a single JSON record span several lines of the file.
df_co2_emissions = spark.read.json(input_path, multiLine=True)


Unnamed: 0,At1 (mm),At2 (mm),Cn,Cr,Ct,De,Dr,E (g/km),Ec (cm3),Enedc (g/km),Ep (KW),Er (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Fc,Fm,Ft,ID,IT,M (kg),MMS,MS,Man,Mh,Mk,Mp,Mt,R,Status,T,TAN,VFN,Va,Ve,Version_file,Vf,W (mm),Year,Z (Wh/km),Zr,emissions
0,1679.0,1632.0,458 SPECIALE A AD S-A,,M1,,,,4497.0,559,,,,,,,M,petrol,416839,,1485.0,FERRARI,GB,FERRARI SPA,FERRARI,FERRARI,,,1,F,F142,e3*2007/46*0040*10,,AB,L,v16,,2650.0,2017,,,559
1,1679.0,1632.0,458 SPECIALE A AD S-A,,M1,,,,4497.0,559,,,,,,,M,petrol,416839,,1485.0,FERRARI,GB,FERRARI SPA,FERRARI,FERRARI,,,1,P,F142,e3*2007/46*0040*10,,AB,L,v15,,2650.0,2017,,,559
2,,,AUDI A8,M1,M1,,,,3993.0,545,420.0,,,,,,M,PETROL,1360362,,,AUDI AG,DE,AA-IVA,AA-IVA,,,,4,P,F8,,,,,v15,,,2017,,,545
3,,,AUDI A8,M1,M1,,,,3993.0,545,420.0,,,,,,M,PETROL,1360362,,,AUDI AG,DE,AA-IVA,AA-IVA,,,,4,F,F8,,,,,v16,,0.0,2017,,,545
4,1714.0,1617.0,BUGATTIGRANDSPORTVITESSE,M1,M1,,,,7993.0,539,882.0,,,,,,M,PETROL,1406058,,2065.0,BUGATTI (F),DE,BUGATTI AUTOMOBILES SAS,BUGATTI,BUGATTI,VW GROUP PC,,1,F,5B,E1*KS07/46*0008*03,,AEXCLBAX1,AA7AD71C001S,v16,,2710.0,2017,,,539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4526776,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,P,M3,e1*2007/46*0377*12,,1H91,6A040000,v15,,2693.0,2017,,,185
99996,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4526783,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,F,M3,e1*2007/46*0377*12,,1H91,6A040000,v16,,2693.0,2017,,,185
99997,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4526783,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,P,M3,e1*2007/46*0377*12,,1H91,6A040000,v15,,2693.0,2017,,,185
99998,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4526790,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,F,M3,e1*2007/46*0377*12,,1H91,6A040000,v16,,2693.0,2017,,,185


In [None]:
# Normalise the column names: spaces become underscores and parentheses are
# removed (e.g. 'At1 (mm)' -> 'At1_mm'). All columns are renamed in a single
# toDF() call rather than one withColumnRenamed() projection per column.
df_co2_emissions = df_co2_emissions.toDF(*[
    col_name.replace(' ', '_').replace('(', '').replace(')', '')
    for col_name in df_co2_emissions.columns
])


In [None]:
# Spot-check the first five column names after renaming.
df_co2_emissions.columns[:5]

['At1_mm', 'At2_mm', 'Cn', 'Cr', 'Ct']

In [None]:
# Report the row count on either side of removing rows whose columns are all null.
rows_before = df_co2_emissions.count()
print(f'Number of records of dataframe co2_emissions before dropping nulls: {rows_before}')

df_co2_emissions = df_co2_emissions.na.drop(how='all')

rows_after = df_co2_emissions.count()
print(f'Number of records of dataframe co2_emissions after dropping nulls: {rows_after}')

Number of records of dataframe co2_emissions before dropping nulls: 100000
Number of records of dataframe co2_emissions after dropping nulls: 100000


In [None]:

# Remove exact duplicate rows, then keep only rows whose `MS` value is exactly
# two upper-case ASCII letters (presumably the member-state code; confirm).
df_co2_emissions = (
    df_co2_emissions
    .dropDuplicates()
    .where(length(col("MS")) == 2)
    .filter(col("MS").rlike("^[A-Z]{2}$"))
)

# Preview a handful of rows only: toPandas() on the full DataFrame would
# collect every row to the driver just to display it.
df_co2_emissions.limit(10).toPandas()

Unnamed: 0,At1_mm,At2_mm,Cn,Cr,Ct,De,Dr,E_g/km,Ec_cm3,Enedc_g/km,Ep_KW,Er_g/km,Ernedc_g/km,Erwltp_g/km,Ewltp_g/km,Fc,Fm,Ft,ID,IT,M_kg,MMS,MS,Man,Mh,Mk,Mp,Mt,R,Status,T,TAN,VFN,Va,Ve,Version_file,Vf,W_mm,Year,Z_Wh/km,Zr,emissions
0,1570.0,1575.0,VANTAGE S V12,,M1,,,,5935.0,395,,,,,,,M,PETROL,404725,,1740.0,ASTON MARTIN,GB,ASTON MARTIN LAGONDA LTD,ASTON MARTIN,ASTON MARTIN,,,28,F,VH2(SE),E11*KS07/46*0207*09,,SE17,R2RMDH,v16,,2600.0,2017,,,395
1,1669.0,1665.0,CONTINENTAL,,m1,,,,5998.0,358,,,,,,,M,Petrol,115791,,2355.0,BENTLEY MOTORS LTD,ES,BENTLEY MOTORS LTD,BENTLEY,BENTLEY,,,2,F,3W,E11*2001/116*0221*24,,FR4X,1,v16,,2746.0,2017,,,358
2,1669.0,1665.0,CONTINENTAL,,M1,,,,5998.0,338,472.0,,,,,,M,PETROL,1812268,,2395.0,BENTLEY MOTORS LTD,LU,BENTLEY MOTORS LTD,BENTLEY,BENTLEY,,,1,P,3W,E11*2001/116*0221*23,,FP4X,1,v15,,2746.0,2017,,,338
3,1586.0,1590.0,unknown,,M1,,,,4691.0,337,,,,,,,M,petrol,432693,,2055.0,MASERATI,GB,MASERATI SPA,MASERATI,MASERATI,,,1,F,M145,e3*2001/116*0262*13,,GD,A,v16,,2942.0,2017,,,337
4,1644.0,1642.0,FLYING SPUR W12,,M1,,,,5998.0,335,460.0,,,,,,M,Petrol,509287,,2565.0,BENTLEY MOTORS LIMITED,SE,BENTLEY MOTORS LTD,BENTLEY,BENTLEY,,,1,F,3W,e11*2001/116*0221*23,,EB4X,1,v16,,3065.0,2017,,,335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1690.0,1690.0,RANGE ROVER SPORT,M1,M1G,,,,2993.0,185,225.0,,,,,,M,Diesel,3079126,,2202.0,JAGUAR LAND ROVER LIMITED,FR,JAGUAR LAND ROVER LIMITED,JAGUAR LAND ROVER LIMITED,LAND ROVER,TATA MOTORS JAGUAR LAND ROVER,,1,P,LW,e11*2007/46*0909*12,,S5CH2F,A5C4,v15,,2923.0,2017,,,185
99996,1690.0,1685.0,Range Rover Sport 3.0 SDV6,M1G,M1G,,,,2993.0,185,225.0,,,,,,M,Diesel,4105305,,2203.0,"JAGUAR LAND ROVER LIMITED, V.",CZ,JAGUAR LAND ROVER LIMITED,JAGUAR LAND ROVER LIMITED,LAND ROVER,TATA MOTORS JAGUAR LAND ROVER,,1,F,LW,e11*2007/46*0909*10,,S5CH2F,A5C2,v16,,2923.0,2017,,,185
99997,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4525873,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,F,M3,e1*2007/46*0377*12,,1H91,6A0400V0,v16,,2693.0,2017,,,185
99998,1583.0,1583.0,M2,M1,M1,,,,2979.0,185,272.0,,,,,,M,Petrol,4526197,,1595.0,BMW M GMBH,FR,BMW M GMBH,BMW GMBH,BMW,BMW GROUP,,1,F,M3,e1*2007/46*0377*12,,1H91,6A040000,v16,,2693.0,2017,,,185
