## 2. Fetch the "World Development Indicators" dataset to the local file system

In [None]:
%sh
# We switch to the root directory of our driver node
cd /
# We create a directory called input_wb_data and move into it
mkdir input_wb_data
cd input_wb_data
# We create a variable for the URL from which we'll fetch the data file
DATA_URL='https://databank.worldbank.org/data/download/WDI_csv.zip'
# We use the wget command to fetch the data
wget -O world_development_indicators.zip $DATA_URL
# We unzip the compressed file we received to extract the CSV files
unzip world_development_indicators.zip
rm world_development_indicators.zip

--2024-03-14 23:01:09--  https://databank.worldbank.org/data/download/WDI_csv.zip
Resolving databank.worldbank.org (databank.worldbank.org)... 104.18.42.231, 172.64.145.25, 2606:4700:4400::ac40:9119, ...
Connecting to databank.worldbank.org (databank.worldbank.org)|104.18.42.231|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://databankfiles.worldbank.org/public/ddpext_download/WDI_csv.zip [following]
--2024-03-14 23:01:10--  https://databankfiles.worldbank.org/public/ddpext_download/WDI_csv.zip
Resolving databankfiles.worldbank.org (databankfiles.worldbank.org)... 13.107.213.42, 13.107.246.42, 2620:1ec:bdf::42, ...
Connecting to databankfiles.worldbank.org (databankfiles.worldbank.org)|13.107.213.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73436470 (70M) [application/x-zip-compressed]
Saving to: ‘world_development_indicators.zip’

     0K .......... .......... .......... .......... ..........  0%  553K 2

Archive:  world_development_indicators.zip
  inflating: WDIData.csv             
  inflating: WDICountry.csv          
  inflating: WDISeries.csv           
  inflating: WDICountry-Series.csv   
  inflating: WDISeries-Time.csv      
  inflating: WDIFootNote.csv         


In [None]:
ls /input_wb_data/

WDICountry-Series.csv  WDIData.csv      WDISeries-Time.csv
WDICountry.csv         WDIFootNote.csv  WDISeries.csv


## 3. Move the different CSV files into the data lake’s raw directory 

In [None]:
from datetime import datetime

# Current date in YYYYMMDD format
current_date = datetime.now().strftime("%Y%m%d")

# Directory path
directory_path = f"/datalake/raw/world_development_indicators/date={current_date}/"

# Create the directory in DBFS
dbutils.fs.mkdirs(directory_path)

csv_files = dbutils.fs.ls("file:/input_wb_data/")

for file in csv_files:
    if file.name.endswith('.csv'):
        source_path = file.path
        destination_path = directory_path + file.name
        dbutils.fs.mv(source_path, destination_path)

# List files in the target directory to confirm the move
display(dbutils.fs.ls(directory_path))

path,name,size,modificationTime
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDICountry-Series.csv,WDICountry-Series.csv,1054124,1710457298000
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDICountry.csv,WDICountry.csv,157369,1710457298000
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDIData.csv,WDIData.csv,218787080,1710457294000
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDIFootNote.csv,WDIFootNote.csv,68811925,1710457297000
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDISeries-Time.csv,WDISeries-Time.csv,14917,1710457294000
dbfs:/datalake/raw/world_development_indicators/date=20240314/WDISeries.csv,WDISeries.csv,3999755,1710457294000


## 4. Fetch the CO2 emissions from passenger cars dataset

In [None]:

%sh
 apt-get install jq
 cd /
 mkdir eea_input_data
 cd eea_input_data/
 BASE_URL="https://discodata.eea.europa.eu"
 TABLE="CO2Emission.latest.co2cars"
 ORDER="order%20by%20emissions%20desc"
 for YEAR in 2017 2018 2019
 do
      CONDITION="year%20%3D%20$YEAR"
      SQL_QUERY="SELECT%20*%2C%20%22Enedc%20(g%2Fkm)%22%20as%20emissions%20FROM%20$TABLE%20WHERE%20$CONDITION%20$ORDER"
      FULL_URL="$BASE_URL/sql?query=$SQL_QUERY&p=1&nrOfHits=100000"
      curl $FULL_URL | jq '.results' > co2_emissions_passenger_cars_$YEAR.json
 done

Reading package lists...
Building dependency tree...
Reading state information...
The following NEW packages will be installed:
  jq
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 52.5 kB of archives.
After this operation, 102 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 jq amd64 1.6-2.1ubuntu3 [52.5 kB]


debconf: delaying package configuration, since apt-utils is not installed


Fetched 52.5 kB in 0s (852 kB/s)
Selecting previously unselected package jq.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 91195 files and directories currently installed.)
Preparing to unpack .../jq_1.6-2.1ubuntu3_amd64.deb ...
Unpacking jq (1.6-2.1ubuntu3) ...
Setting up jq (1.6-2.1ubuntu3) ...
Processing triggers for man-db (2.10.2-1) ...


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0 11 59.6M   11 6975k    0     0  1147k      0  0:00:53  0:00:06  0:00:47 1436k 51 59.6M   51 30.5M    0     0  4432k      0  0:00:13  0:00:07  0:00:06 6467k 85 59.6M   85 50.8M    0     0  6411k      0  0:00:09  0:00:08  0:00:01 10.3M100 59.6M  100 59.6M    0     0  7113k      0  0:00

In [None]:
%sh
ls /eea_input_data/

co2_emissions_passenger_cars_2017.json
co2_emissions_passenger_cars_2018.json
co2_emissions_passenger_cars_2019.json


## 5.  Move the different JSON files into the data lake’s raw directory 

In [None]:
from datetime import datetime

# Current date in YYYY format
#current_date = datetime.now().strftime("%Y")

json_files = dbutils.fs.ls("file:/eea_input_data/")

for file in json_files:
    if file.name.endswith('.json'):
        source_path = file.path
        year_file = file.name[-9:-5]
       
        # Directory path
        directory_path = f"/datalake/raw/co2_passenger_cars_emissions/year={year_file}/"

        # Create the directory in DBFS
        dbutils.fs.mkdirs(directory_path)

        destination_path = directory_path + file.name
        dbutils.fs.mv(source_path, destination_path)

# List files in the target directory to confirm the move
display(dbutils.fs.ls(directory_path))

path,name,size,modificationTime
dbfs:/datalake/raw/co2_passenger_cars_emissions/year=2018/co2_emissions_passenger_cars_2018.json,co2_emissions_passenger_cars_2018.json,87675091,1710457341000


## 5. Use Apache Spark to read the two datasets

In [None]:
df_world = spark.read.format("csv").option("header", "true").load("dbfs:/datalake/raw/world_development_indicators/date=20240313/WDIData.csv")
display(df_world)

Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,_c67
Africa Eastern and Southern,AFE,Access to clean fuels and technologies for cooking (% of population),EG.CFT.ACCS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.5803658833288,11.9079425294377,12.2851752012446,12.6076412016211,12.990589368338,13.3944040634838,13.8352545545734,14.2571197946687,14.687203312201,15.1241124592143,15.5452136820546,16.0286771765974,16.447498712171,16.9146250084891,17.3923491766017,17.8920045785891,18.3599925639738,18.7951511858057,19.2951759160947,19.7881558225926,20.2795987854143,20.7736266715633,,
Africa Eastern and Southern,AFE,"Access to clean fuels and technologies for cooking, rural (% of rural population)",EG.CFT.ACCS.RU.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.54624384950079,3.71375230244013,3.90490222403963,4.09365679549378,4.3120797561399,4.53043520417577,4.75964066633334,5.00414504385387,5.23758052815317,5.48348056189574,5.73385441202071,5.95751373846406,6.22427564223531,6.47330052280147,6.72033082192565,7.01591671529373,7.28139009342782,7.51367314282019,7.80956551360185,8.07588857955621,8.36600966417786,8.68413651796155,,
Africa Eastern and Southern,AFE,"Access to clean fuels and technologies for cooking, urban (% of urban population)",EG.CFT.ACCS.UR.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.6044998101412,32.9641587834881,33.4096166423109,33.7914967376809,34.2204852383808,34.7274443863933,35.159518390821,35.6949836335275,36.1021883549437,36.4475825793002,36.8496476206059,37.1803138455416,37.540748812663,37.8703467737823,38.1841517136575,38.5431800240538,38.8017190398214,39.039013572783,39.323186412267,39.6438476030676,39.8948301673967,40.2138914990743,,
Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,19.9573016173253,19.9753646761679,21.576500005381,22.5292648277365,23.749531370048,23.4930127792651,25.1912206446574,26.8079371107551,25.9433160820416,26.1938952124019,27.4001102274899,28.9143483184985,31.6669278676995,31.695183470279,31.8592573426147,33.903514646918,38.8514438926569,40.1973318677545,43.0283322203821,44.3897728327672,46.2686205693929,48.1036086917626,,
Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural population)",EG.ELC.ACCS.RU.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.62349738713396,,,9.76549107771651,10.9102432017627,10.4427955703514,12.5180411959847,12.5271501168806,12.9865086541341,15.5277914218334,14.4627604783022,16.1009040222933,19.3751714802833,18.6723059476993,17.6239561137036,16.5166329797999,24.5944742175144,25.3892973709915,27.0417428231018,29.1382848280998,30.9986867049528,32.7726898837157,,
Africa Eastern and Southern,AFE,"Access to electricity, urban (% of urban population)",EG.ELC.ACCS.UR.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,53.6816577808585,53.2353309685579,55.3409972063531,56.3335252331815,56.9948097726403,58.0645714875702,58.6094590195215,60.7681917576527,58.8837407439322,61.2102132896199,62.9601364534058,65.873454513374,66.6302986390862,66.7957345750975,65.9988980187913,67.0223317420289,68.9074035912735,70.663095630933,71.5653759017355,72.6116853088443,74.1299234862283,75.5591742389248,,
Africa Eastern and Southern,AFE,Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),FX.OWN.TOTL.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)",FX.OWN.TOTL.FE.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)",FX.OWN.TOTL.MA.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",FX.OWN.TOTL.OL.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df_world_data = spark.read.format("csv")\
.option("header", "true")\
    .option("delimiter", ",")\
    .load(f"dbfs:/datalake/raw/world_development_indicators/date={current_date}/WDIData.csv")

In [None]:
#from pyspark.sql.types import *
input_path = "/datalake/raw/co2_passenger_cars_emissions"
df_co2_emissions = spark.read.option("multiline", "true").json(input_path)


## 7. a) Schema

In [None]:
df_world.printSchema()

root
 |-- Country Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Indicator Name: string (nullable = true)
 |-- Indicator Code: string (nullable = true)
 |-- 1960: string (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (null

In [None]:
df_co2_emissions.printSchema()

root
 |-- At1 (mm): long (nullable = true)
 |-- At2 (mm): long (nullable = true)
 |-- Cn: string (nullable = true)
 |-- Cr: string (nullable = true)
 |-- Ct: string (nullable = true)
 |-- De: double (nullable = true)
 |-- Dr: string (nullable = true)
 |-- E (g/km): string (nullable = true)
 |-- Ec (cm3): long (nullable = true)
 |-- Enedc (g/km): long (nullable = true)
 |-- Ep (KW): long (nullable = true)
 |-- Er (g/km): string (nullable = true)
 |-- Ernedc (g/km): double (nullable = true)
 |-- Erwltp (g/km): double (nullable = true)
 |-- Ewltp (g/km): long (nullable = true)
 |-- Fc: string (nullable = true)
 |-- Fm: string (nullable = true)
 |-- Ft: string (nullable = true)
 |-- ID: long (nullable = true)
 |-- IT: string (nullable = true)
 |-- M (kg): long (nullable = true)
 |-- MMS: string (nullable = true)
 |-- MS: string (nullable = true)
 |-- Man: string (nullable = true)
 |-- Mh: string (nullable = true)
 |-- Mk: string (nullable = true)
 |-- Mp: string (nullable = true)
 |-- Mt: 

## 7. b) Statistics

In [None]:
display(df_world.describe())

summary,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,_c67
count,395276,395276,395276,395276,38882.0,44147.0,45329.0,46209.0,46752.0,48958.0,48960.0,49628.0,50281.0,51044.0,68400.0,75332.0,77117.0,77801.0,79214.0,82934.0,85050.0,87457.0,88216.0,89188.0,93726.0,95716.0,96073.0,97059.0,97700.0,99142.0,99915.0,100131.0,100226.0,101981.0,125486.0,135812.0,139140.0,141659.0,143911.0,151858.0,159036.0,152409.0,162444.0,161158.0,195946.0,182053.0,192018.0,194342.0,199622.0,211596.0,211967.0,213971.0,214781.0,216059.0,227155.0,222931.0,224663.0,221941.0,228962.0,229338.0,225025.0,221368.0,220008.0,215233.0,195566.0,166351.0,115040.0,0.0
mean,,,,,435316502831.0107,410375885455.37726,422524600944.7293,432409221414.9786,463783682170.0332,507477504471.9119,551620205922.166,603112882832.862,679260805758.2865,751807004631.9597,649075297137.3839,676292530837.1776,757965983939.5581,882693140275.2153,1177419142618.4053,1195362545359.4155,1330209293346.261,1286831563849.225,1116652103118.9502,1059618153129.37,836406555107.4633,804514788301.5282,887103791816.4341,1109892689473.4917,1131584670462.9167,1228772224405.77,1145162130315.3325,1155591324266.0146,897461957226.9663,973147182249.6166,1011441466635.0743,986336932241.1648,1004365070333.3068,1087614821224.0676,1166083106303.2468,1176225135386.1846,1108988574470.608,1178852965178.0205,1026355342648.0834,1056206009677.7976,924586071344.3566,1040853154261.332,1200453380397.8333,1170523106679.442,1327571707885.879,1354375185422.1284,1503745527001.9817,1778991904890.8967,1956165354882.0496,2018295985080.1743,2257578259250.597,2568456151704.422,2679877932183.748,2970408062949.453,3121971853796.352,3196345203454.992,3570034099624.2803,3832510940958.471,4303225758662.455,29842161409654.8,422512789039511.6,2601679923810640.5,2.682361530406412e+16,
stddev,,,,,19810703421436.09,20157324097574.336,21187109943069.285,22222373916682.77,23974891896261.13,27158423106472.68,30067118868780.44,33350172176622.49,37835704015921.48,42965378607531.67,41173633286911.35,45097557395507.17,51441606004651.89,58844418111687.23,80377555613607.97,80736047528504.95,91944321614682.3,88396236585690.12,73317343608530.31,68930186209084.21,51207790848272.37,48086263602882.01,54170552055249.016,67085564972749.05,70093618309250.98,82806430231044.06,76925519611244.98,70538251267388.73,46844447058742.445,50374340519160.695,58451218644625.016,57666974445172.8,58679884805547.56,64273260343523.05,68913602057356.22,68766757616553.64,59186769250370.02,59326491437725.93,54968434365973.805,58612918814758.88,58941780651853.0,66552336912711.27,64473639732448.375,75617098256106.3,73870248809649.72,78224205647211.64,82464743764249.02,89899108021698.08,93198907297478.31,94986228081485.22,105317605623169.31,115864168716110.7,116819504137683.05,127912162366384.58,137079452246086.94,139906465467695.12,156923010544926.56,169335405363600.97,193711698526366.84,1.1623132094223054e+16,1.8417880013607622e+17,1.057422779463962e+18,9.092173193463384e+18,
min,Afghanistan,ABW,ARI treatment (% of children under 5 taken to a health provider),AG.AGR.TRAC.NO,-0.0033083903,-0.0058159009,-0.0001556363635454,-0.0002432727263636,-0.000112838,-0.0006414545444545,-0.0005541818184545,-0.0008767272734545,-0.0002550001,-0.0004254536999999,-0.0002970001,-0.0006431430999999,-0.0002246189999999,-0.0004599999966666,-0.0004730224609375,-0.0001685807853699,-0.0001015634999994,-0.000157833,-0.00013113,-0.0001515258608406,-0.0002143096932329,-0.0001830900000641,-0.0004171078478858,-0.0001071072846597,-0.0002525830818812,-0.0002440246788606,-0.0003571163445718,-0.000244140625,-0.0005785792721866,-0.0001416120206931,-0.0024696757333603,-0.0001656240569332,-0.0003524249336775,-0.0003200324015287,-0.0001613809602482,-0.0001193285143601,-0.0002174283122446,-0.0020573879308598,-0.0013451272070363,-0.001434846508378,-0.0005352595244494,-0.0010438851788512,-0.0005637700843092,-0.0005885935938202,-0.0001220703125,-0.0001220703125,-0.00042724609375,-0.0020439824549403,-0.0004119873046875,-0.0003847966830509,-0.0001449688558574,-0.0002706054074224,-0.0001373291015625,-0.000213623046875,-0.0001957456431904,-0.0002046852368954,-0.004052454625679,-0.0001865432225203,-0.0001001358032226,-0.0001101493835449,-0.0003599748015403,-0.0001220703125,-0.0003206681409011,
max,Zimbabwe,ZWE,Young people (ages 15-24) newly infected with HIV,per_si_allsi.cov_q5_tot,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9999999.99993896,99999993.3333333,99999.9977648259,99999.9999999999,99999.9977648258,99999.9977648258,99999.9977648259,99999801300.0,9999999.74668025,999999.979510903,999922565.704,999999.973922969,99999.9977648258,999999.960884452,99999.9999999999,99999.9996274711,99999981203.7542,99993502642.5019,999951022.807046,99999.9977648258,999999.985098838,99996799.5051345,99999.9977648258,9999.99977648258,99999998000.0,99999.999627471,99999.9996274709,9999301.59978972,999939416700.0,99999998000.0,9999999.99999999,99999.999627471,9999655715.57034,99999.0,9999725274.72528,99999.9977648258,9999999.91245567,99999000000.0,99999.999627471,99999.999627471,999995424.414279,9999999.5,999999996.267259,9999.99977648258,99999.9977648259,999944538.0,99997826700000.0,999999.975785613,999999.996274709,999999.992549419,999999.970197677,9999511863.83045,999999206889.109,


In [None]:
display(df_co2_emissions.describe())

summary,At1 (mm),At2 (mm),Cn,Cr,Ct,De,Dr,E (g/km),Ec (cm3),Enedc (g/km),Ep (KW),Er (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Fc,Fm,Ft,ID,IT,M (kg),MMS,MS,Man,Mh,Mk,Mp,Mt,R,Status,T,TAN,VFN,Va,Ve,Version_file,Vf,W (mm),year,Z (Wh/km),Zr,emissions
count,283796.0,276264.0,299985,299092,298302,10009.0,0.0,0.0,299975.0,300000.0,248465.0,0.0,7155.0,2837.0,109271.0,0.0,299993,300000,300000.0,275347,299434.0,299262,299996,300000,300000,299782,300000,109747.0,300000.0,300000,299974,299872,285553,299901,299844,300000,16327.0,286707.0,300000.0,57.0,0.0,300000.0
mean,1630.9797742040057,1637.6940788521124,800.3777280320749,,,-0.0017782595663903,,,4193.517909825819,265.74504333333334,340.6469603364659,,1.637302585604349,0.0190341910468805,307.6761537827969,,,,5428304.994346667,7.285714285714286,2060.2311761523406,8.0,,,,8.0,,2197.4197290130937,2.3217833333333333,,474.68976943734225,111.0,,3.1976744186046466E90,322387.70663470967,,0.2548539229497152,2786.198125612559,2018.0,217.24561403508773,,265.74504333333334
stddev,57.1876011615477,53.33979711205964,948.7842391553709,,,0.040910280640427,,,1161.5307516224473,46.16069680797603,107.26642344267088,,0.3011054880652328,0.1855804693595503,35.98164437703905,,,,3900984.1100472105,3.5326937743674747,352.7165040089179,0.0,,,,0.0,,518.6146858038334,22.57625296139836,,5661.372274374247,128.17175976009693,,3.352264964430472E91,683784.4486736971,,0.4357924194707832,320.61966085480924,0.816497941758763,21.869973464738614,,46.16069680797603
min,501.0,501.0,,,,-0.078,,,898.0,185.0,27.0,,1.0,0.0,11.0,,,,307.0,,242.0,,AT,,AA-IVA,,,0.0,1.0,F,,,,,,v15,0.0,0.0,2017.0,100.0,,185.0
max,2895.0,2250.0,unknown,m1,m1,2.0,,,8382.0,671.0,1103.0,,3.0,1.9,535.0,,P,petrol/electric,15499148.0,e24*2,4160.0,ZESZUTA,SK,VOLVO CAR CORPORATION,unknown,ZENVO,VW GROUP PC,4612.0,3250.0,P,l50,e9*ks07/46*6716*01,RL-ZE1AEM57A6_0000-SJN-1,ZYX10(H),h,v20,1.0,4762.0,2019.0,296.0,,671.0


## 7. c) Number of records

In [None]:


print(f'Number of records  for World Development Indicators DataFrame is: {df_world.count()}')


Number of records  for World Development Indicators DataFrame is: 395276


In [None]:
print(f"Number of records  of CO2 Emissions DataFrame is: {df_co2_emissions.count()}")

Number of records  of CO2 Emissions DataFrame is: 300000


## 7. d) Sample from the two datasets

In [None]:
display(df_world.sample(fraction=0.00001))

Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,_c67
Cyprus,CYP,Adjusted savings: mineral depletion (current US$),NY.ADJ.DMIN.CD,,,,,,,,,,,10190113.1616984,6373649.80845472,5962953.94497045,10350492.1431835,9864318.1123715,3127978.93552354,4051772.15095905,1927653.64587632,1535004.86502435,693716.862188629,761017.931211332,0.0,205838.53154384,404865.14863129,295362.216541707,281979.92064392,89068.8243458013,32053.9679541438,221139.364036438,612571.36093426,367614.366548224,72241.1197017051,72423.0823507301,44405.4311671528,0.0,0.0,392690.527661548,964071.21907391,421822.875595412,576412.479230363,948081.822953723,422300.318840657,557858.879095043,492944.73131785,741398.709233644,0.0,0.0,628091.093019088,468001.607844664,2503310.47893927,4946712.74485517,8608124.99381906,7347577.61801209,4703287.02775891,3538403.40083868,1798120.80512297,0.0,722472.474336169,763563.229253612,580738.114374684,1314665.16375144,2623796.31934394,,
India,IND,Time required to obtain an operating license (days),IC.FRM.DURS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17.8,,,,,,,,23.8,
Lesotho,LSO,"Voice and Accountability: Percentile Rank, Upper Bound of 90% Confidence Interval",VA.PER.RNK.UPPER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.0,,48.756217956543,,51.7412948608398,,60.6965179443359,60.1990051269531,56.25,54.3269233703613,59.1346168518066,54.8076934814453,52.4038467407227,54.028434753418,54.028434753418,53.9906120300293,57.2769966125488,57.7464790344238,59.6059112548828,58.6206893920898,53.694580078125,54.1871910095215,55.8252410888672,53.1400947570801,53.6231880187988,54.5893707275391,55.5555572509766,
Turkmenistan,TKM,"Claims on central government, etc. (% GDP)",FS.AST.CGOV.GD.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
display(df_co2_emissions.sample(fraction=0.00001))

At1 (mm),At2 (mm),Cn,Cr,Ct,De,Dr,E (g/km),Ec (cm3),Enedc (g/km),Ep (KW),Er (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Fc,Fm,Ft,ID,IT,M (kg),MMS,MS,Man,Mh,Mk,Mp,Mt,R,Status,T,TAN,VFN,Va,Ve,Version_file,Vf,W (mm),year,Z (Wh/km),Zr,emissions
1684,1651,AMG GT S,M1,M1,,,,3982,262,384.0,,,,278.0,,M,petrol,7303603,,1700,MERCEDES-AMG,DE,MERCEDES-AMG GMBH,MERCEDES AMG,MERCEDES-AMG,DAIMLER AG,1787.0,1,P,197,E1*2007/46*0233*15,,Z48ST0,1ZAA020A,v19,,2630,2019,,,262
1690,1683,R ROVER SVA-BIO DYN V8 SC A,M1G,M1G,,,,4999,298,415.0,,,,,,M,Petrol,3250580,,2524,LAND ROVER,GB,JAGUAR LAND ROVER LIMITED,JAGUAR LAND ROVER LIMITED,LAND ROVER,TATA MOTORS JAGUAR LAND ROVER,,1,F,LG,e11*2007/46*0649*21,IP-09-SAL-2018-0050,S5IT2K,D5IA,v18,,2922,2018,,,298
1572,1572,WRANGLER UNLIMITED,M1,M1G,,,,3604,273,209.0,,,,,,M,Petrol,508482,,1995,FCA US LLC,FR,FCA US LLC,CHRYSLER,JEEP,FCA ITALY SPA,,1,F,JK,e4*2001/116*0116*23,,JXJFX,H5HN3A,v18,,2947,2018,,,273
1634,1631,S600 MAYBACH AUTO,,M1,,,,5980,274,,,,,,,M,petrol,423386,,2335,MERCEDES,GB,DAIMLER AG,DAIMLER AG,MERCEDES,DAIMLER AG,,1,F,221,e1*2001/116*0335*32,,P97BP0,NZAA0502,v16,,3365,2017,,,274
1665,1705,AMG GLE 43,M1,M1,,,,2996,209,270.0,,,,,,M,Petrol,945186,,2240,MERCEDES,BE,DAIMLER AG,DAIMLER AG,Mercedes-Benz,DAIMLER AG,,1,F,166,E1*2007/46*0598*19,,A35HP1,CZAAA502,v16,,2915,2017,,,209


In [4]:
import requests
import os
import json

# Create the directory for data
data_dir = "/eea_input_data"
os.makedirs(data_dir, exist_ok=True)

# Define the base URL and other constants
BASE_URL = "https://discodata.eea.europa.eu"
TABLE = "CO2Emission.latest.co2cars"
ORDER = "order by emissions desc"

# Loop through the specified years
for YEAR in [2020]:
    CONDITION = f"year = {YEAR}"
    COLUMNS = "*%2C cast(%22Enedc (g%2Fkm)%22 as emissions) * 1.1 as %22Enedc (g%2Fkm) V2%22"
    SQL_QUERY = f"SELECT {COLUMNS} FROM {TABLE} WHERE {CONDITION} {ORDER}"
    FULL_URL = f"{BASE_URL}/sql?query={SQL_QUERY}&p=1&nrOfHits=100000"
    
    # Make the HTTP GET request
    response = requests.get(FULL_URL)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()['results']  # Assuming 'results' is the key containing the data
        file_path = os.path.join(data_dir, f"co2_emissions_passenger_cars_{YEAR}.json")
        
        # Save the data to a JSON file
        with open(file_path, 'w') as f:
            json.dump(data, f)
    else:
        print(f"Failed to fetch data for {YEAR}: HTTP {response.status_code}")

Failed to fetch data for 2020: HTTP 403


In [2]:
import requests
import os
import json

data_dir = "eea_input_data"
os.makedirs(data_dir, exist_ok=True)

BASE_URL = "https://discodata.eea.europa.eu"
TABLE = "CO2Emission.latest.co2cars"
ORDER = "order by emissions desc"

headers = {
    'User-Agent': 'Mozilla/5.0'
}

for YEAR in [2017, 2018, 2019]:
    CONDITION = f"year = {YEAR}"
    COLUMNS = "*%2C cast(%22Enedc (g%2Fkm)%22 as float) * 1.1 as %22Enedc (g%2Fkm) V2%22"
    SQL_QUERY = f"SELECT {COLUMNS} FROM {TABLE} WHERE {CONDITION} {ORDER}"
    FULL_URL = f"{BASE_URL}/sql?query={SQL_QUERY}&p=1&nrOfHits=100000"
    
    response = requests.get(FULL_URL, headers=headers)
    
    if response.status_code == 200:
        data = response.json()['results']
        file_path = os.path.join(data_dir, f"co2_emissions_passenger_cars_{YEAR}.json")
        
        with open(file_path, 'w') as f:
            json.dump(data, f)
    else:
        print(f"Failed to fetch data for {YEAR}: HTTP {response.status_code}")

Failed to fetch data for 2017: HTTP 403
Failed to fetch data for 2018: HTTP 403
Failed to fetch data for 2019: HTTP 403
