In [0]:
# Goal: prepare the data needed to analyse the following things:
#   (1) Data about the United Kingdom specifically
#   (2) Data from 2012 specifically
#   (3) Education trends on a per-country basis
#   (4) Data trends on a per-region basis
#   (5) Data trends on a per-decade basis
#
# First, load the three dimension tables and the fact table by name from the
# default database. The names on the left are used by every later cell.
dimTableCountry, dimTableDate, dimTableIndicator, factTable = (
    spark.table(tableName)
    for tableName in (
        "dimensiontablecountrycode",
        "dimensiontabledate",
        "dimensiontableindicatorcode",
        "facttable",
    )
)

In [0]:
# Table (1): data about the United Kingdom specifically.
# The fact table is inner-joined to each dimension table on its key column,
# restricted to the rows whose region code is "GBR", and reduced to the
# indicator name, the year and the value.
ukTable = (
    factTable
    .join(
        dimTableCountry,
        on=factTable.regionCode == dimTableCountry.regionCode,
        how="inner",
    )
    .filter(dimTableCountry.regionCode == "GBR")
    .join(
        dimTableDate,
        on=factTable.dateYear == dimTableDate.dateYear,
        how="inner",
    )
    .join(
        dimTableIndicator,
        on=factTable.indicatorCode == dimTableIndicator.indicatorCode,
        how="inner",
    )
    .select(
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(ukTable)

indicatorName,year,value
Forest area (% of land area),1988,
Forest area (% of land area),1989,
Forest area (% of land area),1990,11.48266027
Forest area (% of land area),1991,11.55540859
Forest area (% of land area),1992,11.6281569
Forest area (% of land area),1993,11.70090522
Forest area (% of land area),1994,11.77365354
Forest area (% of land area),1995,11.84640185
Forest area (% of land area),1996,11.91915017
Forest area (% of land area),1997,11.99189848


In [0]:
# Table (2): data from 2012 specifically.
# The fact table is inner-joined to the date dimension and restricted to the
# rows whose year equals "2012"; the country and indicator dimensions are then
# joined in so that names can be selected alongside the value.
table2012 = (
    factTable
    .join(
        dimTableDate,
        on=factTable.dateYear == dimTableDate.dateYear,
        how="inner",
    )
    .filter(dimTableDate.dateYear == "2012")
    .join(
        dimTableCountry,
        on=factTable.regionCode == dimTableCountry.regionCode,
        how="inner",
    )
    .join(
        dimTableIndicator,
        on=factTable.indicatorCode == dimTableIndicator.indicatorCode,
        how="inner",
    )
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        factTable.value,
    )
)
display(table2012)

countryName,indicatorName,value
Aruba,Forest area (% of land area),2.333333333
Africa Eastern and Southern,Forest area (% of land area),31.97249225
Afghanistan,Forest area (% of land area),1.850994088
Africa Western and Central,Forest area (% of land area),20.49809922
Angola,Forest area (% of land area),56.9887511
Albania,Forest area (% of land area),28.64660584
Andorra,Forest area (% of land area),34.04255319
Arab World,Forest area (% of land area),2.960547902
United Arab Emirates,Forest area (% of land area),4.467755562
Argentina,Forest area (% of land area),10.87708144


In [0]:
# Table (3): education trends on a per-country basis.
# The fact table is inner-joined to the indicator dimension and restricted to
# education indicators, then joined to the country and date dimensions so that
# the country name, indicator name, year and value can be selected.
#
# The indicator filter is anchored to the "SE." prefix rather than matching
# "SE" anywhere in the code: an unanchored '%SE%' pattern would also pick up
# any non-education code that merely contains those two letters
# (e.g. "SL.EMP.SELF.ZS").
# NOTE(review): assumes indicator codes follow the World Bank convention where
# education indicators start with "SE." - confirm against the dimension table.
educationTable = (
    factTable
    .join(
        dimTableIndicator,
        on=factTable.indicatorCode == dimTableIndicator.indicatorCode,
        how="inner",
    )
    .filter(dimTableIndicator.indicatorCode.like('SE.%'))
    .join(
        dimTableCountry,
        on=factTable.regionCode == dimTableCountry.regionCode,
        how="inner",
    )
    .join(
        dimTableDate,
        on=factTable.dateYear == dimTableDate.dateYear,
        how="inner",
    )
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(educationTable)

countryName,indicatorName,year,value
Aruba,"Government expenditure on education, total (% of GDP)",1960,
Africa Eastern and Southern,"Government expenditure on education, total (% of GDP)",1960,
Afghanistan,"Government expenditure on education, total (% of GDP)",1960,
Africa Western and Central,"Government expenditure on education, total (% of GDP)",1960,
Angola,"Government expenditure on education, total (% of GDP)",1960,
Albania,"Government expenditure on education, total (% of GDP)",1960,
Andorra,"Government expenditure on education, total (% of GDP)",1960,
Arab World,"Government expenditure on education, total (% of GDP)",1960,
United Arab Emirates,"Government expenditure on education, total (% of GDP)",1960,
Argentina,"Government expenditure on education, total (% of GDP)",1960,


In [0]:
# Table (4): data trends on a per-region basis.
# The fact table is inner-joined to the country dimension and restricted to
# the rows flagged with isRegion == True; the indicator and date dimensions
# are then joined in to supply the indicator name and the year.
dataRegionTable = (
    factTable
    .join(
        dimTableCountry,
        on=factTable.regionCode == dimTableCountry.regionCode,
        how="inner",
    )
    .filter(dimTableCountry.isRegion == True)
    .join(
        dimTableIndicator,
        on=factTable.indicatorCode == dimTableIndicator.indicatorCode,
        how="inner",
    )
    .join(
        dimTableDate,
        on=factTable.dateYear == dimTableDate.dateYear,
        how="inner",
    )
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(dataRegionTable)

countryName,indicatorName,year,value
Africa Eastern and Southern,Forest area (% of land area),1988,
Africa Western and Central,Forest area (% of land area),1988,
Arab World,Forest area (% of land area),1988,
Central Europe and the Baltics,Forest area (% of land area),1988,
Caribbean small states,Forest area (% of land area),1988,
East Asia & Pacific,Forest area (% of land area),1988,
Europe & Central Asia,Forest area (% of land area),1988,
Euro area,Forest area (% of land area),1988,
European Union,Forest area (% of land area),1988,
Latin America & Caribbean,Forest area (% of land area),1988,


In [0]:
# Table (5): data on a per-decade basis.
# Each dimension table is broadcast-hinted and inner-joined to the fact table.
# Rows with a missing value are dropped, and the remaining values are averaged
# per (country, indicator, decade); the average is exposed under the column
# name "value".
from pyspark.sql.types import DoubleType, StringType, StructField, StructType
from pyspark.sql.functions import avg, broadcast, col

decadeTable = (
    factTable
    .join(
        broadcast(dimTableCountry),
        on=factTable.regionCode == dimTableCountry.regionCode,
        how="inner",
    )
    .join(
        broadcast(dimTableIndicator),
        on=factTable.indicatorCode == dimTableIndicator.indicatorCode,
        how="inner",
    )
    .join(
        broadcast(dimTableDate),
        on=factTable.dateYear == dimTableDate.dateYear,
        how="inner",
    )
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear,
        dimTableDate.dateDecade,
        factTable.value,
    )
    # Drop rows without a value before aggregating.
    .dropna(subset=["value"])
    .groupBy("countryName", "indicatorName", "dateDecade")
    .agg(avg("value").alias("value"))
)
display(decadeTable)

countryName,indicatorName,dateDecade,value
Africa Western and Central,Forest area (% of land area),1990s,22.245177066000004
Bangladesh,Forest area (% of land area),1990s,14.752470923
Sub-Saharan Africa,Forest area (% of land area),1990s,32.42668298400001
Suriname,Forest area (% of land area),1990s,98.46851282
South Africa,Forest area (% of land area),1990s,14.82024417
East Asia & Pacific,Forest area (% of land area),2000s,26.382093042
Sao Tome and Principe,Forest area (% of land area),2000s,60.669791667000005
"Egypt, Arab Rep.",Forest area (% of land area),2010s,0.0524486411
"Venezuela, RB",GDP growth (annual %),1960s,4.811778324111111
Panama,GDP growth (annual %),1980s,2.0741226671999997


In [0]:
# Finally, save the tables!
# Each prepared DataFrame is written under its table name in "overwrite" mode,
# in the same order as the tables were built above.
for savedName, preparedTable in (
    ("ukTable", ukTable),
    ("2012Table", table2012),
    ("educationTable", educationTable),
    ("dataRegionTable", dataRegionTable),
    ("decadeTable", decadeTable),
):
    preparedTable.write.mode("overwrite").saveAsTable(savedName)