In [0]:
# Goal of this notebook: prepare five tables for later analysis.
#   (1) Data about the United Kingdom specifically
#   (2) Data from 2012 specifically
#   (3) Education trends on a per-country basis
#   (4) Data trends on a per-region basis
#   (5) Data trends on a per-decade basis
#
# Step one: load the three dimension tables and the fact table from the
# default database. The names on the left are reused by every later cell.
dimTableCountry, dimTableDate, dimTableIndicator, factTable = (
    spark.table(sourceName)
    for sourceName in (
        "dimensiontablecountrycode",
        "dimensiontabledate",
        "dimensiontableindicatorcode",
        "facttable",
    )
)

In [0]:
# Table (1): data about the United Kingdom only.
# The fact table is joined to the country dimension and restricted to
# regionCode "GBR" before the date and indicator dimensions are attached.
# Output columns: indicatorName, year, value.
ukTable = (
    factTable
    .join(dimTableCountry, on=factTable.regionCode == dimTableCountry.regionCode, how="inner")
    .where(dimTableCountry.regionCode == "GBR")
    .join(dimTableDate, on=factTable.dateYear == dimTableDate.dateYear, how="inner")
    .join(dimTableIndicator, on=factTable.indicatorCode == dimTableIndicator.indicatorCode, how="inner")
    .select(
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(ukTable)

In [0]:
# Table (2): data from the year 2012 only.
# The fact table is joined to the date dimension and restricted to
# dateYear "2012" before the country and indicator dimensions are attached.
# Output columns: countryName, indicatorName, value.
table2012 = (
    factTable
    .join(dimTableDate, on=factTable.dateYear == dimTableDate.dateYear, how="inner")
    .where(dimTableDate.dateYear == "2012")
    .join(dimTableCountry, on=factTable.regionCode == dimTableCountry.regionCode, how="inner")
    .join(dimTableIndicator, on=factTable.indicatorCode == dimTableIndicator.indicatorCode, how="inner")
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        factTable.value,
    )
)
display(table2012)

In [0]:
# Table (3): education trends on a per-country basis.
# Rows are kept when the indicator code matches the LIKE pattern '%SE%'.
# NOTE(review): '%SE%' matches "SE" anywhere in the code, not only as a
# prefix - confirm this is the intended definition of an education indicator.
# Output columns: countryName, indicatorName, year, value.
educationTable = (
    factTable
    .join(dimTableIndicator, on=factTable.indicatorCode == dimTableIndicator.indicatorCode, how="inner")
    .where(dimTableIndicator.indicatorCode.like('%SE%'))
    .join(dimTableCountry, on=factTable.regionCode == dimTableCountry.regionCode, how="inner")
    .join(dimTableDate, on=factTable.dateYear == dimTableDate.dateYear, how="inner")
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(educationTable)

In [0]:
# Table (4): data trends on a per-region basis.
# Only rows whose country-dimension entry has isRegion equal to True are kept.
# Output columns: countryName, indicatorName, year, value.
dataRegionTable = (
    factTable
    .join(dimTableCountry, on=factTable.regionCode == dimTableCountry.regionCode, how="inner")
    .where(dimTableCountry.isRegion == True)
    .join(dimTableIndicator, on=factTable.indicatorCode == dimTableIndicator.indicatorCode, how="inner")
    .join(dimTableDate, on=factTable.dateYear == dimTableDate.dateYear, how="inner")
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateYear.alias("year"),
        factTable.value,
    )
)
display(dataRegionTable)

In [0]:
# Create table (5) - Data on a per-decade basis
from pyspark.sql.types import DoubleType, StringType, StructField, StructType
from pyspark.sql.functions import avg, broadcast, col
# Build the per-decade table in a single chain:
#   1. Join the fact table to its three dimensions. Each dimension is wrapped
#      in broadcast() as a hint that it should be broadcast for the join.
#   2. Project only the columns the aggregation needs. dateYear is not
#      selected because the grouping below uses dateDecade only.
#   3. Drop rows with a null value before grouping, so a group whose values
#      are all null is omitted instead of appearing with a null average.
#   4. Average value per (countryName, indicatorName, dateDecade) and name the
#      result column "value" directly, instead of renaming "avg(value)" later.
# Output columns: countryName, indicatorName, dateDecade, value.
decadeTable = (
    factTable
    .join(broadcast(dimTableCountry), factTable.regionCode == dimTableCountry.regionCode, 'inner')
    .join(broadcast(dimTableIndicator), factTable.indicatorCode == dimTableIndicator.indicatorCode, 'inner')
    .join(broadcast(dimTableDate), factTable.dateYear == dimTableDate.dateYear, 'inner')
    .select(
        dimTableCountry.countryName,
        dimTableIndicator.indicatorName,
        dimTableDate.dateDecade,
        factTable.value,
    )
    .na.drop(subset=["value"])
    .groupBy("countryName", "indicatorName", "dateDecade")
    .agg(avg("value").alias("value"))
)
display(decadeTable)

In [0]:
# Final step: persist every prepared DataFrame with saveAsTable.
# mode("overwrite") replaces any existing table of the same name, so this
# cell can be re-run.
for savedName, preparedTable in [
    ("ukTable", ukTable),
    ("2012Table", table2012),
    ("educationTable", educationTable),
    ("dataRegionTable", dataRegionTable),
    ("decadeTable", decadeTable),
]:
    preparedTable.write.mode("overwrite").saveAsTable(savedName)