In [0]:
# Based on the PowerBI analysis, there are 4 additional questions I'd like to explore.
# (1) Data appears to be a bit incomplete! What's the distribution of incompleteness per country, year and indicator?
# (2) For the UK table, can we calculate the rolling average over N years for each indicator?
# (3) For the 2012 table, can we calculate which countries have the min/max value for each indicator?
# (4) For the Education table, can we calculate which countries have the worst gender split?
#
# Firstly, get the tables from the default database:
from pyspark.sql.functions import col
# Look up the three dimension tables and the fact table by name in a single
# pass; the order of the names matches the order of the targets on the left.
dimTableCountry, dimTableDate, dimTableIndicator, factTable = (
    spark.table(tableName)
    for tableName in (
        "dimensiontablecountrycode",
        "dimensiontabledate",
        "dimensiontableindicatorcode",
        "facttable",
    )
)

In [0]:
# Explore question (1) - what's the distribution of incompleteness per country, year and indicator?
# The quickest possible way is to use the dbutils.data.summarize command,
# applied here to the whole, unfiltered fact table.
dbutils.data.summarize(factTable)

In [0]:
# However, we can filter more: pin two of the three dimensions (indicator,
# year, country) to a single value and leave the third one free.
myCountryCode = "GBR"
myIndicatorCode = "AG.LND.AGRI.ZS"
myYearCode = "1984"

# Join conditions linking the fact table to each dimension table.
indicatorJoinCond = factTable.indicatorCode == dimTableIndicator.indicatorCode
dateJoinCond = factTable.dateYear == dimTableDate.dateYear
countryJoinCond = factTable.regionCode == dimTableCountry.regionCode

# Indicator and country are fixed; the year is left free.
filterTableYearValue = (
    factTable
    .join(dimTableIndicator, indicatorJoinCond, 'inner')
    .filter(dimTableIndicator.indicatorCode == myIndicatorCode)
    .join(dimTableDate, dateJoinCond, 'inner')
    .join(dimTableCountry, countryJoinCond, 'inner')
    .filter(dimTableCountry.regionCode == myCountryCode)
)

# Year and country are fixed; the indicator is left free.
filterTableIndicatorValue = (
    factTable
    .join(dimTableIndicator, indicatorJoinCond, 'inner')
    .join(dimTableDate, dateJoinCond, 'inner')
    .filter(dimTableDate.dateYear == myYearCode)
    .join(dimTableCountry, countryJoinCond, 'inner')
    .filter(dimTableCountry.regionCode == myCountryCode)
)

# Indicator and year are fixed; the country is left free.
filterTableCountryValue = (
    factTable
    .join(dimTableIndicator, indicatorJoinCond, 'inner')
    .filter(dimTableIndicator.indicatorCode == myIndicatorCode)
    .join(dimTableDate, dateJoinCond, 'inner')
    .filter(dimTableDate.dateYear == myYearCode)
    .join(dimTableCountry, countryJoinCond, 'inner')
)


In [0]:
# Summarise the value column per year for the chosen indicator and country.
# The joined frame carries two columns named dateYear (one from factTable and
# one from dimTableDate, which were joined on equality of that column), so
# looking the column up by name on the joined frame is ambiguous. Select the
# fact table's own column instead; the inner join guarantees both copies hold
# the same value on every row.
yearValueColumns = filterTableYearValue.select(factTable["dateYear"], col("value"))
dbutils.data.summarize(yearValueColumns)

In [0]:
# Summarise the value column per indicator name for the chosen year and country.
indicatorValueColumns = filterTableIndicatorValue.select(
    filterTableIndicatorValue["indicatorName"],
    col("value"),
)
dbutils.data.summarize(indicatorValueColumns)

In [0]:
# Summarise the value column per country name for the chosen indicator and year.
countryValueColumns = filterTableCountryValue.select(
    filterTableCountryValue["countryName"],
    col("value"),
)
dbutils.data.summarize(countryValueColumns)

In [0]:
# Explore question (2) - For the UK table, can we calculate the rolling average over N years for each indicator?
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, row_number

# Load the UK table, discarding every row that has a null in any column.
ukTable = spark.table("uktable").na.drop("any")

# N: the number of years covered by each rolling window, current year included.
rollingAvgYear = 9

# One window per indicator, ordered by the numeric year. rangeBetween is
# inclusive at both ends, so an N-year window ending at the current year is
# [year - (N - 1), year]; using -N as the lower bound would span N + 1 years.
windowSpec = (
    Window.partitionBy("indicatorName")
    .orderBy(col("year").cast('long'))
    .rangeBetween(-(rollingAvgYear - 1), 0)
)

# Attach the rolling average to every row and keep only the columns of interest.
windowDf = (
    ukTable
    .withColumn("avg", avg(col("value")).over(windowSpec))
    .select(col("indicatorName"), col("avg"), col("year"))
)
display(windowDf)

In [0]:
# Explore question (3) - For the 2012 table, can we calculate which countries have the min/max value for each indicator?
from pyspark.sql.functions import min, max
# Load the 2012 table, discarding every row that has a null in any column.
table2012 = spark.table("2012table").na.drop("any")

# Smallest and largest value per indicator (min/max here are the
# pyspark.sql.functions aggregates imported in this cell, not the builtins).
# The grouping column is renamed so it has a distinct name from
# table2012.indicatorName in the join below.
aggTable = (
    table2012
    .groupBy("indicatorName")
    .agg(min("value").alias("minValue"), max("value").alias("maxValue"))
    .withColumnRenamed("indicatorName", "indicator")
)

# Attach each indicator's extremes to its rows once, then derive both result
# tables from that single join instead of building the same join twice.
withExtremes = table2012.join(aggTable, table2012.indicatorName == aggTable.indicator)
resultColumns = [table2012.countryName, aggTable.indicator, table2012.value]

# Rows whose value equals the indicator's minimum / maximum. Ties yield one
# row per tied country.
minTable = withExtremes.filter(table2012.value == aggTable.minValue).select(*resultColumns)
maxTable = withExtremes.filter(table2012.value == aggTable.maxValue).select(*resultColumns)

# The question asks for both extremes, so show both tables (previously only
# maxTable was displayed and minTable was computed but never used).
display(minTable)
display(maxTable)

In [0]:
# Explore question (4) - For the Education table, can we calculate which country has the worst gender split for literacy rates in a given year?
from pyspark.sql.functions import min, max
# Raw education rows, kept under their own name so the join condition below
# can reference the un-joined frame.
educationRaw = spark.table("educationTable")

# Keep only rows whose indicator code matches the literacy-rate pattern, then
# reduce to the four columns used below and drop rows with any null.
literacyCodePattern = dimTableIndicator.indicatorCode.like('SE.ADT.LITR.%.ZS')
educationTable = (
    educationRaw
    .join(dimTableIndicator, educationRaw.indicatorName == dimTableIndicator.indicatorName)
    .filter(literacyCodePattern)
    .select(col("countryName"), dimTableIndicator.indicatorName, col("year"), col("value"))
    .na.drop("any")
)

# Lowest and highest matching value per (country, year) ...
valueRange = educationTable.groupBy(['countryName', 'year']).agg(
    min("value").alias("minValue"),
    max("value").alias("maxValue"),
)
# ... and the gap between them, reported as the gender split.
genderSplitTable = valueRange.withColumn(
    'genderSplit', (valueRange['maxValue'] - valueRange['minValue'])
).select(col("countryName"), col("year"), col("genderSplit"))

# Smallest and largest split observed in each year.
genderSplitMinMax = genderSplitTable.groupBy("year").agg(
    min("genderSplit").alias("minGenderSplit"),
    max("genderSplit").alias("maxGenderSplit"),
)

# Countries whose split equals the yearly minimum.
genderSplitMinTable = (
    genderSplitMinMax
    .join(genderSplitTable, genderSplitMinMax.year == genderSplitTable.year)
    .filter(genderSplitTable.genderSplit == genderSplitMinMax.minGenderSplit)
    .select(col("countryName"), genderSplitMinMax.year, genderSplitMinMax.minGenderSplit)
)

# Countries whose split equals the yearly maximum.
genderSplitMaxTable = (
    genderSplitMinMax
    .join(genderSplitTable, genderSplitMinMax.year == genderSplitTable.year)
    .filter(genderSplitTable.genderSplit == genderSplitMinMax.maxGenderSplit)
    .select(col("countryName"), genderSplitMinMax.year, genderSplitMinMax.maxGenderSplit)
)
display(genderSplitMaxTable)