# 06. Conference Top Institutions  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from CreateFunctions_mag import *
import pyspark.sql.functions as sf

In [None]:
# Root URI of the MAG data; handed to every table-loading helper in this notebook.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"

# Directory prefix under which the result CSV is written.
outputDir = "/output/jiaxin/pyspark/"

# Value matched against the NormalizedName column of the conference series table.
conferenceShortName = "WWW"

# Number of institutions kept in the final ranking.
conferenceTopInstitutionsCount = 20

# URL prefix; each AffiliationId is appended to it to build the DetailsUrl column.
maDetailPagePrefix = "https://academic.microsoft.com/#/detail/"

In [None]:
# Reuse the active SparkContext if there is one (otherwise create it), then
# wrap it in a SparkSession so the DataFrame API can be used.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)

In [None]:
# Find the conference series id by matching the conference short name.
# The loaded table is bound to its own name: assigning it back to
# `ConferenceSeries` would replace the loader callable with its result, and
# re-running this cell would then fail because that result is not callable.
conferenceSeries = ConferenceSeries(rootpath, spark)

# Result has a single column, ConferenceSeriesId, used as the join key for papers.
targetConferenceSeriesId = (
    conferenceSeries
    .where(sf.col("NormalizedName").isin(conferenceShortName))
    .select("ConferenceSeriesId")
)


# Get all papers of the target conference by joining on the conference series id.
# The loaded table is bound to its own name: assigning it back to `Papers`
# would replace the loader callable with its result, and re-running this cell
# would then fail because that result is not callable.
papers = Papers(rootpath, spark)

# Inner join keeps only papers whose ConferenceSeriesId matches the target.
conferencePapers = (
    papers
    .join(targetConferenceSeriesId, on="ConferenceSeriesId", how="inner")
    .select("PaperId", "CitationCount", "ConferenceSeriesId")
)


# Get all [paper] -> [affiliation] relationships from [paper] -> [author, affiliation].
# The loaded table is bound to its own name: assigning it back to
# `PaperAuthorAffiliations` would replace the loader callable with its result,
# and re-running this cell would then fail because that result is not callable.
paperAuthorAffiliations = PaperAuthorAffiliations(rootpath, spark)

# Rows without an affiliation are dropped. The author column is projected away,
# so the same (PaperId, AffiliationId) pair may appear more than once here.
paperAffiliation = (
    paperAuthorAffiliations
    .where(sf.col("AffiliationId").isNotNull())
    .select("PaperId", "AffiliationId")
)


# Build the distinct [conference paper] -> [affiliation] relationships and
# attach each affiliation's display name as InstitutionName.
Affiliation = Affiliations(rootpath, spark)

# Grouping by (PaperId, AffiliationId) collapses repeated pairs into one row.
# avg() carries CitationCount through the grouping (presumably it is a single
# value per paper, so the average equals that value); note avg() yields a double.
conferenceAffiliationsPapers = (
    conferencePapers.join(paperAffiliation, on="PaperId", how="inner")
    .groupby("PaperId", "AffiliationId")
    .agg(sf.avg("CitationCount").alias("CitationCount"))
    .join(
        Affiliation.select("AffiliationId", "DisplayName").distinct(),
        on="AffiliationId",
        how="inner",
    )
    .selectExpr("DisplayName as InstitutionName", "AffiliationId", "PaperId", "CitationCount")
)

    
# Get the top institutions by their all-time citation count.
# Each aggregate is aliased explicitly instead of being renamed afterwards from
# Spark's auto-generated column names ("count(1)", "sum(CitationCount)"):
# withColumnRenamed does nothing when the source name is absent, so a naming
# difference would otherwise surface only later as a missing column.
conferenceTopInstitutions = (
    conferenceAffiliationsPapers
    .groupby("AffiliationId", "InstitutionName")
    .agg(
        sf.sum("CitationCount").alias("CitationCount"),
        # One input row per (paper, affiliation), so the row count is the paper count.
        sf.count("*").alias("PublicationCount"),
    )
    .withColumn("DetailsUrl", sf.concat(sf.lit(maDetailPagePrefix), "AffiliationId"))
    .orderBy("CitationCount", ascending=False)
    .limit(conferenceTopInstitutionsCount)
)
    

# Add two ranks, one by citations and one by publications, so they can be
# shown on the x and y axes for comparison.
# Neither window is partitioned, so each rank spans every row of the DataFrame.
citationRankWindow = Window.orderBy(sf.desc("CitationCount"))
publicationRankWindow = Window.orderBy(sf.desc("PublicationCount"))

conferenceTopInstitutions = conferenceTopInstitutions.select(
    "InstitutionName",
    "DetailsUrl",
    "PublicationCount",
    "CitationCount",
    sf.rank().over(citationRankWindow).alias("InstitutionCitationRank"),
    sf.rank().over(publicationRankWindow).alias("InstitutionPublicationRank"),
)


# Save the ranking as CSV with a header row, replacing any earlier output at this path.
(
    conferenceTopInstitutions.write
    .mode('overwrite')
    .option('header', 'true')
    .csv(outputDir + "conferenceTopInstitutions.csv")
)

In [None]:
# Stop the SparkContext obtained earlier in this notebook.
sc.stop()