# 06. Conference Top Institutions  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# ---- Notebook configuration ----
# Root of the MAG snapshot in Azure Blob Storage; dataset file names are appended to it.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Directory reserved for this notebook's output.
# NOTE(review): not referenced anywhere else in the visible code.
outputDir = "/output/user99/pyspark/"
# Conference to analyze; compared against ConferenceSeries.NormalizedName.
conferenceShortName = "WWW"
# How many institutions the final ranking keeps.
conferenceTopInstitutionsCount = 20

In [None]:
# Reuse the running SparkContext (or start one) and wrap it in a SparkSession
# so the DataFrame API is available; `sc` is the very context the session wraps.
spark = SparkSession(SparkContext.getOrCreate())
sc = spark.sparkContext

In [None]:
# Look up the conference series id: keep only the ConferenceSeries row(s) whose
# NormalizedName equals the configured short name, then project the id column.
isTargetConference = sf.col("NormalizedName").isin(conferenceShortName)
targetConferenceSeriesId = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(
        "ConferenceSeriesId",
        "Rank",
        "NormalizedName",
        "DisplayName",
        "PaperCount",
        "CitationCount",
        "CreatedDate",
    )
    .filter(isTargetConference)
    .select("ConferenceSeriesId")
)


# Load the Papers table; the file has no header row, so column names are
# assigned positionally.
paperColumns = [
    "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle",
    "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId",
    "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
    "EstimatedCitationCount", "CreatedDate",
]
Papers = spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t").toDF(*paperColumns)

# Restrict to papers of the target conference series via an inner join on the
# series id, keeping only the columns the later steps use.
conferencePapers = (
    Papers.join(targetConferenceSeriesId, on="ConferenceSeriesId", how="inner")
    .select("PaperId", "CitationCount", "ConferenceSeriesId")
)


# Get all [paper] -> [affiliation] relationships from [paper] -> [author, affiliation].
# Rows with no AffiliationId are dropped: they cannot be attributed to an institution.
# PySpark's Column has no isna() method (unknown attributes on a Column are treated
# as a field lookup, so calling the result raises TypeError); isNotNull() is the
# Column-level null test.
paperAffiliation = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .where(sf.col("AffiliationId").isNotNull())
    .select("PaperId", "AffiliationId")
)


# Get all distinct [conference paper] -> [affiliation] relationships.
# Each paper may have multiple authors from the same institution, so the
# (PaperId, AffiliationId) pairs are de-duplicated to avoid double counting.
# CitationCount is a string here (the CSV is loaded without a schema), so it is
# cast to long before any arithmetic is done on it.
conferencePaperAffiliations = (
    conferencePapers.join(paperAffiliation, "PaperId", "inner")
    .select(
        "PaperId",
        "AffiliationId",
        sf.col("CitationCount").cast("long").alias("CitationCount"),
    )
    .dropDuplicates(["PaperId", "AffiliationId"])
)

# Institution display names. Columns are picked by position so the load does not
# depend on the total column count of the file.
# NOTE(review): assumes the table lives in "Affiliations.txt" with AffiliationId in
# the 1st column and DisplayName in the 4th — confirm against the MAG schema.
affiliationNames = (
    spark.read.load(rootpath + "Affiliations.txt", format="csv", sep="\t")
    .select(
        sf.col("_c0").alias("AffiliationId"),
        sf.col("_c3").alias("InstitutionName"),
    )
)

conferenceAffiliationsPapers = conferencePaperAffiliations.join(
    affiliationNames, "AffiliationId", "inner"
)


# Prefix used to build each institution's details URL.
# NOTE(review): the original draft referenced maDetailPagePrefix without defining
# it; this value is an assumption — confirm the intended URL.
maDetailPagePrefix = "https://academic.microsoft.com/#/detail/"

# Get top institutions by all-time citation count: one row per affiliation with
# its publication count and summed citations, keeping only the top
# conferenceTopInstitutionsCount rows.
conferenceTopInstitutions = (
    conferenceAffiliationsPapers.groupBy("AffiliationId")
    .agg(
        # Only one name exists per group since AffiliationId is the key.
        sf.first("InstitutionName").alias("InstitutionName"),
        sf.count("*").alias("PublicationCount"),
        sf.sum("CitationCount").alias("CitationCount"),
    )
    .orderBy(sf.desc("CitationCount"))
    .limit(conferenceTopInstitutionsCount)
    .select(
        "InstitutionName",
        sf.concat(sf.lit(maDetailPagePrefix), sf.col("AffiliationId")).alias("DetailsUrl"),
        "PublicationCount",
        "CitationCount",
    )
)


# Create two ranks, by citation and by publication count, to display on the
# x and y axes for comparison. The windows are unpartitioned, which is fine for
# the handful of rows left after the limit above.
citationRankWindow = Window.orderBy(sf.desc("CitationCount"))
publicationRankWindow = Window.orderBy(sf.desc("PublicationCount"))

conferenceTopInstitutions = (
    conferenceTopInstitutions
    .withColumn("InstitutionCitationRank", sf.rank().over(citationRankWindow))
    .withColumn("InstitutionPublicationRank", sf.rank().over(publicationRankWindow))
)

# NOTE(review): nothing in this cell triggers a Spark action; call .show() or
# write the DataFrame out to materialize the result.

In [None]:
# Shut down the SparkContext held in `sc`; no further Spark work can run after this.
sc.stop()