# 08. Conference Top Referenced Venues     

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from CreateFunctions_mag import *
import pyspark.sql.functions as sf

In [None]:
# --- Input ---------------------------------------------------------------
# Base location handed to the table loaders below as their first argument.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'

# --- Analysis target -----------------------------------------------------
# Short name of the conference series to analyse; it is matched against the
# NormalizedName column of the conference series table.
conferenceShortName = 'WWW'

# --- Output --------------------------------------------------------------
# Prefix for every CSV written by this notebook. File names are appended by
# plain string concatenation, so the trailing slash is required.
outputDir = '/output/jiaxin/pyspark/'
# Not referenced by the cells visible in this notebook.
conferenceAnalyticsBaseDir = '/output/conferenceAnalytics/'

In [None]:
# Reuse the SparkContext of the running kernel if there is one, otherwise
# create it.
sc = SparkContext.getOrCreate()

# Obtain the session through the builder rather than calling the SparkSession
# constructor directly: getOrCreate() returns the already-active session when
# this cell is re-run and is the documented way to get a session. It attaches
# to the SparkContext obtained above.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Build a single venue lookup table (conference series + journals) and export
# it, so the visualization can join against it to get venue information.

# NOTE: the assignments to `ConferenceSeries` and `Journals` rebind the very
# names that are being called (presumably loaders from CreateFunctions_mag).
# Later cells use `ConferenceSeries` as the loaded table, so the rebinding is
# kept as is.
ConferenceSeries = ConferenceSeries(rootpath, spark)
ConferenceSeries1 = ConferenceSeries.select(
    sf.col("ConferenceSeriesId").alias("VenueId"),
    sf.col("NormalizedName").alias("VenueShortName"),
    sf.col("DisplayName").alias("VenueName"),
)

Journals = Journals(rootpath, spark)
Journals1 = Journals.select(
    sf.col("JournalId").alias("VenueId"),
    sf.col("NormalizedName").alias("VenueShortName"),
    sf.col("DisplayName").alias("VenueName"),
)

# Both projections share the same three columns in the same order, which is
# what the positional union relies on.
venues = ConferenceSeries1.union(Journals1)

venues.write \
    .mode('overwrite') \
    .option('header', 'true') \
    .csv(outputDir + "venues.csv")

In [None]:
# Count, for the target conference, how many references go to each venue.

# First find the conference series id by matching the conference short name.
targetConferenceSeriesId = ConferenceSeries \
    .where(sf.col("NormalizedName").isin(conferenceShortName)) \
    .select("ConferenceSeriesId")


# Get all conference papers by conference series id.
# NOTE: this rebinds `Papers` from the callable to the value it returns.
Papers = Papers(rootpath, spark)

conferencePapers = Papers \
    .join(targetConferenceSeriesId, "ConferenceSeriesId", 'inner') \
    .selectExpr("PaperId", "Year", "ConferenceSeriesId as VenueId")


# Get all references of the form [Conference Paper] -> [Other Paper].
PaperReference = PaperReferences(rootpath, spark)

conferencePaperReferences = conferencePapers.join(PaperReference, "PaperId", 'inner')

# A single paper may be published in a conference and later in a journal.
# Use the conference as its venue if present; otherwise fall back to the journal.
Papers1 = Papers \
    .withColumn("ReferenceVenueId", sf.coalesce("ConferenceSeriesId", "JournalId")) \
    .selectExpr("PaperId as ReferenceId", "ReferenceVenueId")

# Attach the venue of each referenced paper to the reference pairs.
conferencePaperReferences = Papers1 \
    .join(conferencePaperReferences,
          Papers1.ReferenceId == conferencePaperReferences.PaperReferenceId,
          'inner') \
    .selectExpr("PaperId", "Year", "VenueId", "ReferenceId", "ReferenceVenueId")


# Count the number of references between the conference and each referenced venue.
# The count column is named explicitly with alias() instead of renaming Spark's
# auto-generated "count(1)" afterwards: withColumnRenamed() silently does
# nothing when the generated name differs, which would leave a wrong header.
conferencePaperReferenceByVenues = conferencePaperReferences \
    .groupby("VenueId", "ReferenceVenueId") \
    .agg(sf.count("*").alias("ReferenceCount"))

conferencePaperReferenceByVenues.write.csv(outputDir + "conferencePaperReferenceByVenues.csv", mode='overwrite', header='true')

In [None]:
# For each year of the conference, count the number of references between the
# conference and each referenced venue.
# The count column is named explicitly with alias() rather than by renaming
# Spark's auto-generated "count(1)" column afterwards.
conferencePaperReferenceByYearsReferenceVenues = conferencePaperReferences \
    .groupby("Year", "ReferenceVenueId") \
    .agg(sf.count("*").alias("ReferenceCount"))

# Persist the per-year counts. Without this write the aggregation above is
# never materialized, since nothing else in this cell consumes it.
conferencePaperReferenceByYearsReferenceVenues.write.csv(
    outputDir + "conferencePaperReferenceByYearsReferenceVenues.csv",
    mode='overwrite',
    header='true',
)

# Also export the un-aggregated paper -> referenced paper pairs.
conferencePaperReferences.write.csv(outputDir + "conferencePaperReferences.csv", mode='overwrite', header='true')

In [None]:
# Shut down the SparkContext (`sc`) that was obtained earlier in this notebook.
sc.stop()