# 09. Conference Top Citing Venues       

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from CreateFunctions_mag import *
import pyspark.sql.functions as sf

In [None]:
# ---- Notebook parameters -------------------------------------------------

# Short name of the conference series to analyse; it is matched against the
# NormalizedName column of the conference series table further down.
conferenceShortName = "WWW"

# Location passed to every MAG table loader call in this notebook.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"

# Prefix prepended to the name of every CSV this notebook writes.
outputDir = "/output/jiaxin/pyspark/"

# Not referenced by any other cell of this notebook.
conferenceAnalyticsBaseDir = "/output/conferenceAnalytics/"

In [None]:
# Reuse the SparkContext that is already running (or create one if none exists),
# then wrap it in a SparkSession; `spark` is what the table loaders below receive.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [None]:
# Build a single venue lookup table (conference series + journals) and write it
# out, so the visualization can join against it to get venue information.

# NOTE: both calls rebind the loader name to the table it returns; later cells
# rely on `ConferenceSeries` being the loaded table, not the loader function.
ConferenceSeries = ConferenceSeries(rootpath, spark)
Journals = Journals(rootpath, spark)

# Project both sources onto the same three columns so they can be unioned.
ConferenceSeries1 = ConferenceSeries.select(
    sf.col("ConferenceSeriesId").alias("VenueId"),
    sf.col("NormalizedName").alias("VenueShortName"),
    sf.col("DisplayName").alias("VenueName"),
)

Journals1 = Journals.select(
    sf.col("JournalId").alias("VenueId"),
    sf.col("NormalizedName").alias("VenueShortName"),
    sf.col("DisplayName").alias("VenueName"),
)

# union() matches columns by position, hence the identical ordering above.
venues = ConferenceSeries1.union(Journals1)

venues.write.csv(outputDir + "venues.csv", mode='overwrite', header='true')

In [None]:
# First find the conference series id by matching the conference short name.
# isin() is kept (rather than ==) so a list of short names would also work.
targetConferenceSeriesId = ConferenceSeries \
    .where(sf.col("NormalizedName").isin(conferenceShortName)) \
    .select("ConferenceSeriesId")


# Get all conference papers by conference series id.
Papers = Papers(rootpath, spark)

conferencePapers = Papers.join(targetConferenceSeriesId, "ConferenceSeriesId", 'inner') \
    .selectExpr("PaperId", "Year", "ConferenceSeriesId as VenueId")


# Get relationships between [citing paper] -> [conference paper].
PaperReference = PaperReferences(rootpath, spark)

# In the MAG PaperReferences table a row means "PaperId cites PaperReferenceId"
# (NOTE(review): per the published MAG schema; confirm against CreateFunctions_mag).
# The conference paper is therefore the *referenced* side and the citing paper
# is PaperId. Joining on PaperId, as this cell previously did, returned the
# papers the conference cites instead of the papers that cite the conference.
citingEdges = PaperReference.selectExpr("PaperId as CitationId", "PaperReferenceId as PaperId")

conferencePaperCitations = conferencePapers.join(citingEdges, "PaperId", 'inner')

# Venue of every paper: the conference series id when present, otherwise the journal id.
Papers1 = Papers.withColumn("CitationVenueId", sf.coalesce("ConferenceSeriesId", "JournalId")) \
    .selectExpr("PaperId as CitationId", "CitationVenueId")

# Attach the citing paper's venue to each citation edge.
conferencePaperCitations = Papers1.join(conferencePaperCitations, "CitationId", 'inner') \
    .select("PaperId", "Year", "VenueId", "CitationId", "CitationVenueId")


# Count the number of citations between citing venue and conference.
# Keep VenueId and CitationVenueId such that the visualization can join directly
# with the venue table to get venue names.
# The count is aliased directly instead of renaming Spark's auto-generated
# "count(1)" column, which would silently do nothing if that name ever differed.
conferencePaperCitationsByVenue = conferencePaperCitations.groupby("VenueId", "CitationVenueId") \
    .agg(sf.count("*").alias("CitationCount"))

conferencePaperCitationsByVenue.write.csv(outputDir + "conferencePaperCitationsByVenue.csv", mode='overwrite', header='true')

In [None]:
# For each year of the conference, count the number of citations per citing venue.
# CitationVenueId is kept so the visualization can join directly with the venue
# table to get venue names.
# NOTE(review): VenueId is not part of this grouping, so the output has only
# CitationVenueId, Year and CitationCount; rows are not split by target venue.
# The count is aliased directly instead of renaming Spark's auto-generated
# "count(1)" column, which would silently do nothing if that name ever differed.
conferencePaperCitationsByYearCitationVenue = conferencePaperCitations.groupby("CitationVenueId", "Year") \
    .agg(sf.count("*").alias("CitationCount"))

conferencePaperCitationsByYearCitationVenue.write.csv(outputDir + "conferencePaperCitationsByYearCitationVenue.csv", mode='overwrite', header='true')

In [None]:
# Shut down the SparkContext now that all outputs have been written.
sc.stop()