# 08. Conference Top Referenced Venues     

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Notebook configuration.
# Root location of the MAG (Microsoft Academic Graph) input files.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Directory that every CSV produced by this notebook is written to.
outputDir = "/output/jiaxin/pyspark/"
# Short (normalized) name of the conference series to analyze.
conferenceShortName = "WWW"
# Base directory for conference analytics output (not referenced by the cells visible here).
conferenceAnalyticsBaseDir = "/output/conferenceAnalytics/"

In [None]:
# Reuse the running SparkContext if the notebook kernel already has one,
# otherwise create it, then wrap it in a SparkSession for the DataFrame API.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)

In [None]:
# Build one venue lookup table (conference series + journals) that the
# visualization can join against to resolve a venue id to its names.

# Column layouts of the headerless, tab-separated input files.
conferenceSeriesColumns = [
    "ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName",
    "PaperCount", "CitationCount", "CreatedDate",
]
journalColumns = [
    "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
    "Publisher", "Webpage", "PaperCount", "CitationCount", "CreatedDate",
]

# NOTE: after this statement ConferenceSeries only has the projected columns
# (VenueId, VenueShortName, VenueName), not the raw file columns.
ConferenceSeries = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(*conferenceSeriesColumns)
    .select(
        sf.col("ConferenceSeriesId").alias("VenueId"),
        sf.col("NormalizedName").alias("VenueShortName"),
        sf.col("DisplayName").alias("VenueName"),
    )
)

Journals = (
    spark.read.load(rootpath + "Journals.txt", format="csv", sep="\t")
    .toDF(*journalColumns)
    .select(
        sf.col("JournalId").alias("VenueId"),
        sf.col("NormalizedName").alias("VenueShortName"),
        sf.col("DisplayName").alias("VenueName"),
    )
)

# Both projections share the same three-column layout, so a positional union is safe.
venues = ConferenceSeries.union(Journals)

venues.write.csv(outputDir + "venues.csv", mode='overwrite', header='true')

In [None]:
# First find the conference series id by matching conference short name.
# ConferenceSeries was already projected to (VenueId, VenueShortName, VenueName)
# when the venue table was built, so the raw column names (NormalizedName,
# ConferenceSeriesId) are no longer available; filter on the projected name
# and map VenueId back to ConferenceSeriesId so it can be used as a join key.
targetConferenceSeriesId = ConferenceSeries \
.where(sf.col("VenueShortName").isin(conferenceShortName)) \
.selectExpr("VenueId as ConferenceSeriesId")


# Get all conference papers by conference series Id.
Papers = spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t") \
.toDF("PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle", 
      "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId", 
      "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount", 
      "EstimatedCitationCount", "CreatedDate")

conferencePapers = Papers.join(targetConferenceSeriesId, "ConferenceSeriesId", 'inner') \
.selectExpr("PaperId", "Year", "ConferenceSeriesId as VenueId")


# Get all reference from [Conference Paper] -> [Other Paper]
PaperReference = spark.read.load(rootpath + "PaperReferences.txt", format="csv", sep="\t") \
.toDF("PaperId", "PaperReferenceId")

conferencePaperReferences = conferencePapers.join(PaperReference, "PaperId", 'inner')

# Describe each referenced paper by its id, year and venue. The venue is the
# conference series when present, otherwise the journal.
# NOTE(review): a referenced paper with neither id gets a null ReferenceVenueId
# and is counted under a null group below -- confirm whether those rows
# should be filtered out.
Papers1 = Papers.selectExpr("PaperId as ReferenceId", "Year as ReferenceYear",
                            "coalesce(ConferenceSeriesId, JournalId) as ReferenceVenueId")

conferencePaperReferences =  Papers1.join(conferencePaperReferences, Papers1.ReferenceId == conferencePaperReferences.PaperReferenceId, 'inner') \
.select("PaperId", "Year", "VenueId", "ReferenceId", "ReferenceVenueId")


# Count number of references between conference and referenced venue.
# The aggregate is aliased directly: agg(sf.count("*")) names its column
# "count(1)", so renaming a column called "count" afterwards would do nothing.
conferencePaperReferenceByVenues = conferencePaperReferences.groupby("VenueId", "ReferenceVenueId") \
.agg(sf.count("*").alias("ReferenceCount"))


conferencePaperReferenceByVenues.write.csv(outputDir + "conferencePaperReferenceByVenues.csv", mode='overwrite', header='true')

In [None]:
# For each year of the conference, count the number of references between the
# conference and each referenced venue.
# Assumes conferencePaperReferences carries "Year" and "ReferenceVenueId" columns.
# The aggregate is aliased directly: agg(sf.count("*")) names its column
# "count(1)", so renaming a column called "count" afterwards would do nothing.
conferencePaperReferenceByYearsReferenceVenues = conferencePaperReferences.groupby("Year", "ReferenceVenueId") \
.agg(sf.count("*").alias("ReferenceCount"))

# Persist the per-year aggregate; previously it was computed but never written.
conferencePaperReferenceByYearsReferenceVenues.write.csv(outputDir + "conferencePaperReferenceByYearsReferenceVenues.csv", mode='overwrite', header='true')

# Also keep writing the row-level reference table, as before, for any consumer
# that already reads it.
conferencePaperReferences.write.csv(outputDir + "conferencePaperReferences.csv", mode='overwrite', header='true')

In [None]:
# Shut down the SparkContext now that all outputs have been written.
sc.stop()